In [1]:
!pip install pesq

Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: pesq
[33m  DEPRECATION: pesq is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559[0m[33m
[0m  Running setup.py install for pesq ... [?25ldone
[?25hSuccessfully installed pesq-0.0.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pickle
from pesq import pesq
import librosa
import os

In [2]:
# Encoder: Generates the watermark perturbation
class Encoder(nn.Module):
    """Two-layer MLP that turns an input vector into a small additive perturbation.

    The output has the same width as the input. Because the final ``tanh`` is
    multiplied by 0.01, every output element lies in ``[-0.01, 0.01]``.
    """

    def __init__(self, input_size):
        """Create the two linear layers.

        Args:
            input_size: Width of the last dimension of the tensors passed to
                ``forward`` (and of the perturbation it returns).
        """
        super().__init__()
        hidden_width = 512
        self.fc1 = nn.Linear(input_size, hidden_width)
        self.fc2 = nn.Linear(hidden_width, input_size)

    def forward(self, x):
        """Return the scaled perturbation computed from ``x``."""
        hidden = self.fc1(x).relu()
        bounded = self.fc2(hidden).tanh()
        # Shrink the tanh output so the perturbation stays within +/-0.01.
        return bounded * 0.01

In [3]:
# Decoder: Detects the watermark
class Decoder(nn.Module):
    """Two-layer MLP that scores an input vector with a single probability.

    ``forward`` returns one sigmoid-activated value per input row; the training
    loop in this notebook uses 1 as the target for watermarked audio and 0 for
    original audio.
    """

    def __init__(self, input_size):
        """Create the two linear layers.

        Args:
            input_size: Width of the last dimension of the tensors passed to
                ``forward``.
        """
        super().__init__()
        hidden_width = 512
        self.fc1 = nn.Linear(input_size, hidden_width)
        self.fc2 = nn.Linear(hidden_width, 1)

    def forward(self, x):
        """Return a tensor whose last dimension is 1 with values in (0, 1)."""
        hidden = self.fc1(x).relu()
        logit = self.fc2(hidden)
        return logit.sigmoid()

In [4]:
# Audio Dataset
class AudioDataset(Dataset):
    """Dataset that serves fixed-length mono waveforms loaded from file paths.

    Each item is a ``(waveform, original_length)`` pair: ``waveform`` is a
    float tensor with exactly ``max_length`` samples (zero-padded at the end or
    truncated), and ``original_length`` is the sample count before that
    adjustment.
    """

    def __init__(self, audio_paths, max_length=96000):
        """Store the list of paths and the target number of samples per item."""
        self.max_length = max_length
        self.audio_paths = audio_paths

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.audio_paths)

    def __getitem__(self, idx):
        """Load item ``idx`` resampled to 16 kHz mono and fit it to ``max_length``."""
        samples, _ = librosa.load(self.audio_paths[idx], sr=16000, mono=True)
        waveform = torch.from_numpy(samples).float()
        original_length = len(waveform)

        shortfall = self.max_length - original_length
        if shortfall > 0:
            # Too short: append zeros on the right until the target length is reached.
            waveform = torch.nn.functional.pad(waveform, (0, shortfall))
        else:
            # Long enough: keep only the leading max_length samples.
            waveform = waveform[:self.max_length]
        return waveform, original_length

In [5]:
# Training Loop
def train(encoder, decoder, dataloader, device, num_epochs=10):
    """Alternately train the watermark detector (decoder) and generator (encoder).

    For every batch the encoder produces a perturbation that is added to the
    audio. The decoder is first updated to output 0 for the original audio and
    1 for the watermarked audio; the encoder is then updated so the watermark
    is detected while the mean squared perturbation stays small.

    Args:
        encoder: Module mapping a batch of audio to a perturbation of the same shape.
        decoder: Module mapping a batch of audio to probabilities of shape
            ``(batch, 1)`` (required by the BCE targets built below).
        dataloader: Iterable yielding ``(audio, original_length)`` pairs; only
            ``audio`` is used.
        device: Device the batches and targets are placed on.
        num_epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    optimizer_enc = optim.Adam(encoder.parameters(), lr=0.001)
    optimizer_dec = optim.Adam(decoder.parameters(), lr=0.001)
    criterion = nn.BCELoss()

    for epoch in range(num_epochs):
        total_dec_loss = 0.0
        total_enc_loss = 0.0
        num_batches = 0

        for audio, _original_length in dataloader:
            audio = audio.to(device)
            batch_size = audio.size(0)
            num_batches += 1

            # Build the targets directly on the target device (no host->device copy).
            target_original = torch.zeros(batch_size, 1, device=device)
            target_watermarked = torch.ones(batch_size, 1, device=device)

            # One encoder forward pass is shared by both update steps.
            perturbation = encoder(audio)
            watermarked = audio + perturbation

            # Train Decoder
            optimizer_dec.zero_grad()
            loss_original = criterion(decoder(audio), target_original)
            # detach(): the decoder update must not backpropagate into the
            # encoder. Without it, this backward() frees the encoder's autograd
            # graph and the encoder step below fails when it calls backward()
            # through that same graph a second time.
            loss_watermarked = criterion(decoder(watermarked.detach()), target_watermarked)
            loss_dec = (loss_original + loss_watermarked) / 2
            loss_dec.backward()
            optimizer_dec.step()

            # Train Encoder (uses the freshly updated decoder)
            optimizer_enc.zero_grad()
            loss_detection = criterion(decoder(watermarked), target_watermarked)  # Ensure watermark is detectable
            loss_perceptual = torch.mean(perturbation ** 2)  # Minimize distortion (MSE proxy for PESQ)
            loss_enc = loss_detection + 0.1 * loss_perceptual  # Balance detection and quality
            loss_enc.backward()
            optimizer_enc.step()

            total_dec_loss += loss_dec.item()
            total_enc_loss += loss_enc.item()

        if num_batches == 0:
            raise ValueError("dataloader produced no batches; nothing to train on")

        avg_dec_loss = total_dec_loss / num_batches
        avg_enc_loss = total_enc_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs}: Decoder Loss: {avg_dec_loss:.4f}, Encoder Loss: {avg_enc_loss:.4f}")

In [23]:
# Colab-only cell: `google.colab` is provided by the Colab runtime.
# Mounts Google Drive at /content/drive so files under it can be read/written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
!pwd

/content


In [14]:
!ls

DataSourcing.ipynb    projectproposal.md    [1m[36mvenv[m[m
Encoder_Decoder.ipynb [1m[36msteganography-project[m[m
ideas.md              test.ipynb


In [6]:
# Collect every .wav file found one level below `audio_dir`
# (expected layout: audio_dir/<folder>/<file>.wav).
path_lst = []
audio_dir = "50_speakers_audio_data"
# os.listdir returns entries in arbitrary order; sorting makes the resulting
# list (and anything seeded/indexed from it) reproducible across runs.
for folder in sorted(os.listdir(audio_dir)):
    folder_path = os.path.join(audio_dir, folder)
    if not os.path.isdir(folder_path):
        continue  # Skip stray files sitting next to the per-folder directories
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.wav'):  # Ensure only audio files are included
            audio_path = os.path.join(folder_path, filename)
            path_lst.append(audio_path)

In [7]:
# Create Dataset and DataLoader
# Every clip is padded/truncated to 96000 samples, i.e. 6 seconds at 16 kHz.
dataset = AudioDataset(audio_paths=path_lst, max_length=96000)
# Shuffled mini-batches of four clips each.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
)

In [9]:
# Set Device and Initialize Models
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Both networks operate on 96000-sample inputs (6 seconds at 16 kHz).
encoder = Encoder(input_size=96000).to(device)
decoder = Decoder(input_size=96000).to(device)

# Run the alternating encoder/decoder training for ten epochs.
train(
    encoder=encoder,
    decoder=decoder,
    dataloader=dataloader,
    device=device,
    num_epochs=10,
)

[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1632) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1600) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2144) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2688) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2720) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1600) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2112) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1600) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776]

Epoch 1/10: Decoder Loss: 0.2938, Encoder Loss: 0.5163


[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2688) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1632) too large for available bit count (1560)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2112) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2176) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1888) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1600) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (1600) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (2720) too large for available bit count (1568)
[src/libmpg123/layer3.c:INT123_do_layer3():1776]

KeyboardInterrupt: 

In [None]:
# Save Models (Optional)
# Write each model's weights to its own checkpoint file. The relative paths
# presumably resolve to the mounted Google Drive when run from /content.
encoder_weights = encoder.state_dict()
decoder_weights = decoder.state_dict()
torch.save(encoder_weights, "drive/MyDrive/encoder.pth")
torch.save(decoder_weights, "drive/MyDrive/decoder.pth")