In [11]:
# Install the CUDA 12.1 PyTorch builds. `%pip` (unlike `!pip3`) always installs into the
# environment of the running kernel, so the packages are importable by the cells below.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.2.1%2Bcu121-cp310-cp310-linux_x86_64.whl (757.3 MB)
Collecting nvidia-nccl-cu12==2.19.3 (from torch)
  Using cached https://download.pytorch.org/whl/cu121/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)
Collecting triton==2.2.0 (from torch)
  Using cached https://download.pytorch.org/whl/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.3.0%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.2.2%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.2.

In [2]:
import os
import torch
import librosa
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchaudio import load
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve
import torch.optim as optim

In [2]:
# Install Weights & Biases into the running kernel's environment (`%pip` rather than `!pip`).
%pip install wandb

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl.metadata (10 kB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.0.1-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting appdirs>=1.4.3 (from wandb)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloadi

In [3]:
import wandb

In [4]:
# Open the Weights & Biases run that train_epoch() / validate() log their losses to.
wandb.init(
    project="Speech_Assignment_3",
    entity="iiserkbikram",
    name="task 4",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33miiserkbikram[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
class AudioDataset(Dataset):
    """Folder-per-class audio dataset that yields fixed-length waveforms.

    ``root`` is scanned for immediate sub-directories; each one is a class and
    class indices follow the alphabetical order of the directory names. Every
    ``.wav`` / ``.mp3`` / ``.ogg`` file (extension matched case-insensitively)
    found recursively below a class directory becomes one
    ``(file_path, class_index)`` sample. Samples are collected in a
    deterministic (sorted) order so runs are reproducible across filesystems.

    Args:
        root: Directory containing one sub-directory per class.
        max_len: Number of samples every returned waveform is truncated or
            repeat-padded to. Defaults to 64600.
    """

    # File extensions (lower-case) that are treated as audio files.
    AUDIO_EXTENSIONS = ('.wav', '.mp3', '.ogg')
    # Class-level default so the target length is defined even before __init__ runs.
    max_len = 64600

    def __init__(self, root, max_len=64600):
        self.root = root
        self.max_len = max_len
        self.classes, self.class_to_idx = self._find_classes()
        self.samples = self._make_dataset()

    def _find_classes(self):
        """Return ``(sorted class names, {class name: index})`` for ``self.root``."""
        classes = sorted(d.name for d in os.scandir(self.root) if d.is_dir())
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def _make_dataset(self):
        """Collect ``(file_path, class_index)`` for every audio file under each class directory."""
        samples = []
        for target_class in self.classes:
            class_index = self.class_to_idx[target_class]
            target_dir = os.path.join(self.root, target_class)
            for root_dir, dir_names, file_names in os.walk(target_dir):
                # Sorting in place makes os.walk descend into sub-directories in a fixed order.
                dir_names.sort()
                for file_name in sorted(file_names):
                    if file_name.lower().endswith(self.AUDIO_EXTENSIONS):
                        samples.append((os.path.join(root_dir, file_name), class_index))
        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(fixed-length waveform, class_index)``.

        Raises:
            ValueError: If the audio file contains no samples.
        """
        audio_path, class_index = self.samples[idx]
        # NOTE(review): the sample rate returned by load() is discarded, so files are
        # not resampled — confirm that every file already has the rate the model expects.
        waveform, _ = load(audio_path)
        try:
            waveform = self._preprocess_audio(waveform)
        except ValueError as exc:
            # Re-raise with the offending path so a bad file can be located.
            raise ValueError(f"{audio_path}: {exc}") from exc
        return waveform, class_index

    def _preprocess_audio(self, waveform):
        """Return the first channel of ``waveform`` as a 1-D numpy array of ``self.max_len`` samples.

        Longer signals are truncated; shorter ones are repeated end-to-end and
        then truncated.

        Raises:
            ValueError: If the waveform has zero samples (it cannot be repeated).
        """
        waveform = waveform.numpy()[0]  # first channel only, as a numpy array
        num_samples = waveform.shape[0]
        if num_samples == 0:
            raise ValueError("cannot preprocess an empty waveform")
        if num_samples >= self.max_len:
            return waveform[:self.max_len]
        # Repeat enough times to cover max_len, then cut to the exact length.
        num_repeats = self.max_len // num_samples + 1
        return np.tile(waveform, num_repeats)[:self.max_len]


In [6]:
# Top-level data folder; it is expected to contain "training", "testing" and "validation" sub-folders
root = "for-2sec/for-2seconds"

# Build one AudioDataset per split (constructed in the order training, testing, validation)
train_dataset, test_dataset, validation_dataset = (
    AudioDataset(root=os.path.join(root, split_name))
    for split_name in ("training", "testing", "validation")
)

In [7]:
# Batch every split with a DataLoader; only the training split is shuffled
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=6,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=4,
)
validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=6,
)

In [8]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: {}'.format(device))

# Load the SSL W2V model trained for LA and DF tracks
from model import Model

model = Model(None, device=device)
# The model is wrapped in DataParallel *before* loading, so the checkpoint is loaded into the wrapper.
model = nn.DataParallel(model).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved from a GPU
# also loads on a CPU-only machine instead of failing while deserialising CUDA tensors.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load('Best_LA_model_for_DF.pth', map_location=device))

model.eval()

Device: cuda


2024-05-03 18:43:16 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


DataParallel(
  (module): Model(
    (ssl_model): SSLModel(
      (model): Wav2Vec2Model(
        (feature_extractor): ConvFeatureExtractionModel(
          (conv_layers): ModuleList(
            (0): Sequential(
              (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU(approximate='none')
            )
            (1-4): 4 x Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU(approximate='none')
            )
     

In [9]:
# Release unused GPU memory cached by PyTorch's CUDA allocator before the training cells run.
torch.cuda.empty_cache()

In [10]:
def train_epoch(train_loader, model, lr, optimizer, device, class_weights=(0.1, 0.9)):
    """Train ``model`` for one pass over ``train_loader`` and return the mean loss.

    The loss is a class-weighted cross-entropy. Each batch loss is weighted by
    its batch size, and the sum is divided by the number of samples actually
    processed, so the returned value is a per-sample average. The same running
    average is shown on the progress bar, and the final value is logged to
    wandb under ``"train_loss"``.

    Args:
        train_loader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        model: Module called as ``model(batch_x)``; its output is passed to the loss.
        lr: Unused; kept so existing call sites keep working (the learning
            rate is the one configured on ``optimizer``).
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batches and the class weights are moved to.
        class_weights: Per-class weights for ``nn.CrossEntropyLoss``.

    Returns:
        float: Average training loss per sample for this epoch.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    running_loss = 0.0
    num_total = 0

    model.train()

    # Class-weighted objective
    weight = torch.tensor(class_weights, dtype=torch.float32, device=device)
    criterion = nn.CrossEntropyLoss(weight=weight)

    progress_bar = tqdm(train_loader, desc='Training', leave=False)

    for batch_x, batch_y in progress_bar:
        batch_size = batch_x.size(0)
        num_total += batch_size

        batch_x = batch_x.to(device)
        batch_y = batch_y.view(-1).type(torch.int64).to(device)

        optimizer.zero_grad()

        # Forward pass and loss
        batch_out = model(batch_x)
        batch_loss = criterion(batch_out, batch_y)

        # Accumulate a sample-weighted sum so a smaller last batch is not over-counted
        running_loss += batch_loss.item() * batch_size

        # Backward pass and optimization
        batch_loss.backward()
        optimizer.step()

        # running_loss is a per-sample sum, so the running mean divides by samples seen
        # (dividing by the number of batches would misreport the loss).
        progress_bar.set_postfix(loss=running_loss / num_total)

    progress_bar.close()

    if num_total == 0:
        raise ValueError("train_loader yielded no samples")

    # Average over the samples that were actually processed
    running_loss /= num_total

    wandb.log({"train_loss": running_loss})

    return running_loss


In [11]:
def validate(val_loader, model, device, class_weights=(0.1, 0.9)):
    """Evaluate ``model`` on ``val_loader`` without gradients and return the mean loss.

    Uses the same class-weighted cross-entropy as training. Each batch loss is
    weighted by its batch size and the sum is divided by the number of samples
    actually processed. The same running average is shown on the progress bar
    and the final value is logged to wandb under ``"val_loss"``.

    Args:
        val_loader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        model: Module called as ``model(batch_x)``; it is switched to eval mode.
        device: Device the batches and the class weights are moved to.
        class_weights: Per-class weights for ``nn.CrossEntropyLoss``.

    Returns:
        float: Average validation loss per sample.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    running_loss = 0.0
    num_total = 0

    model.eval()

    # Class-weighted objective
    weight = torch.tensor(class_weights, dtype=torch.float32, device=device)
    criterion = nn.CrossEntropyLoss(weight=weight)

    progress_bar = tqdm(val_loader, desc='Validation', leave=False)

    with torch.no_grad():
        for batch_x, batch_y in progress_bar:
            batch_size = batch_x.size(0)
            num_total += batch_size

            batch_x = batch_x.to(device)
            batch_y = batch_y.view(-1).type(torch.int64).to(device)

            # Forward pass and loss
            batch_out = model(batch_x)
            batch_loss = criterion(batch_out, batch_y)

            # Accumulate a sample-weighted sum so a smaller last batch is not over-counted
            running_loss += batch_loss.item() * batch_size

            # running_loss is a per-sample sum, so the running mean divides by samples seen
            # (dividing by the number of batches would misreport the loss).
            progress_bar.set_postfix(loss=running_loss / num_total)

    progress_bar.close()

    if num_total == 0:
        raise ValueError("val_loader yielded no samples")

    # Average over the samples that were actually processed
    running_loss /= num_total

    # Log validation loss to wandb
    wandb.log({"val_loss": running_loss})

    return running_loss

In [12]:
# Optimisation hyper-parameters
lr = 5e-5
num_epochs = 10

# Adam over all parameters of the (DataParallel-wrapped) model
optimizer = optim.Adam(params=model.parameters(), lr=lr)

# Each epoch is one training pass followed by one validation pass
for epoch in range(num_epochs):
    train_loss = train_epoch(
        train_loader=train_loader,
        model=model,
        lr=lr,
        optimizer=optimizer,
        device=device,
    )
    val_loss = validate(val_loader=validation_loader, model=model, device=device)

    # Report both losses for the epoch that just finished
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

                                                                         

Epoch [1/10], Train Loss: 0.0430, Val Loss: 0.0089


                                                                         

Epoch [2/10], Train Loss: 0.0048, Val Loss: 0.0683


                                                                          

Epoch [3/10], Train Loss: 0.0115, Val Loss: 0.0034


                                                                          

Epoch [4/10], Train Loss: 0.0076, Val Loss: 0.0015


                                                                          

Epoch [5/10], Train Loss: 0.0116, Val Loss: 0.0083


                                                                           

Epoch [6/10], Train Loss: 0.0076, Val Loss: 0.0013


                                                                           

Epoch [7/10], Train Loss: 0.0007, Val Loss: 0.0033


                                                                           

Epoch [8/10], Train Loss: 0.0075, Val Loss: 0.0019


                                                                          

Epoch [9/10], Train Loss: 0.0029, Val Loss: 0.0068


                                                                          

Epoch [10/10], Train Loss: 0.0096, Val Loss: 0.0026




In [13]:
# Save the final model
# `model` is the nn.DataParallel wrapper built when the checkpoint was loaded, so this stores
# the wrapper's state_dict. These are the weights after the last epoch; the training loop
# does not keep the epoch with the lowest validation loss.
torch.save(model.state_dict(), "finetuned_model.pth")

In [14]:
from sklearn.metrics import roc_auc_score, roc_curve
from scipy.optimize import brentq
from scipy.interpolate import interp1d
import numpy as np
import torch
from tqdm import tqdm

# Lists to store true labels and predicted scores
true_labels = []
predicted_scores = []

# Set the model to evaluation mode
model.eval()

# Iterate through the test_loader to get true labels and predicted scores
with torch.no_grad():
    for data, target in tqdm(test_loader, desc="Testing"):
        # Move the batch to the model's device, exactly as train_epoch()/validate() do
        output = model(data.to(device))
        # Column 1 is the model's raw score for class index 1. It is an unbounded score,
        # not a probability (the EER threshold printed below can exceed 1).
        predicted_scores.extend(output[:, 1].cpu().numpy())
        true_labels.extend(target.cpu().numpy())

# Convert lists to numpy arrays
true_labels = np.array(true_labels)
predicted_scores = np.array(predicted_scores)

# Calculate AUC
auc_score = roc_auc_score(true_labels, predicted_scores)

# Calculate EER: the operating point where the false-positive rate equals the
# false-negative rate (1 - TPR). The interpolator is built once and reused by the root finder.
fpr, tpr, thresholds = roc_curve(true_labels, predicted_scores, pos_label=1)
tpr_at_fpr = interp1d(fpr, tpr)
eer = brentq(lambda x: 1. - x - tpr_at_fpr(x), 0., 1.)
# Score threshold that yields the EER operating point
threshold = interp1d(fpr, thresholds)(eer)

print("AUC:", auc_score)
print("EER:", eer)
print("Threshold at EER:", threshold)

Testing: 100%|██████████| 68/68 [00:25<00:00,  2.69it/s]

AUC: 0.6946501946366782
EER: 0.3768382352941032
Threshold at EER: 2.5974647998809925



