In [1]:
import torch
import numpy as np
import os
import torchaudio
import librosa
from concurrent.futures import ThreadPoolExecutor
import torch.nn as nn
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score

In [2]:
x_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/train_rep.pt")
y_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/labels.pt")

  x_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/train_rep.pt")
  y_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/labels.pt")


In [3]:
print(f"Training data shape: {x_train.shape}")
print(f"Training labels shape: {y_train.shape}")
# input: [20550, 496, 768]
# 20550 -> number of samples
# 496 -> sequence length (sequence of timesteps)
# 768 -> number of features
# output: [20550, 527] -> 527 classes

Training data shape: torch.Size([20550, 496, 768])
Training labels shape: torch.Size([20550, 527])


In [19]:
y_train.dtype

torch.int64

In [5]:
# input_channels = 768  
# output_size = 527

In [20]:
class CNNClassifier(nn.Module):
    def __init__(self, input_channels=768, num_classes=527):
        super(CNNClassifier, self).__init__()
        
        # First Convolutional Block
        self.conv1 = nn.Sequential(
            nn.Conv1d(input_channels, 256, kernel_size=3, padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )
        
        # Second Convolutional Block
        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )
        
        # Third Convolutional Block
        self.conv3 = nn.Sequential(
            nn.Conv1d(128, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )
        
        #size of flattened features
        self.flatten_size = 64 * (496 // (2*2*2))
        
        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(self.flatten_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, num_classes),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc(x)
        return x

In [21]:
def calculate_map(y_true, y_pred):
    n_classes = y_true.shape[1]
    average_precisions = []
    for i in range(n_classes):
        y_true_class = y_true[:, i]
        y_pred_class = y_pred[:, i]
        ap = average_precision_score(y_true_class, y_pred_class)
        average_precisions.append(ap)
    average_precisions_sorted = sorted(average_precisions, reverse=True)
    print("\nTop 5 class-wise Average Precisions:")
    for i, ap in enumerate(average_precisions_sorted[:5]):
        print(f"Class {i+1}: {ap:.4f}")
    
    return np.mean(average_precisions)

In [22]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device='cuda'):
    model = model.to(device)
    best_map = 0
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        # Validation phase
        model.eval()
        val_predictions = []
        val_targets = []
        val_loss = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.float().to(device)
                batch_y = batch_y.float().to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()
                val_predictions.extend(outputs.cpu().numpy())
                val_targets.extend(batch_y.cpu().numpy())
        
        # lists -> numpy arrays for metric calculation
        val_predictions = np.array(val_predictions)
        val_targets = np.array(val_targets)
        val_map = calculate_map(val_targets, val_predictions)
        val_f1 = f1_score(val_targets, (val_predictions > 0.5).astype(float), average='micro')
        print(f'\nEpoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {train_loss/len(train_loader):.4f}')
        print(f'Validation Loss: {val_loss/len(val_loader):.4f}')
        print(f'Validation MAP: {val_map:.4f}')
        print(f'Validation F1-Score: {val_f1:.4f}')
        
        # Save best model based on MAP
        if val_map > best_map:
            best_map = val_map
            torch.save(model.state_dict(), 'best_model_map.pth')
            print("New best model saved!")
        
        print('-' * 50)

In [24]:
batch_size = 32
learning_rate = 0.001
num_epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
x_train_float = x_train.float()
y_train_float = y_train.float()

full_dataset = TensorDataset(x_train_float, y_train_float)
total_size = len(full_dataset)
train_size = (1 * total_size)
val_size = total_size - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size],generator=torch.Generator().manual_seed(42))
    
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Training samples: 20550
Validation samples: 0


In [25]:
model = CNNClassifier()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
train_model(model, train_loader, train_loader, criterion, optimizer, num_epochs, device)


Top 5 class-wise Average Precisions:
Class 1: 0.8848
Class 2: 0.8754
Class 3: 0.8561
Class 4: 0.8360
Class 5: 0.8247

Epoch 1/5:
Training Loss: 0.0259
Validation Loss: 0.0166
Validation MAP: 0.2020
Validation F1-Score: 0.2993
New best model saved!
--------------------------------------------------

Top 5 class-wise Average Precisions:
Class 1: 0.9316
Class 2: 0.9304
Class 3: 0.9265
Class 4: 0.9168
Class 5: 0.9041

Epoch 2/5:
Training Loss: 0.0172
Validation Loss: 0.0136
Validation MAP: 0.3328
Validation F1-Score: 0.3889
New best model saved!
--------------------------------------------------

Top 5 class-wise Average Precisions:
Class 1: 0.9907
Class 2: 0.9846
Class 3: 0.9658
Class 4: 0.9537
Class 5: 0.9499

Epoch 3/5:
Training Loss: 0.0152
Validation Loss: 0.0123
Validation MAP: 0.4040
Validation F1-Score: 0.4272
New best model saved!
--------------------------------------------------

Top 5 class-wise Average Precisions:
Class 1: 0.9909
Class 2: 0.9904
Class 3: 0.9839
Class 4: 0.965

In [26]:
x_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/test_rep.pt")
y_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/labels.pt")

  x_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/test_rep.pt")
  y_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/labels.pt")


In [27]:
test_dataset = TensorDataset(x_test, y_test)  
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [28]:
model.load_state_dict(torch.load("best_model_map.pth"))
model.eval()  

  model.load_state_dict(torch.load("best_model_map.pth"))


CNNClassifier(
  (conv1): Sequential(
    (0): Conv1d(768, 256, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=3968, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5,

In [29]:
test_predictions = []
test_targets = []  

with torch.no_grad():
    for batch_x, batch_y in test_loader: 
        batch_x = batch_x.float().to(device)
        outputs = model(batch_x)
        test_predictions.extend(outputs.cpu().numpy())
        if y_test is not None:
            test_targets.extend(batch_y.cpu().numpy())

In [30]:
test_predictions = np.array(test_predictions)
test_targets = np.array(test_targets) 

test_map = calculate_map(test_targets, test_predictions)
test_f1 = f1_score(test_targets, (test_predictions > 0.5).astype(float), average="micro")

print(f"Test MAP: {test_map:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")


Top 5 class-wise Average Precisions:
Class 1: 0.9180
Class 2: 0.9158
Class 3: 0.9060
Class 4: 0.8748
Class 5: 0.8452
Test MAP: 0.3339
Test F1-Score: 0.4109
