In [1]:
import torch
import numpy as np
import os
import torch.nn as nn
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split, Subset
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
import pickle

print("imports successful!")

imports successful!


In [2]:
x_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/train_rep.pt") 
y_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/labels.pt")
x_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/test_rep.pt")
y_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/labels.pt")

  x_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/train_rep.pt")
  y_train = torch.load("/scratch/gd2574/AudioSet-classification/Data/train/labels.pt")
  x_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/test_rep.pt")
  y_test = torch.load("/scratch/gd2574/AudioSet-classification/Data/test/labels.pt")


In [3]:
print(x_train.shape)
print(x_test.shape)

torch.Size([20550, 496, 768])
torch.Size([18886, 496, 768])


# Data loaders

In [4]:
def create_stratified_split(x_data, y_data, random_state=42, fold_idx=0):
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    indices = np.arange(len(y_data))
    splits = list(mskf.split(indices, y_data))
    if fold_idx >= len(splits): raise ValueError(f"fold_idx {fold_idx} out of range for {len(splits)} splits")
    return splits[fold_idx]

# Helper functions

In [5]:
def calculate_map(y_true, y_pred):
    n_classes = y_true.shape[1]
    average_precisions = []
    for i in range(n_classes):
        y_true_class = y_true[:, i]
        y_pred_class = y_pred[:, i]
        ap = average_precision_score(y_true_class, y_pred_class)
        average_precisions.append(ap)
    average_precisions_sorted = sorted(average_precisions, reverse=True)
    print("\nTop 5 class-wise Average Precisions:")
    for i, ap in enumerate(average_precisions_sorted[:5]):
        print(f"Class {i+1}: {ap:.4f}")
    
    return np.mean(average_precisions)

# Train model

In [6]:
def train_mlp_model(model, optimizer, criterion, train_loader, val_loader, transform_type='mean', num_epochs=10):
    print(f"\nTraining MLP with {transform_type} transformation")
    model = model.to(device)
    scaler = torch.amp.GradScaler('cuda')  
    best_map = 0
    
    for epoch in range(num_epochs):
        # Training 
        model.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'): 
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                
            if torch.isnan(loss):
                print(f"NaN loss detected at epoch {epoch}")
                continue
                
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            
            train_loss += loss.item()
        
        # Validation 
        model.eval()
        val_predictions = []
        val_targets = []
        val_loss = 0
        
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.float().to(device)
                batch_y = batch_y.float().to(device)
                
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()
                val_predictions.extend(outputs.cpu().numpy())
                val_targets.extend(batch_y.cpu().numpy())
        
        val_predictions = np.array(val_predictions)
        val_targets = np.array(val_targets)
        val_map = calculate_map(val_targets, val_predictions)
        val_f1 = f1_score(val_targets, (val_predictions > 0.5).astype(float), average='micro')
        
        print(f'\nEpoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {train_loss/len(train_loader):.4f}')
        print(f'Validation Loss: {val_loss/len(val_loader):.4f}')
        print(f'Validation MAP: {val_map:.4f}')
        print(f'Validation F1-Score: {val_f1:.4f}')
        
        if val_map > best_map:
            best_map = val_map
            torch.save(model.state_dict(), f'best_model_mlp_{transform_type}_firstlayer.pth')
            print("New best model saved!")
        
        print('-' * 50)
    
    return model, best_map

# Test model

In [7]:
def test_mlp_model(model, transform_type):
    print(f"\nTesting model with {transform_type} transformation")
    device = next(model.parameters()).device
    
    test_dataset = TensorDataset(x_test.float(), y_test.float())
    test_loader = DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False,
        pin_memory=torch.cuda.is_available()
    )

    model_path = f'best_model_mlp_{transform_type}_firstlayer.pth'
    try:
        model.load_state_dict(torch.load(model_path))
        print(f"Successfully loaded model from {model_path}")
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None, None
    
    model.eval()
    test_predictions = []
    test_targets = []
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            
            outputs = model(batch_x)
            test_predictions.extend(outputs.cpu().numpy())
            test_targets.extend(batch_y.cpu().numpy())
    
    test_predictions = np.array(test_predictions)
    test_targets = np.array(test_targets)
    
    test_map = calculate_map(test_targets, test_predictions)
    test_f1 = f1_score(test_targets, (test_predictions > 0.5).astype(float), average="micro")
    
    print(f"Test MAP: {test_map:.4f}")
    print(f"Test F1-Score: {test_f1:.4f}")
    print("-" * 50)
    
    return test_map, test_f1

# MLP

In [8]:
def transform_data(x_data, transform_type, n_components=1024):
    if transform_type == 'mean':
        return torch.mean(x_data, dim=1)
    elif transform_type == 'max':
        return torch.max(x_data, dim=1)[0]
    elif transform_type == 'append':
        from sklearn.decomposition import PCA
        x_flat = x_data.view(x_data.size(0), -1)
        x_numpy = x_flat.cpu().numpy() if x_flat.is_cuda else x_flat.numpy()
        pca = PCA(n_components=n_components)
        x_reduced = pca.fit_transform(x_numpy)
        return torch.tensor(x_reduced, dtype=x_data.dtype, device=x_data.device)

In [9]:
class MLPWithTransform(nn.Module):
    def __init__(self, input_size=768, hidden_size=1024, num_classes=527, transform_type='mean', n_components=None):
        super(MLPWithTransform, self).__init__()
        self.transform_type = transform_type
        # self.n_components = n_components
        self.input_size = input_size

        if transform_type == 'append':
            self.n_components = min(n_components or input_size, input_size)
            self.register_buffer('pca_mean', None)
            self.register_buffer('pca_components', None)
            first_layer_input = self.n_components
        else:
            self.n_components = None
            first_layer_input = input_size
            
        self.layers = nn.Sequential(
            nn.Linear(first_layer_input, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size // 2),
            nn.Dropout(0.3),
            nn.Linear(hidden_size // 2, num_classes)
        )
        
        self.is_pca_fitted = False
    
    def load_pca(self, pca_path):
        if self.transform_type == 'append':
            try:
                with open(pca_path, 'rb') as f:
                    pca_data = pickle.load(f)
                if not isinstance(pca_data, dict) or 'mean' not in pca_data or 'components' not in pca_data:
                    raise ValueError("Invalid PCA file format")
                self.register_buffer('pca_mean', torch.from_numpy(pca_data['mean']).float())
                self.register_buffer('pca_components', torch.from_numpy(pca_data['components']).float())
                self.is_pca_fitted = True
                print("PCA components loaded successfully!")
            except (EOFError, ValueError) as e:
                print(f"Error loading PCA file: {str(e)}")
                print("Will create new PCA file...")
                return False
            # except Exception as e:
            #     print(f"Unexpected error loading PCA: {str(e)}")
            #     print("Will create new PCA file...")
            #     return False
            return True
    
    def fit_pca(self, x, save_path=None):
        if self.transform_type == 'append':
            with torch.no_grad():
                # x_flat = x.view(x.size(0), -1) 
                # x_numpy = x_flat.cpu().numpy() if x_flat.is_cuda else x_flat.numpy()
                # First flatten the sequence dimension into the batch dimension
                batch_size, seq_len, feat_dim = x.size()
                x_reshaped = x.reshape(-1, feat_dim)  # Combine batch and sequence dims
                # Now perform PCA
                x_numpy = x_reshaped.cpu().numpy() if x_reshaped.is_cuda else x_reshaped.numpy()
                pca = PCA(n_components=self.n_components)
                pca.fit(x_numpy)
                
                self.register_buffer('pca_mean', torch.from_numpy(pca.mean_).float())
                self.register_buffer('pca_components', torch.from_numpy(pca.components_).float())
                self.is_pca_fitted = True
                
                if save_path:
                    pca_data = {
                        'mean': pca.mean_,
                        'components': pca.components_
                    }
                    with open(save_path, 'wb') as f:
                        pickle.dump(pca_data, f)
                    print(f"PCA components saved to {save_path}")
    
    def transform_sequence(self, x):
        if self.transform_type == 'mean':
            return torch.mean(x, dim=1)
        elif self.transform_type == 'max':
            return torch.max(x, dim=1)[0]
        elif self.transform_type == 'append':
            if not self.is_pca_fitted:
                raise RuntimeError("PCA must be fitted before transform. Call fit_pca first.")
            batch_size, seq_len, feat_dim = x.size()
            x_reshaped = x.reshape(-1, feat_dim)  # [batch_size * seq_len, feat_dim]
            x_centered = x_reshaped - self.pca_mean
            x_transformed = torch.mm(x_centered, self.pca_components.t())
            x_transformed = x_transformed.reshape(batch_size, seq_len, -1)
            return torch.mean(x_transformed, dim=1)  # [batch_size, n_components]
        else:
            raise ValueError(f"Unknown transform type: {self.transform_type}")
    
    def forward(self, x):
        x = self.transform_sequence(x)
        return self.layers(x)

# Run

In [10]:
batch_size = 32
learning_rate = 0.001
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x_train_float = x_train.float()
y_train_float = y_train.float()
full_dataset = TensorDataset(x_train_float, y_train_float)
total_size = len(full_dataset)
train_indices, val_indices = create_stratified_split(x_train_float, y_train_float.numpy())
train_dataset = Subset(full_dataset, train_indices)
val_dataset = Subset(full_dataset, val_indices)

print(f"Total samples: {len(full_dataset)}")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# print("x_train_float shape:", x_train_float.shape)
# print("x_train_float strides:", x_train_float.stride())

train_loader = DataLoader(train_dataset, batch_size=batch_size,shuffle=True, pin_memory=torch.cuda.is_available())
val_loader = DataLoader(val_dataset, batch_size=batch_size, pin_memory=torch.cuda.is_available())

transforms = [ 'mean', 'max', 'append' ] 
results = {}

Total samples: 20550
Training samples: 16440
Validation samples: 4110


In [11]:
for transform in transforms:
    print(f"\nProcessing {transform} transform...")
    
    model = MLPWithTransform(transform_type=transform)
    
    if transform == 'append':
        pca_path = 'pca_components.pkl'
        if os.path.exists(pca_path):
            print("Loading saved PCA components...")
            model.load_pca(pca_path)
        else:
            print("Fitting new PCA...")
            model.fit_pca(x_train_float, save_path=pca_path)
    
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()
    
    model, best_map = train_mlp_model(model, optimizer, criterion, train_loader, val_loader, transform_type=transform, num_epochs=num_epochs)   


Processing mean transform...

Training MLP with mean transformation

Top 5 class-wise Average Precisions:
Class 1: 0.8837
Class 2: 0.8723
Class 3: 0.8411
Class 4: 0.7843
Class 5: 0.7817

Epoch 1/10:
Training Loss: 0.2099
Validation Loss: 0.0177
Validation MAP: 0.1698
Validation F1-Score: 0.2625
New best model saved!
--------------------------------------------------

Top 5 class-wise Average Precisions:
Class 1: 0.9924
Class 2: 0.9552
Class 3: 0.9507
Class 4: 0.8979
Class 5: 0.8609

Epoch 2/10:
Training Loss: 0.0164
Validation Loss: 0.0140
Validation MAP: 0.3263
Validation F1-Score: 0.3369
New best model saved!
--------------------------------------------------

Top 5 class-wise Average Precisions:
Class 1: 1.0000
Class 2: 0.9396
Class 3: 0.9393
Class 4: 0.9278
Class 5: 0.9233

Epoch 3/10:
Training Loss: 0.0140
Validation Loss: 0.0127
Validation MAP: 0.3961
Validation F1-Score: 0.4036
New best model saved!
--------------------------------------------------

Top 5 class-wise Average Pr

In [12]:
for transform in transforms:
    test_model = MLPWithTransform(transform_type=transform)
    
    if transform == 'append':
        pca_path = 'pca_components.pkl'
        if os.path.exists(pca_path):
            test_model.load_pca(pca_path)
    
    test_model = test_model.to(device)
    test_map, test_f1 = test_mlp_model(test_model, transform_type=transform)


Testing model with mean transformation
Successfully loaded model from best_model_mlp_mean_firstlayer.pth


  model.load_state_dict(torch.load(model_path))



Top 5 class-wise Average Precisions:
Class 1: 0.9086
Class 2: 0.9017
Class 3: 0.9009
Class 4: 0.9006
Class 5: 0.8838
Test MAP: 0.3905
Test F1-Score: 0.4605
--------------------------------------------------

Testing model with max transformation
Successfully loaded model from best_model_mlp_max_firstlayer.pth


  model.load_state_dict(torch.load(model_path))



Top 5 class-wise Average Precisions:
Class 1: 0.8724
Class 2: 0.8703
Class 3: 0.8488
Class 4: 0.8414
Class 5: 0.8194
Test MAP: 0.3234
Test F1-Score: 0.3822
--------------------------------------------------
PCA components loaded successfully!

Testing model with append transformation
Successfully loaded model from best_model_mlp_append_firstlayer.pth


  model.load_state_dict(torch.load(model_path))



Top 5 class-wise Average Precisions:
Class 1: 0.9122
Class 2: 0.9058
Class 3: 0.9042
Class 4: 0.8999
Class 5: 0.8959
Test MAP: 0.3929
Test F1-Score: 0.4836
--------------------------------------------------
