In [1]:
!pip install soundata

Collecting soundata
  Downloading soundata-1.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting jams>=0.3.4 (from soundata)
  Downloading jams-0.3.5-py3-none-any.whl.metadata (4.5 kB)
Collecting py7zr>=0.16.0 (from soundata)
  Downloading py7zr-1.0.0-py3-none-any.whl.metadata (17 kB)
Collecting mir_eval>=0.8.2 (from jams>=0.3.4->soundata)
  Downloading mir_eval-0.8.2-py3-none-any.whl.metadata (3.0 kB)
Collecting texttable (from py7zr>=0.16.0->soundata)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pyzstd>=0.16.1 (from py7zr>=0.16.0->soundata)
  Downloading pyzstd-0.18.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.6 kB)
Collecting pyppmd<1.3.0,>=1.1.0 (from py7zr>=0.16.0->soundata)
  Downloading pyppmd-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr>=0.16.0->soundata)
  Downloading pybcj-1.0.7-cp312-cp312-manylinux2014_x86_64.man

In [2]:
# IMPORTS AND SETUP

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import soundfile as sf
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
from torch.utils.data import TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.models as models

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             confusion_matrix, classification_report,
                             matthews_corrcoef, roc_curve, auc, roc_auc_score)
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.model_selection import GridSearchCV

import os
import random
import pickle
from tqdm import tqdm
import soundata
from torch.cuda.amp import autocast, GradScaler

In [3]:

# Reproducibility: seed the Python, NumPy and PyTorch generators with the same value
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when one is visible; everything below moves tensors/models to `device`
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if device.type == 'cuda':
    torch.cuda.manual_seed(42)

print(f'Using Device: {device}')


Using Device: cuda


In [4]:
# FEATURE EXTRACTION AND CACHING

class FeatureExtractor:
    """Turn raw audio into model-ready features, caching bulk results on disk.

    ``extract_all_features`` supports three ``feature_type`` values:

    * ``'mfcc_stats'``      - fixed-length vector of MFCC / delta statistics
    * ``'mel_spectrogram'`` - standardised log-mel spectrogram (2-D array)
    * ``'mfcc_sequence'``   - raw MFCC matrix (2-D array)
    """

    # Every clip is zero-padded / truncated to this many seconds before bulk extraction.
    CLIP_SECONDS = 4

    def __init__(self, cache_dir='./feature_cache'):
        """Create the extractor and make sure ``cache_dir`` exists.

        Args:
            cache_dir: Directory (str or Path) where pickled feature sets are stored.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache path also works on a fresh machine.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _fit_length(y, target_length):
        """Zero-pad at the end, or truncate, so ``y`` has exactly ``target_length`` samples."""
        target_length = int(target_length)  # sr * duration may be a float
        if len(y) < target_length:
            return np.pad(y, (0, target_length - len(y)), mode='constant')
        return y[:target_length]

    def extract_mfcc_features(self, y, sr, n_mfcc=40):
        """Summarise MFCCs and their first/second deltas into one flat vector.

        For each of (MFCC, delta, delta-delta) the per-coefficient mean, std, max
        and min over time are concatenated in that order, giving
        ``3 * 4 * n_mfcc`` values.
        """
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        summaries = (np.mean, np.std, np.max, np.min)
        return np.concatenate([
            summarise(matrix, axis=1)
            for matrix in (mfcc, mfcc_delta, mfcc_delta2)
            for summarise in summaries
        ])

    def extract_mel_spectrogram(self, y, sr, n_mels=128):
        """Return a dB-scaled mel spectrogram standardised to zero mean / unit std."""
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels,
                                                   n_fft=2048, hop_length=512)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        # The epsilon guards against division by zero on a constant spectrogram.
        return (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-6)

    def extract_mfcc_sequence(self, y, sr, n_mfcc=40):
        """Return the raw MFCC matrix (one row per coefficient) for the 1D CNN."""
        return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    def process_audio(self, audio_path, sr=22050, duration=4):
        """Load ``audio_path`` and force it to exactly ``duration`` seconds.

        Returns:
            ``(y, sr)`` where ``sr`` is the sampling rate reported by
            ``librosa.load`` (equal to the requested ``sr`` when one is given).
        """
        y, sr = librosa.load(audio_path, sr=sr, duration=duration)
        return self._fit_length(y, sr * duration), sr

    def extract_all_features(self, metadata, soundata_loader, feature_type='mfcc_stats'):
        """Extract one feature per metadata row, reusing an on-disk cache when present.

        Args:
            metadata: DataFrame with ``slice_file_name`` and ``class`` columns.
            soundata_loader: Object whose ``clip(name).audio`` yields ``(y, sr)``.
            feature_type: ``'mfcc_stats'``, ``'mel_spectrogram'`` or ``'mfcc_sequence'``.

        Returns:
            ``{'features': [...], 'labels': [...]}`` in metadata row order.

        Raises:
            ValueError: if ``feature_type`` is not one of the supported values.

        Note:
            The cache file name depends only on ``feature_type``; delete the file
            when ``metadata`` changes, otherwise stale features are returned.
        """
        extractors = {
            'mfcc_stats': self.extract_mfcc_features,
            'mel_spectrogram': self.extract_mel_spectrogram,
            'mfcc_sequence': self.extract_mfcc_sequence,
        }
        # Validate before touching the cache so a typo can never load or create a bogus file.
        if feature_type not in extractors:
            raise ValueError(
                f'Unknown feature_type {feature_type!r}; expected one of {sorted(extractors)}'
            )
        extract = extractors[feature_type]

        cache_file = self.cache_dir / f'{feature_type}_features.pkl'

        if cache_file.exists():
            print(f'Loading Cached Features From {cache_file}')
            # NOTE(security): pickle.load can execute arbitrary code - only load
            # cache files that this notebook wrote itself.
            with open(cache_file, 'rb') as f:
                return pickle.load(f)

        print(f'Extracting {feature_type} Features')
        features = []
        labels = []

        for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
            clip = soundata_loader.clip(row['slice_file_name'])
            y, sr = clip.audio

            # Ensure consistent length
            y = self._fit_length(y, sr * self.CLIP_SECONDS)

            features.append(extract(y, sr))
            labels.append(row['class'])

        result = {'features': features, 'labels': labels}

        # Cache the features
        with open(cache_file, 'wb') as f:
            pickle.dump(result, f)

        print(f'Features Cached To {cache_file}')
        return result

In [5]:
# DATASET CLASSES

class UrbanSoundMelDataset(Dataset):
    """Map-style dataset producing standardised log-mel spectrograms for the 2D CNN.

    Each item is ``(tensor, label)`` where the tensor has a leading channel
    dimension of size 1 followed by 128 mel bands, and the label is the
    integer id assigned by this dataset's own ``LabelEncoder``.
    """

    def __init__(self, metadata, soundata_loader, augment=False):
        self.augment = augment
        self.soundata_loader = soundata_loader
        # Positional index 0..N-1 so that __getitem__ can use iloc directly.
        self.metadata = metadata.reset_index(drop=True)
        # The encoder is fitted on this dataset's own 'class' column.
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(self.metadata['class'])

    def __len__(self):
        return len(self.metadata)

    def _augment(self, y, sr):
        """Apply pitch shift, time stretch and additive noise, each with probability 0.5."""
        if random.random() > 0.5:
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=random.randint(-3, 3))
        if random.random() > 0.5:
            y = librosa.effects.time_stretch(y, rate=random.uniform(0.8, 1.2))
        if random.random() > 0.5:
            noise = np.random.randn(len(y))
            y = y + random.uniform(0.001, 0.005) * noise
        return y

    def __getitem__(self, idx):
        record = self.metadata.iloc[idx]
        y, sr = self.soundata_loader.clip(record['slice_file_name']).audio

        if self.augment:
            y = self._augment(y, sr)

        # Force every clip to exactly four seconds (zero-pad the tail or cut it off).
        n_target = sr * 4
        shortfall = n_target - len(y)
        if shortfall > 0:
            y = np.pad(y, (0, shortfall), mode='constant')
        else:
            y = y[:n_target]

        # Log-mel spectrogram, standardised per clip.
        power_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,
                                                     n_fft=2048, hop_length=512)
        spec_db = librosa.power_to_db(power_spec, ref=np.max)
        spec_db = (spec_db - spec_db.mean()) / (spec_db.std() + 1e-6)

        # unsqueeze(0) adds the single input channel expected by Conv2d.
        return torch.FloatTensor(spec_db).unsqueeze(0), self.labels[idx]


class UrbanSoundMFCCDataset(Dataset):
    """Map-style dataset producing MFCC matrices for the 1D CNN.

    Each item is ``(tensor, label)``: the tensor holds ``n_mfcc`` rows (one per
    coefficient) and the label is the integer id assigned by this dataset's
    own ``LabelEncoder``.
    """

    def __init__(self, metadata, soundata_loader, augment=False, n_mfcc=40):
        self.n_mfcc = n_mfcc
        self.augment = augment
        self.soundata_loader = soundata_loader
        # Positional index 0..N-1 so that __getitem__ can use iloc directly.
        self.metadata = metadata.reset_index(drop=True)
        # The encoder is fitted on this dataset's own 'class' column.
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(self.metadata['class'])

    def __len__(self):
        return len(self.metadata)

    def _augment(self, y, sr):
        """Apply pitch shift and time stretch, each with probability 0.5."""
        if random.random() > 0.5:
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=random.randint(-3, 3))
        if random.random() > 0.5:
            y = librosa.effects.time_stretch(y, rate=random.uniform(0.8, 1.2))
        return y

    def __getitem__(self, idx):
        record = self.metadata.iloc[idx]
        y, sr = self.soundata_loader.clip(record['slice_file_name']).audio

        if self.augment:
            y = self._augment(y, sr)

        # Force every clip to exactly four seconds (zero-pad the tail or cut it off).
        n_target = sr * 4
        shortfall = n_target - len(y)
        if shortfall > 0:
            y = np.pad(y, (0, shortfall), mode='constant')
        else:
            y = y[:n_target]

        coeffs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)
        return torch.FloatTensor(coeffs), self.labels[idx]


In [None]:

# MODEL CLASSES

class RandomForestModel:
    """Random-forest classifier bundled with its own ``StandardScaler``.

    After ``train`` the fitted estimator is in ``self.model``; when a grid
    search was run, the winning parameter dict is in ``self.best_params``.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.model = None
        self.best_params = None

    def train(self, X_train, y_train, hyperparameter_tuning=True, best_params=None):
        """Fit the scaler and the forest.

        Args:
            X_train, y_train: Training features and labels.
            hyperparameter_tuning: When True, run a 3-fold accuracy grid search
                and keep its best estimator.
            best_params: Used only when tuning is off; keyword arguments for
                ``RandomForestClassifier``. ``None`` falls back to 200 trees.
        """
        print('Training Random Forest Model')

        # The scaler is fitted here and reused as-is by predict / predict_proba.
        scaled = self.scaler.fit_transform(X_train)

        if not hyperparameter_tuning:
            if best_params is None:
                self.model = RandomForestClassifier(n_estimators=200, random_state=self.random_state,
                                                    n_jobs=-1)
            else:
                self.model = RandomForestClassifier(random_state=self.random_state,
                                                    n_jobs=-1, **best_params)
            self.model.fit(scaled, y_train)
        else:
            print('Performing Grid Search For Hyperparameter Optimization')
            search_space = {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            search = GridSearchCV(
                RandomForestClassifier(random_state=self.random_state, n_jobs=-1),
                search_space, cv=3, scoring='accuracy', n_jobs=-1, verbose=2,
            )
            search.fit(scaled, y_train)

            self.model = search.best_estimator_
            self.best_params = search.best_params_
            print(f'Best Parameters: {self.best_params}')

        print('Random Forest Training Complete')

    def predict(self, X_test):
        """Scale ``X_test`` with the training scaler and return predicted labels."""
        return self.model.predict(self.scaler.transform(X_test))

    def predict_proba(self, X_test):
        """Scale ``X_test`` with the training scaler and return class probabilities."""
        return self.model.predict_proba(self.scaler.transform(X_test))


class SVCModel:
    """Support-vector classifier bundled with its own ``StandardScaler``.

    The estimator is always built with ``probability=True`` so that
    ``predict_proba`` is available. After a grid search the winning parameter
    dict is stored in ``self.best_params``.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.model = None
        self.best_params = None

    def train(self, X_train, y_train, hyperparameter_tuning=True, best_params=None):
        """Fit the scaler and the SVC.

        Args:
            X_train, y_train: Training features and labels.
            hyperparameter_tuning: When True, run a 3-fold accuracy grid search
                over C, gamma and kernel and keep its best estimator.
            best_params: Used only when tuning is off; keyword arguments for
                ``SVC``. ``None`` falls back to an RBF kernel with defaults.
        """
        print('Training SVC Model')

        # The scaler is fitted here and reused as-is by predict / predict_proba.
        scaled = self.scaler.fit_transform(X_train)

        if not hyperparameter_tuning:
            if best_params is None:
                self.model = SVC(kernel='rbf', random_state=self.random_state, probability=True)
            else:
                self.model = SVC(random_state=self.random_state, probability=True, **best_params)
            self.model.fit(scaled, y_train)
        else:
            print('Performing Grid Search For Hyperparameter Optimization')
            search_space = {
                'C': [0.1, 1, 10, 100],
                'gamma': ['scale', 'auto', 0.001, 0.01],
                'kernel': ['rbf', 'poly']
            }
            search = GridSearchCV(
                SVC(random_state=self.random_state, probability=True),
                search_space, cv=3, scoring='accuracy', n_jobs=-1, verbose=2,
            )
            search.fit(scaled, y_train)

            self.model = search.best_estimator_
            self.best_params = search.best_params_
            print(f'Best Parameters: {self.best_params}')

        print('SVC Training Complete')

    def predict(self, X_test):
        """Scale ``X_test`` with the training scaler and return predicted labels."""
        return self.model.predict(self.scaler.transform(X_test))

    def predict_proba(self, X_test):
        """Scale ``X_test`` with the training scaler and return class probabilities."""
        return self.model.predict_proba(self.scaler.transform(X_test))


class KNNModel:
    """k-nearest-neighbours classifier bundled with its own ``StandardScaler``.

    After a grid search the winning parameter dict is stored in
    ``self.best_params``.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.model = None
        self.best_params = None

    def train(self, X_train, y_train, hyperparameter_tuning=True, best_params=None):
        """Fit the scaler and the KNN classifier.

        Args:
            X_train, y_train: Training features and labels.
            hyperparameter_tuning: When True, run a 3-fold accuracy grid search
                over neighbour count, weighting and metric.
            best_params: Used only when tuning is off; keyword arguments for
                ``KNeighborsClassifier``. ``None`` falls back to k=5.
        """
        print('Training KNN Model')

        # The scaler is fitted here and reused as-is by predict / predict_proba.
        scaled = self.scaler.fit_transform(X_train)

        if not hyperparameter_tuning:
            if best_params is None:
                self.model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
            else:
                self.model = KNeighborsClassifier(n_jobs=-1, **best_params)
            self.model.fit(scaled, y_train)
        else:
            print('Performing Grid Search For Hyperparameter Optimization')
            search_space = {
                'n_neighbors': [3, 5, 7, 9, 11],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean', 'manhattan', 'minkowski']
            }
            search = GridSearchCV(
                KNeighborsClassifier(n_jobs=-1),
                search_space, cv=3, scoring='accuracy', n_jobs=-1, verbose=2,
            )
            search.fit(scaled, y_train)

            self.model = search.best_estimator_
            self.best_params = search.best_params_
            print(f'Best Parameters: {self.best_params}')

        print('KNN Training Complete')

    def predict(self, X_test):
        """Scale ``X_test`` with the training scaler and return predicted labels."""
        return self.model.predict(self.scaler.transform(X_test))

    def predict_proba(self, X_test):
        """Scale ``X_test`` with the training scaler and return class probabilities."""
        return self.model.predict_proba(self.scaler.transform(X_test))


class ANNModel(nn.Module):
    """Fully connected classifier over fixed-length feature vectors.

    Every hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3); a final
    Linear layer emits one logit per class. The model owns a ``StandardScaler``
    that is fitted in ``train_model`` and reused by ``predict`` / ``predict_proba``.
    """

    def __init__(self, input_dim, num_classes=10, hidden_dims=(512, 256, 128)):
        """
        Args:
            input_dim: Length of each input feature vector.
            num_classes: Number of output logits.
            hidden_dims: Widths of the hidden layers, in order. An immutable
                tuple is used as the default so it is never shared and mutated.
        """
        super(ANNModel, self).__init__()

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.3)
            ])
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, num_classes))
        self.network = nn.Sequential(*layers)
        self.scaler = StandardScaler()

    def forward(self, x):
        """Return raw logits for a batch of already-scaled feature vectors."""
        return self.network(x)

    def train_model(self, X_train, y_train, X_val=None, y_val=None,
                   epochs=50, batch_size=64, lr=0.001):
        """Fit the scaler and train the network with Adam and cross-entropy loss.

        The learning rate is halved when the mean *training* loss has not
        improved for 5 epochs. ``X_val`` / ``y_val`` are accepted for interface
        compatibility but are currently not used.

        Raises:
            ValueError: if ``batch_size`` is below 2 or fewer than 2 training
                samples are given (BatchNorm1d cannot compute batch statistics
                from a single sample in training mode).
        """
        print('Training ANN Model')

        if batch_size < 2:
            raise ValueError('batch_size must be at least 2 because the network uses BatchNorm1d')

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)

        # Convert to tensors
        X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
        y_train_tensor = torch.LongTensor(y_train).to(device)

        train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
        n_samples = len(train_dataset)
        if n_samples < 2:
            raise ValueError('ANNModel needs at least 2 training samples because the network uses BatchNorm1d')

        # A trailing batch holding a single sample makes BatchNorm1d raise in
        # training mode. Drop it in exactly that case; with shuffle=True a
        # different sample is left out each epoch, so no data is lost overall.
        drop_last = n_samples > batch_size and n_samples % batch_size == 1
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  drop_last=drop_last)

        self.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                         factor=0.5, patience=5)

        for epoch in range(epochs):
            self.train()
            total_loss = 0

            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = self(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(train_loader)
            scheduler.step(avg_loss)

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

        print('ANN Training Complete')

    def _logits(self, X_test):
        """Scale ``X_test`` with the training scaler and run one eval-mode forward pass."""
        self.eval()
        X_test_scaled = self.scaler.transform(X_test)
        X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)

        with torch.no_grad():
            return self(X_test_tensor)

    def predict(self, X_test):
        """Return the arg-max class index for every row of ``X_test`` as a NumPy array."""
        _, predicted = torch.max(self._logits(X_test), 1)
        return predicted.cpu().numpy()

    def predict_proba(self, X_test):
        """Return softmax class probabilities for every row of ``X_test`` as a NumPy array."""
        probs = F.softmax(self._logits(X_test), dim=1)
        return probs.cpu().numpy()


class CNN2DModel(nn.Module):
    """Four-block 2D CNN for single-channel spectrogram inputs.

    Each conv block is Conv2d -> BatchNorm2d -> ReLU -> pooling -> Dropout. The
    last block pools adaptively to a 4x4 grid, so the classifier head always
    receives 256 * 4 * 4 features.
    """

    def __init__(self, num_classes=10):
        super(CNN2DModel, self).__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 4)),
            nn.Dropout(0.25)
        )

        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 4 * 4, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Return raw logits for a batch shaped (batch, 1, height, width)."""
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x

    def train_model(self, train_loader, val_loader=None, epochs=50, lr=0.001):
        """Train with Adam and cross-entropy loss.

        The learning rate is halved when the mean *training* loss has not
        improved for 5 epochs. ``val_loader`` is accepted for interface
        compatibility but is currently not used.

        Batches containing a single sample are skipped: the BatchNorm1d layer
        in ``fc_layers`` cannot compute batch statistics from one sample in
        training mode and would raise.

        Raises:
            ValueError: if an epoch contains no batch with at least 2 samples.
        """
        print('Training 2D CNN Model')

        self.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                         factor=0.5, patience=5)

        for epoch in range(epochs):
            self.train()
            total_loss = 0
            batches_used = 0

            for batch_X, batch_y in train_loader:
                if batch_X.size(0) < 2:
                    # Typically the leftover final batch; see docstring.
                    continue
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                optimizer.zero_grad()
                outputs = self(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                batches_used += 1

            if batches_used == 0:
                raise ValueError('train_loader yielded no batch with at least 2 samples')

            # Average over the batches actually trained on.
            avg_loss = total_loss / batches_used
            scheduler.step(avg_loss)

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

        print('2D CNN Training Complete')

    def predict(self, test_loader):
        """Return arg-max class indices for every sample in ``test_loader`` (labels are ignored)."""
        self.eval()
        predictions = []

        with torch.no_grad():
            for batch_X, _ in test_loader:
                batch_X = batch_X.to(device)
                outputs = self(batch_X)
                _, predicted = torch.max(outputs, 1)
                predictions.extend(predicted.cpu().numpy())

        return np.array(predictions)

    def predict_proba(self, test_loader):
        """Return softmax class probabilities for every sample in ``test_loader`` (labels are ignored)."""
        self.eval()
        probabilities = []

        with torch.no_grad():
            for batch_X, _ in test_loader:
                batch_X = batch_X.to(device)
                outputs = self(batch_X)
                probs = F.softmax(outputs, dim=1)
                probabilities.extend(probs.cpu().numpy())

        return np.array(probabilities)


class CNN1DModel(nn.Module):
    """Three-block 1D CNN over feature sequences shaped (batch, input_channels, time).

    Each conv block is Conv1d -> BatchNorm1d -> ReLU -> pooling -> Dropout. The
    last block pools adaptively to 4 time steps, so the classifier head always
    receives 256 * 4 features.
    """

    def __init__(self, input_channels=40, num_classes=10):
        super(CNN1DModel, self).__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv1d(input_channels, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Dropout(0.25),

            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Dropout(0.25),

            nn.Conv1d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(4),
            nn.Dropout(0.25)
        )

        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 4, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Return raw logits for a batch shaped (batch, input_channels, time)."""
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x

    def train_model(self, train_loader, val_loader=None, epochs=50, lr=0.001):
        """Train with Adam and cross-entropy loss.

        The learning rate is halved when the mean *training* loss has not
        improved for 5 epochs. ``val_loader`` is accepted for interface
        compatibility but is currently not used.

        Batches containing a single sample are skipped: the BatchNorm1d layer
        in ``fc_layers`` cannot compute batch statistics from one sample in
        training mode and would raise.

        Raises:
            ValueError: if an epoch contains no batch with at least 2 samples.
        """
        print('Training 1D CNN Model')

        self.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                         factor=0.5, patience=5)

        for epoch in range(epochs):
            self.train()
            total_loss = 0
            batches_used = 0

            for batch_X, batch_y in train_loader:
                if batch_X.size(0) < 2:
                    # Typically the leftover final batch; see docstring.
                    continue
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                optimizer.zero_grad()
                outputs = self(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                batches_used += 1

            if batches_used == 0:
                raise ValueError('train_loader yielded no batch with at least 2 samples')

            # Average over the batches actually trained on.
            avg_loss = total_loss / batches_used
            scheduler.step(avg_loss)

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

        print('1D CNN Training Complete')

    def predict(self, test_loader):
        """Return arg-max class indices for every sample in ``test_loader`` (labels are ignored)."""
        self.eval()
        predictions = []

        with torch.no_grad():
            for batch_X, _ in test_loader:
                batch_X = batch_X.to(device)
                outputs = self(batch_X)
                _, predicted = torch.max(outputs, 1)
                predictions.extend(predicted.cpu().numpy())

        return np.array(predictions)

    def predict_proba(self, test_loader):
        """Return softmax class probabilities for every sample in ``test_loader`` (labels are ignored)."""
        self.eval()
        probabilities = []

        with torch.no_grad():
            for batch_X, _ in test_loader:
                batch_X = batch_X.to(device)
                outputs = self(batch_X)
                probs = F.softmax(outputs, dim=1)
                probabilities.extend(probs.cpu().numpy())

        return np.array(probabilities)


from torchvision.models import efficientnet_b0

class AudioEfficientNet(nn.Module):
    """EfficientNet-B0 adapted to single-channel spectrograms.

    The pretrained network is loaded, its 3-channel stem convolution is
    replaced by a 1-channel one, and the final classifier layer is replaced
    by Dropout(0.3) + Linear(num_classes).
    """

    def __init__(self, num_classes=10):
        super(AudioEfficientNet, self).__init__()

        # 1. Load pre-trained EfficientNet-B0
        self.base_model = efficientnet_b0(weights='DEFAULT')

        # 2. Swap the stem conv (features[0][0]) for a 1-channel version
        pretrained_stem = self.base_model.features[0][0]
        mono_stem = nn.Conv2d(
            in_channels=1,
            out_channels=pretrained_stem.out_channels,
            kernel_size=pretrained_stem.kernel_size,
            stride=pretrained_stem.stride,
            padding=pretrained_stem.padding,
            bias=False
        )
        # Keep the pretrained stem instead of starting it from random weights:
        # summing the RGB kernels over the channel axis gives the same response
        # as feeding the spectrogram into all three input channels.
        with torch.no_grad():
            mono_stem.weight.copy_(pretrained_stem.weight.sum(dim=1, keepdim=True))
        self.base_model.features[0][0] = mono_stem

        # 3. Replace the output layer (classifier[1]); the nested Sequential is
        #    kept so parameter names stay the same as before this change.
        in_features = self.base_model.classifier[1].in_features
        self.base_model.classifier[1] = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(in_features, num_classes)
        )

    def forward(self, x):
        """Return raw logits for a batch shaped (batch, 1, height, width)."""
        return self.base_model(x)

In [7]:

# EVALUATION MODULE

class ModelEvaluator:
    """Compute, print and plot classification metrics for integer-encoded labels."""

    def __init__(self, class_names):
        """
        Args:
            class_names: Display names of the classes; position ``i`` names label id ``i``.
        """
        self.class_names = class_names

    def compute_metrics(self, y_true, y_pred, y_proba=None):
        """Compute all evaluation metrics.

        Args:
            y_true: True label ids in ``0 .. len(class_names) - 1``.
            y_pred: Predicted label ids.
            y_proba: Optional per-class probabilities, one column per class.

        Returns:
            Dict with ``accuracy``, weighted ``precision`` / ``recall`` /
            ``f1_score``, ``mcc``, ``confusion_matrix`` and
            ``classification_report``. When ``y_proba`` is given, ``roc_auc``
            is added; it is ``None`` if the score could not be computed.
        """
        metrics = {}
        # Pin the label set so the report and the confusion matrix always have
        # one entry per class, even when a class is absent from this split.
        label_ids = list(range(len(self.class_names)))

        # Basic metrics
        metrics['accuracy'] = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred,
                                                                    average='weighted',
                                                                    zero_division=0)
        metrics['precision'] = precision
        metrics['recall'] = recall
        metrics['f1_score'] = f1

        # Matthews Correlation Coefficient
        metrics['mcc'] = matthews_corrcoef(y_true, y_pred)

        # Confusion Matrix
        metrics['confusion_matrix'] = confusion_matrix(y_true, y_pred, labels=label_ids)

        # Per-class metrics
        class_report = classification_report(y_true, y_pred,
                                            labels=label_ids,
                                            target_names=self.class_names,
                                            output_dict=True, zero_division=0)
        metrics['classification_report'] = class_report

        # ROC AUC (if probabilities available)
        if y_proba is not None:
            try:
                y_true_bin = label_binarize(y_true, classes=label_ids)
                metrics['roc_auc'] = roc_auc_score(y_true_bin, y_proba,
                                                   average='weighted', multi_class='ovr')
            except Exception as exc:
                # Best effort: report why instead of hiding it, and never
                # swallow KeyboardInterrupt / SystemExit as a bare except does.
                print(f'ROC AUC could not be computed: {exc}')
                metrics['roc_auc'] = None

        return metrics


In [16]:

    def print_metrics(self, metrics, model_name):
        """Print the headline scores of a metrics dict, one per line, to 4 decimals.

        ROC AUC is printed only when the dict carries a non-None ``roc_auc``.
        """
        print(f'\nResults For {model_name}')

        headline_scores = (
            ('Accuracy', 'accuracy'),
            ('Precision', 'precision'),
            ('Recall', 'recall'),
            ('F1 Score', 'f1_score'),
            ('MCC', 'mcc'),
        )
        for caption, key in headline_scores:
            print(f"{caption}: {metrics[key]:.4f}")

        roc_auc = metrics.get('roc_auc')
        if roc_auc is not None:
            print(f"ROC AUC: {roc_auc:.4f}")

    def plot_confusion_matrix(self, cm, model_name, fold=None):
        """Plot a confusion matrix as an annotated heatmap.

        Args:
            cm: Square matrix of integer counts with one row/column per class.
            model_name: Shown in the title.
            fold: Optional fold number appended to the title.
        """
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=self.class_names, yticklabels=self.class_names)
        # Label the axes so the figure can be read on its own. Assumes `cm`
        # follows scikit-learn's layout (rows = true class, columns = predicted
        # class), which is what compute_metrics produces.
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        title = f'Confusion Matrix - {model_name}'
        if fold is not None:
            title += f' (Fold {fold})'
        plt.title(title)
        plt.tight_layout()
        plt.show()

    def plot_roc_curves(self, y_true, y_proba, model_name, fold=None):
        """Draw one-vs-rest ROC curves, one per class, with each AUC in the legend.

        Args:
            y_true: True label ids in ``0 .. len(class_names) - 1``.
            y_proba: Array of class probabilities with one column per class.
            model_name: Shown in the title.
            fold: Optional fold number appended to the title.
        """
        n_classes = len(self.class_names)
        # One indicator column per class turns the problem into n one-vs-rest tasks.
        onehot_truth = label_binarize(y_true, classes=range(n_classes))

        plt.figure(figsize=(12, 8))

        for class_idx in range(n_classes):
            false_pos, true_pos, _ = roc_curve(onehot_truth[:, class_idx], y_proba[:, class_idx])
            class_auc = auc(false_pos, true_pos)
            plt.plot(false_pos, true_pos,
                     label=f'{self.class_names[class_idx]} (AUC = {class_auc:.2f})')

        # Diagonal reference line: the expected curve of a random classifier.
        plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

        fold_suffix = f' (Fold {fold})' if fold is not None else ''
        plt.title(f'ROC Curves - {model_name}' + fold_suffix)

        # Legend is placed outside the axes so it does not cover the curves.
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()


In [None]:

# CROSS-VALIDATION PIPELINE

def cross_validate_traditional_ml(model_class, metadata, feature_extractor,
                                 soundata_loader, model_name,
                                 hyperparameter_tuning=True):
    """Run 10-fold cross-validation of a classical model on 'mfcc_stats' features.

    Folds come from the ``fold`` column of ``metadata`` (values 1..10): each
    fold is held out once while the model trains on the other nine.

    When ``hyperparameter_tuning`` is True the grid search runs only for
    fold 1, and the winning parameters are reused for folds 2-10.
    NOTE(review): that search sees data from folds 2-10, which are later used
    as test folds - confirm this is acceptable for the reported scores.

    Args:
        model_class: Zero-argument callable returning an object with
            ``train``, ``predict``, ``predict_proba`` and ``best_params``.
        metadata: DataFrame with ``fold``, ``class`` and ``slice_file_name`` columns.
        feature_extractor: Provides ``extract_all_features``.
        soundata_loader: Passed through to the feature extractor.
        model_name: Used in progress messages only.
        hyperparameter_tuning: See above.

    Returns:
        ``(fold_metrics, avg_metrics)`` - the per-fold metric dicts and their aggregate.
    """
    print(f'\nStarting 10 Fold Cross Validation For {model_name}')

    # Features are computed once (or read from cache) for the whole dataset.
    feature_data = feature_extractor.extract_all_features(
        metadata, soundata_loader, feature_type='mfcc_stats')

    encoder = LabelEncoder()
    X = np.array(feature_data['features'])
    y = encoder.fit_transform(feature_data['labels'])
    class_names = encoder.classes_

    evaluator = ModelEvaluator(class_names)
    best_params = None
    fold_metrics = []

    for fold in range(1, 11):
        print(f'\nTraining On Fold {fold}')

        # Boolean row masks; features are assumed to be in metadata row order.
        held_out = metadata['fold'] == fold
        kept = ~held_out
        X_train, y_train = X[kept], y[kept]
        X_test, y_test = X[held_out], y[held_out]

        model = model_class()

        if hyperparameter_tuning and fold == 1:
            # Tune once, then carry the result forward.
            print('Tuning Hyperparameters On First Fold')
            model.train(X_train, y_train, hyperparameter_tuning=True)
            best_params = model.best_params
            print(f'Best Parameters Will Be Used For Remaining Folds: {best_params}')
        elif best_params is not None:
            print(f'Using Best Parameters From Fold 1: {best_params}')
            model.train(X_train, y_train, hyperparameter_tuning=False, best_params=best_params)
        else:
            # Tuning disabled: the model falls back to its own defaults.
            model.train(X_train, y_train, hyperparameter_tuning=False)

        predicted = model.predict(X_test)
        probabilities = model.predict_proba(X_test)

        metrics = evaluator.compute_metrics(y_test, predicted, probabilities)
        fold_metrics.append(metrics)

        print(f"Fold {fold} Accuracy: {metrics['accuracy']:.4f}")

    avg_metrics = aggregate_metrics(fold_metrics)
    print(f'\nAverage Results Across 10 Folds For {model_name}')
    print_average_metrics(avg_metrics)

    return fold_metrics, avg_metrics


def cross_validate_ann(metadata, feature_extractor, soundata_loader,
                       model_name='ANN', epochs=50):
    """Run 10-fold cross-validation of ``ANNModel`` on 'mfcc_stats' features.

    Folds come from the ``fold`` column of ``metadata`` (values 1..10): each
    fold is held out once and a fresh network is trained on the other nine.

    Args:
        metadata: DataFrame with ``fold``, ``class`` and ``slice_file_name`` columns.
        feature_extractor: Provides ``extract_all_features``.
        soundata_loader: Passed through to the feature extractor.
        model_name: Used in progress messages only.
        epochs: Training epochs per fold.

    Returns:
        ``(fold_metrics, avg_metrics)`` - the per-fold metric dicts and their aggregate.
    """
    print(f'\nStarting 10 Fold Cross Validation For {model_name}')

    # Features are computed once (or read from cache) for the whole dataset.
    feature_data = feature_extractor.extract_all_features(
        metadata, soundata_loader, feature_type='mfcc_stats')

    encoder = LabelEncoder()
    X = np.array(feature_data['features'])
    y = encoder.fit_transform(feature_data['labels'])
    class_names = encoder.classes_

    evaluator = ModelEvaluator(class_names)
    fold_metrics = []

    for fold in range(1, 11):
        print(f'\nTraining On Fold {fold}')

        # Boolean row masks; features are assumed to be in metadata row order.
        held_out = metadata['fold'] == fold
        kept = ~held_out
        X_train, y_train = X[kept], y[kept]
        X_test, y_test = X[held_out], y[held_out]

        # A new network per fold, so no weights carry over between folds.
        model = ANNModel(input_dim=X_train.shape[1], num_classes=len(class_names))
        model.train_model(X_train, y_train, epochs=epochs)

        predicted = model.predict(X_test)
        probabilities = model.predict_proba(X_test)

        metrics = evaluator.compute_metrics(y_test, predicted, probabilities)
        fold_metrics.append(metrics)

        print(f"Fold {fold} Accuracy: {metrics['accuracy']:.4f}")

    avg_metrics = aggregate_metrics(fold_metrics)
    print(f'\nAverage Results Across 10 Folds For {model_name}')
    print_average_metrics(avg_metrics)

    return fold_metrics, avg_metrics


def cross_validate_cnn2d(metadata, X_all, y_all, model_name='2D CNN', epochs=50, batch_size=32):
    """Cross-validate ``CNN2DModel`` over folds 1-10 using in-memory features.

    For each fold a fresh model is trained with Adam (lr=0.001) and
    cross-entropy under mixed precision on every row outside the fold, then
    evaluated on the rows of the fold.

    Args:
        metadata: DataFrame with ``fold`` and ``class`` columns. Row ``i`` is
            assumed to describe ``X_all[i]`` / ``y_all[i]`` (TODO confirm
            against the feature extractor).
        X_all: Feature array for all clips, indexed along axis 0.
        y_all: Labels aligned with ``X_all``; they are integer-encoded inside
            ``get_fast_dataloader``.
        model_name: Label used only in the progress messages.
        epochs: Training epochs per fold.
        batch_size: Batch size for both the training and the test loader.

    Returns:
        ``(fold_metrics, avg_metrics)``: per-fold metric dicts and their
        aggregate from ``aggregate_metrics``.
    """
    print(f'\nStarting Fast Cross Validation For {model_name}')

    # Class names are only needed by the evaluator.
    # NOTE(review): the integer targets are produced by get_fast_dataloader, not by
    # this encoder - both must yield the same class ordering; verify.
    label_encoder = LabelEncoder()
    label_encoder.fit(metadata['class'])
    class_names = label_encoder.classes_
    evaluator = ModelEvaluator(class_names)

    # Fold id per row, as a plain array. Row *positions* (not DataFrame index
    # labels) are what must be used to index X_all / y_all; the two only coincide
    # when metadata carries a default 0..N-1 index.
    fold_of_row = metadata['fold'].to_numpy()

    fold_metrics = []

    for fold in range(1, 11):
        print(f'\nTraining On Fold {fold} (Fast Mode)')

        # 1. Positional indices for this fold
        train_indices = np.flatnonzero(fold_of_row != fold)
        test_indices = np.flatnonzero(fold_of_row == fold)

        # 2. Create Fast Loaders
        train_loader = get_fast_dataloader(X_all, y_all, train_indices, batch_size, shuffle=True, is_2d=True)
        test_loader = get_fast_dataloader(X_all, y_all, test_indices, batch_size, shuffle=False, is_2d=True)

        # 3. Initialize Model
        model = CNN2DModel(num_classes=len(class_names)).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()
        # One loss scaler per model, so the scale state adapted to a previous
        # fold's network is not carried into this one.
        scaler = GradScaler()

        # 4. Fast Training Loop
        model.train()
        for epoch in range(epochs):
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                optimizer.zero_grad()

                # Mixed precision: forward pass and loss under autocast
                with autocast():
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)

                # Scaled backward pass; the scaler handles the optimizer step
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

        # 5. Predict & Evaluate
        model.eval()
        y_pred = []
        y_proba = []
        y_test_labels = []

        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X = batch_X.to(device)
                outputs = model(batch_X)

                # Class probabilities and the arg-max prediction
                probs = F.softmax(outputs, dim=1)
                predicted = torch.max(probs, 1)[1]

                y_pred.extend(predicted.cpu().numpy())
                y_proba.extend(probs.cpu().numpy())
                # batch_y never left the CPU, so no transfer is needed here
                y_test_labels.extend(batch_y.numpy())

        # Compute Metrics
        metrics = evaluator.compute_metrics(np.array(y_test_labels), np.array(y_pred), np.array(y_proba))
        fold_metrics.append(metrics)
        print(f"Fold {fold} Accuracy: {metrics['accuracy']:.4f}")

    avg_metrics = aggregate_metrics(fold_metrics)
    print(f'\nAverage Results Across 10 Folds For {model_name}')
    print_average_metrics(avg_metrics)
    return fold_metrics, avg_metrics


def cross_validate_cnn1d(metadata, X_all, y_all, model_name='1D CNN', epochs=50, batch_size=32):
    """Cross-validate ``CNN1DModel`` over folds 1-10 using in-memory features.

    For each fold a fresh model is trained with Adam (lr=0.001) and
    cross-entropy under mixed precision on every row outside the fold, then
    evaluated on the rows of the fold.

    Args:
        metadata: DataFrame with ``fold`` and ``class`` columns. Row ``i`` is
            assumed to describe ``X_all[i]`` / ``y_all[i]`` (TODO confirm
            against the feature extractor).
        X_all: Feature array indexed along axis 0; axis 1 is passed to the
            model as its number of input channels.
        y_all: Labels aligned with ``X_all``; they are integer-encoded inside
            ``get_fast_dataloader``.
        model_name: Label used only in the progress messages.
        epochs: Training epochs per fold.
        batch_size: Batch size for both the training and the test loader.

    Returns:
        ``(fold_metrics, avg_metrics)``: per-fold metric dicts and their
        aggregate from ``aggregate_metrics``.
    """
    print(f'\nStarting Fast Cross Validation For {model_name}')

    # Class names are only needed by the evaluator.
    # NOTE(review): the integer targets are produced by get_fast_dataloader, not by
    # this encoder - both must yield the same class ordering; verify.
    label_encoder = LabelEncoder()
    label_encoder.fit(metadata['class'])
    class_names = label_encoder.classes_
    evaluator = ModelEvaluator(class_names)

    # Fold id per row, as a plain array. Row *positions* (not DataFrame index
    # labels) are what must be used to index X_all / y_all; the two only coincide
    # when metadata carries a default 0..N-1 index.
    fold_of_row = metadata['fold'].to_numpy()

    # The loader feeds (batch, X_all.shape[1], X_all.shape[2]) tensors to the
    # model, so the channel count is read from the data instead of being fixed.
    input_channels = X_all.shape[1]

    fold_metrics = []

    for fold in range(1, 11):
        print(f'\nTraining On Fold {fold} (Fast Mode)')

        train_indices = np.flatnonzero(fold_of_row != fold)
        test_indices = np.flatnonzero(fold_of_row == fold)

        train_loader = get_fast_dataloader(X_all, y_all, train_indices, batch_size, shuffle=True, is_2d=False)
        test_loader = get_fast_dataloader(X_all, y_all, test_indices, batch_size, shuffle=False, is_2d=False)

        model = CNN1DModel(input_channels=input_channels, num_classes=len(class_names)).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss()
        # One loss scaler per model, so the scale state adapted to a previous
        # fold's network is not carried into this one.
        scaler = GradScaler()

        model.train()
        for epoch in range(epochs):
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                optimizer.zero_grad()
                # Mixed precision: forward pass and loss under autocast
                with autocast():
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

        model.eval()
        y_pred = []
        y_proba = []
        y_test_labels = []

        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X = batch_X.to(device)
                outputs = model(batch_X)
                probs = F.softmax(outputs, dim=1)
                predicted = torch.max(probs, 1)[1]

                y_pred.extend(predicted.cpu().numpy())
                y_proba.extend(probs.cpu().numpy())
                # batch_y never left the CPU, so no transfer is needed here
                y_test_labels.extend(batch_y.numpy())

        metrics = evaluator.compute_metrics(np.array(y_test_labels), np.array(y_pred), np.array(y_proba))
        fold_metrics.append(metrics)
        print(f"Fold {fold} Accuracy: {metrics['accuracy']:.4f}")

    avg_metrics = aggregate_metrics(fold_metrics)
    print(f'\nAverage Results Across 10 Folds For {model_name}')
    print_average_metrics(avg_metrics)
    return fold_metrics, avg_metrics


def aggregate_metrics(fold_metrics):
    """Aggregate per-fold metric dicts into means, standard deviations and a mean confusion matrix.

    Args:
        fold_metrics: Non-empty list of dicts. Each dict must contain
            ``'confusion_matrix'`` (all of the same shape) and may contain any of
            ``accuracy``, ``precision``, ``recall``, ``f1_score``, ``mcc``,
            ``roc_auc``; entries that are missing or ``None`` are skipped.

    Returns:
        Dict with ``'<metric>_mean'`` / ``'<metric>_std'`` (population std, numpy
        default) for every metric that had at least one value, plus
        ``'confusion_matrix'``: the element-wise mean over folds rounded to the
        nearest integer.

    Raises:
        ValueError: If ``fold_metrics`` is empty (there is nothing to average).
        KeyError: If a fold dict has no ``'confusion_matrix'`` entry.
    """
    if len(fold_metrics) == 0:
        raise ValueError('fold_metrics must contain at least one fold')

    metrics_to_avg = ['accuracy', 'precision', 'recall', 'f1_score', 'mcc', 'roc_auc']

    avg_metrics = {}
    for metric in metrics_to_avg:
        # A metric may be absent or None for some folds; average what is there
        values = [fm[metric] for fm in fold_metrics if fm.get(metric) is not None]
        if values:
            avg_metrics[f'{metric}_mean'] = np.mean(values)
            avg_metrics[f'{metric}_std'] = np.std(values)

    # Average confusion matrix. Round to nearest (np.rint: halves go to the even
    # neighbour) before the integer cast; a bare astype(int) would truncate and
    # systematically under-count every cell.
    cms = [fm['confusion_matrix'] for fm in fold_metrics]
    avg_metrics['confusion_matrix'] = np.rint(np.mean(cms, axis=0)).astype(int)

    return avg_metrics


def print_average_metrics(avg_metrics):
    """Print one ``Name: mean (+/- std)`` line per aggregated metric.

    Args:
        avg_metrics: Dict holding ``'<metric>_mean'`` / ``'<metric>_std'`` pairs.
            Metrics without a ``_mean`` entry are skipped silently; other keys
            (for example the confusion matrix) are ignored.
    """
    # (key prefix, display label) in the order the lines are printed
    display_labels = (
        ('accuracy', 'Accuracy'),
        ('precision', 'Precision'),
        ('recall', 'Recall'),
        ('f1_score', 'F1 Score'),
        ('mcc', 'MCC'),
        ('roc_auc', 'ROC AUC'),
    )

    for key, label in display_labels:
        if f'{key}_mean' not in avg_metrics:
            continue
        mean_value = avg_metrics[f'{key}_mean']
        std_value = avg_metrics[f'{key}_std']
        print(f'{label}: {mean_value:.4f} (+/- {std_value:.4f})')


def cross_validate_efficientnet(metadata, X_gpu, y_gpu, model_name='EfficientNet-B0', epochs=15, batch_size=64):
    """Cross-validate ``AudioEfficientNet`` over folds 1-10 on tensors that already live on ``device``.

    Batches are cut manually from the pre-loaded tensors (no DataLoader). Each
    fold trains a fresh model with Adam (lr=1e-4) and cross-entropy under mixed
    precision, then evaluates on the held-out fold.

    Args:
        metadata: DataFrame with ``fold`` and ``class`` columns. Row ``i`` is
            assumed to describe ``X_gpu[i]`` / ``y_gpu[i]`` (TODO confirm).
        X_gpu: Feature tensor for all clips, indexed along dim 0; expected on
            the global ``device``.
        y_gpu: Integer class targets aligned with ``X_gpu``, on the same device.
        model_name: Label used only in the start-up message.
        epochs: Training epochs per fold.
        batch_size: Batch size for training and evaluation.

    Returns:
        ``(fold_metrics, avg_metrics)``: per-fold metric dicts and their
        aggregate from ``aggregate_metrics``.
    """
    print(f'\nStarting High-Performance Run for {model_name}')

    label_encoder = LabelEncoder()
    label_encoder.fit(metadata['class'])
    class_names = label_encoder.classes_
    evaluator = ModelEvaluator(class_names)
    fold_metrics = []

    # Fold id per row, as a plain array. Row *positions* (not DataFrame index
    # labels) are what must be used to index X_gpu / y_gpu; the two only coincide
    # when metadata carries a default 0..N-1 index.
    fold_of_row = metadata['fold'].to_numpy()

    for fold in range(1, 11):
        print(f'Fold {fold}...', end=' ')

        # 1. Slice Data (advanced indexing copies the selected rows on the device)
        train_idx = torch.as_tensor(np.flatnonzero(fold_of_row != fold), device=device)
        test_idx = torch.as_tensor(np.flatnonzero(fold_of_row == fold), device=device)

        X_train, y_train = X_gpu[train_idx], y_gpu[train_idx]
        X_test, y_test = X_gpu[test_idx], y_gpu[test_idx]

        # 2. Initialize Model
        model = AudioEfficientNet(num_classes=len(class_names)).to(device)

        # 3. Optimizer
        # Original author's note: 1e-4 is used because the backbone is unfrozen; a
        # higher LR (1e-3) would destroy the pre-trained knowledge.
        # (AudioEfficientNet is defined elsewhere - not verified here.)
        optimizer = optim.Adam(model.parameters(), lr=1e-4)
        criterion = nn.CrossEntropyLoss()
        # One loss scaler per model, so the scale state adapted to a previous
        # fold's network is not carried into this one.
        scaler = GradScaler()

        # 4. Train Loop (Manual Batching)
        model.train()
        num_samples = X_train.shape[0]

        for epoch in range(epochs):
            # Fresh shuffle each epoch, generated directly on the device
            perm = torch.randperm(num_samples, device=device)
            for i in range(0, num_samples, batch_size):
                indices = perm[i : i + batch_size]
                batch_X = X_train[indices]
                batch_y = y_train[indices]

                optimizer.zero_grad()
                with autocast():
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

        # 5. Eval Loop (sequential batches, full precision)
        model.eval()
        with torch.no_grad():
            y_pred = []
            y_proba = []
            for i in range(0, X_test.shape[0], batch_size):
                batch_X = X_test[i : i + batch_size]
                outputs = model(batch_X)
                probs = F.softmax(outputs, dim=1)
                preds = torch.max(probs, 1)[1]
                y_pred.extend(preds.cpu().numpy())
                y_proba.extend(probs.cpu().numpy())

            y_test_cpu = y_test.cpu().numpy()

        metrics = evaluator.compute_metrics(np.array(y_test_cpu), np.array(y_pred), np.array(y_proba))
        fold_metrics.append(metrics)
        print(f"Acc: {metrics['accuracy']:.4f}")

    avg_metrics = aggregate_metrics(fold_metrics)
    print_average_metrics(avg_metrics)
    return fold_metrics, avg_metrics


In [17]:
# MAIN EXECUTION
# Creates the soundata handle for UrbanSound8K that every later cell uses as
# `urbansound8k`, downloading the data first if its directory is absent.

if __name__ == "__main__":
    # Initialize dataset
    print('Initializing UrbanSound8K Dataset')
    urbansound8k = soundata.initialize('urbansound8k')

    # Check if dataset exists, if not download
    # Only the existence of the data_home directory is tested: a directory that
    # exists but is incomplete does not trigger a download here.
    if not os.path.exists(urbansound8k.data_home):
        print('Downloading Dataset')
        urbansound8k.download()

    # validate() runs on every execution, whether or not a download happened.
    # NOTE(review): its return value is ignored, so the success message below is
    # printed unconditionally unless validate() raises - confirm it reports
    # problems in a way that is not silently dropped.
    urbansound8k.validate()
    print('Dataset Validated Successfully')


Initializing UrbanSound8K Dataset


100%|██████████| 1/1 [00:00<00:00, 640.35it/s]
100%|██████████| 8732/8732 [00:16<00:00, 537.05it/s]

Dataset Validated Successfully





In [13]:
    # Load metadata
    # Builds one record per clip (id, fold, first tag label) and collects them in
    # the `metadata` DataFrame used by all cross-validation helpers.
    clip_ids = urbansound8k.clip_ids
    metadata_list = []
    for clip_id in clip_ids:
        clip = urbansound8k.clip(clip_id)
        metadata_list.append({
            # NOTE(review): the column is named slice_file_name but stores the
            # soundata clip_id - confirm consumers expect the id.
            'slice_file_name': clip_id,
            'fold': clip.fold,
            # First tag label only; clips without labels fall into 'unknown'
            'class': clip.tags.labels[0] if clip.tags.labels else 'unknown',
        })

    # Built from a list of dicts, so the frame gets a default 0..N-1 index and
    # row i corresponds to clip_ids[i].
    metadata = pd.DataFrame(metadata_list)
    print(f'Dataset Shape: {metadata.shape}')
    print(f'Number Of Classes: {metadata["class"].nunique()}')


Dataset Shape: (8732, 3)
Number Of Classes: 10


In [14]:
    # Initialize feature extractor
    # cache_dir is a path relative to the notebook's working directory; the
    # recorded outputs show features being cached to and loaded from it.
    feature_extractor = FeatureExtractor(cache_dir='./feature_cache')

    # Train and evaluate each model
    # Maps model name -> {'fold_metrics': [...], 'avg_metrics': {...}}; filled in
    # by the model cells below. Re-running this cell discards earlier results.
    results = {}


In [18]:
# Pre-compute both feature sets once for the whole dataset so the CNN
# cross-validation runs can slice them from memory.
# Requires `feature_extractor`, `metadata` and `urbansound8k` from earlier cells.
print("1. Pre-computing Mel Spectrograms (for 2D CNN)")
# Extract Mel Spectrograms for the whole dataset at once
mel_data = feature_extractor.extract_all_features(
    metadata,
    urbansound8k,
    feature_type='mel_spectrogram'
)
# Convert to numpy array
# Stacks the per-clip features into one array; the recorded run printed
# (8732, 128, 345), i.e. every clip produced a feature of identical shape.
X_mel = np.array([x for x in mel_data['features']])
# Labels stay as returned by the extractor (class strings in the recorded run;
# they are integer-encoded later).
y_mel = np.array(mel_data['labels'])
print(f"Mel Spectrogram Shape: {X_mel.shape}")

print("\n2. Pre-computing MFCC Sequences (for 1D CNN)")
# Extract MFCC Sequences for the whole dataset at once
mfcc_data = feature_extractor.extract_all_features(
    metadata,
    urbansound8k,
    feature_type='mfcc_sequence'
)
# Convert to numpy array
# The recorded run printed (8732, 40, 345) for this array.
X_mfcc = np.array([x for x in mfcc_data['features']])
y_mfcc = np.array(mfcc_data['labels'])
print(f"MFCC Sequence Shape: {X_mfcc.shape}")

# Helper to create fast DataLoaders from RAM
def get_fast_dataloader(X_data, y_data, fold_indices, batch_size=32, shuffle=True, is_2d=False):
    """Build an in-memory ``DataLoader`` over the rows ``fold_indices`` of the dataset.

    Labels are mapped to integers against the sorted set of classes found in the
    *whole* ``y_data`` array, never against the selected subset only. Encoding
    the subset on its own would silently shift the integer ids whenever a subset
    lacks a class, so train and test loaders could disagree with each other.

    Args:
        X_data: Feature array for all samples, indexed along axis 0.
        y_data: Labels for all samples (strings or integers), aligned with
            ``X_data``.
        fold_indices: Positional indices of the samples to include.
        batch_size: Batch size of the returned loader.
        shuffle: Whether the loader reshuffles the samples every epoch.
        is_2d: If True, insert a channel axis at position 1 so a 3-D selection
            becomes 4-D.

    Returns:
        A ``DataLoader`` yielding ``(float32 features, int64 targets)`` batches.
    """
    # Select the features for this fold and turn them into a float32 tensor
    X_fold = X_data[fold_indices]
    tensor_x = torch.as_tensor(X_fold, dtype=torch.float32)

    # If 2D CNN, we need to add the "Channel" dimension (Batch, 1, Freq, Time)
    if is_2d:
        tensor_x = tensor_x.unsqueeze(1)

    # Encode labels with the class set of the full label array. np.unique returns
    # the classes sorted, and return_inverse gives each sample's index into that
    # sorted list - the same integer ids a LabelEncoder fitted on all labels yields.
    _, encoded_all = np.unique(np.asarray(y_data), return_inverse=True)
    tensor_y = torch.as_tensor(encoded_all[fold_indices], dtype=torch.long)

    # Create Dataset in RAM
    dataset = TensorDataset(tensor_x, tensor_y)

    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)

1. Pre-computing Mel Spectrograms (for 2D CNN)
Extracting mel_spectrogram Features


100%|██████████| 8732/8732 [03:40<00:00, 39.57it/s]


Features Cached To feature_cache/mel_spectrogram_features.pkl
Mel Spectrogram Shape: (8732, 128, 345)

2. Pre-computing MFCC Sequences (for 1D CNN)
Extracting mfcc_sequence Features


100%|██████████| 8732/8732 [03:29<00:00, 41.62it/s]


Features Cached To feature_cache/mfcc_sequence_features.pkl
MFCC Sequence Shape: (8732, 40, 345)


In [None]:
    # 1. Random Forest
    # Cross-validates RandomForestModel and stores the per-fold metrics and their
    # averages under results['Random Forest'].
    # Per the recorded output, the grid search runs on fold 1 only and its best
    # parameters are reused for the remaining folds.
    # Requires `metadata`, `feature_extractor`, `urbansound8k` and `results`
    # from earlier cells.
    print('MODEL 1: RANDOM FOREST')
    rf_fold_metrics, rf_avg_metrics = cross_validate_traditional_ml(
        RandomForestModel, metadata, feature_extractor, urbansound8k,
        model_name='Random Forest', hyperparameter_tuning=True
    )
    results['Random Forest'] = {'fold_metrics': rf_fold_metrics, 'avg_metrics': rf_avg_metrics}


MODEL 1: RANDOM FOREST

Starting 10 Fold Cross Validation For Random Forest
Loading Cached Features From feature_cache/mfcc_stats_features.pkl

Training On Fold 1
Tuning Hyperparameters On First Fold
Training Random Forest Model
Performing Grid Search For Hyperparameter Optimization
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Random Forest Training Complete
Best Parameters Will Be Used For Remaining Folds: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Fold 1 Accuracy: 0.6644

Training On Fold 2
Using Best Parameters From Fold 1: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Training Random Forest Model
Random Forest Training Complete
Fold 2 Accuracy: 0.6802

Training On Fold 3
Using Best Parameters From Fold 1: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimator

In [None]:
    # 2. SVC
    # Cross-validates SVCModel and stores the per-fold metrics and their averages
    # under results['SVC'].
    # Per the recorded output, the grid search runs on fold 1 only and its best
    # parameters are reused for the remaining folds.
    # Requires `metadata`, `feature_extractor`, `urbansound8k` and `results`
    # from earlier cells.
    print('MODEL 2: SVC')
    svc_fold_metrics, svc_avg_metrics = cross_validate_traditional_ml(
        SVCModel, metadata, feature_extractor, urbansound8k,
        model_name='SVC', hyperparameter_tuning=True
    )
    results['SVC'] = {'fold_metrics': svc_fold_metrics, 'avg_metrics': svc_avg_metrics}


MODEL 2: SVC

Starting 10 Fold Cross Validation For SVC
Loading Cached Features From feature_cache/mfcc_stats_features.pkl

Training On Fold 1
Tuning Hyperparameters On First Fold
Training SVC Model
Performing Grid Search For Hyperparameter Optimization
Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVC Training Complete
Best Parameters Will Be Used For Remaining Folds: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Fold 1 Accuracy: 0.6449

Training On Fold 2
Using Best Parameters From Fold 1: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Training SVC Model
SVC Training Complete
Fold 2 Accuracy: 0.6610

Training On Fold 3
Using Best Parameters From Fold 1: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Training SVC Model
SVC Training Complete
Fold 3 Accuracy: 0.6530

Training On Fold 4
Using Best Parameters From Fold 1: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Training SVC Model
SVC Training Complete
Fold 4 Accu

In [None]:
    # 3. KNN
    # Cross-validates KNNModel and stores the per-fold metrics and their averages
    # under results['KNN'].
    # Per the recorded output, the grid search runs on fold 1 only and its best
    # parameters are reused for the remaining folds.
    # Requires `metadata`, `feature_extractor`, `urbansound8k` and `results`
    # from earlier cells.
    print('MODEL 3: KNN')
    knn_fold_metrics, knn_avg_metrics = cross_validate_traditional_ml(
        KNNModel, metadata, feature_extractor, urbansound8k,
        model_name='KNN', hyperparameter_tuning=True
    )
    results['KNN'] = {'fold_metrics': knn_fold_metrics, 'avg_metrics': knn_avg_metrics}


MODEL 3: KNN

Starting 10 Fold Cross Validation For KNN
Loading Cached Features From feature_cache/mfcc_stats_features.pkl

Training On Fold 1
Tuning Hyperparameters On First Fold
Training KNN Model
Performing Grid Search For Hyperparameter Optimization
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
KNN Training Complete
Best Parameters Will Be Used For Remaining Folds: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Fold 1 Accuracy: 0.5452

Training On Fold 2
Using Best Parameters From Fold 1: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Training KNN Model
KNN Training Complete
Fold 2 Accuracy: 0.5811

Training On Fold 3
Using Best Parameters From Fold 1: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Training KNN Model
KNN Training Complete
Fold 3 Accuracy: 0.5481

Training On Fold 4
Using Best Parameters From Fold 1: {'metric': 'manha

In [None]:
    # 4. ANN
    # Cross-validates the ANN on the MFCC-statistics features (50 epochs per
    # fold) and stores the results under results['ANN'].
    # NOTE(review): the recorded output shows a training loss of about 0.02-0.05
    # at epoch 50 while fold accuracy stays around 0.62-0.67, which suggests
    # overfitting - consider fewer epochs or early stopping.
    # Requires `metadata`, `feature_extractor`, `urbansound8k` and `results`
    # from earlier cells.
    print('MODEL 4: ANN')
    ann_fold_metrics, ann_avg_metrics = cross_validate_ann(
        metadata, feature_extractor, urbansound8k,
        model_name='ANN', epochs=50
    )
    results['ANN'] = {'fold_metrics': ann_fold_metrics, 'avg_metrics': ann_avg_metrics}


MODEL 4: ANN

Starting 10 Fold Cross Validation For ANN
Loading Cached Features From feature_cache/mfcc_stats_features.pkl

Training On Fold 1
Training ANN Model
Epoch [10/50], Loss: 0.1636
Epoch [20/50], Loss: 0.0982
Epoch [30/50], Loss: 0.0737
Epoch [40/50], Loss: 0.0380
Epoch [50/50], Loss: 0.0499
ANN Training Complete
Fold 1 Accuracy: 0.6518

Training On Fold 2
Training ANN Model
Epoch [10/50], Loss: 0.1691
Epoch [20/50], Loss: 0.0881
Epoch [30/50], Loss: 0.0623
Epoch [40/50], Loss: 0.0528
Epoch [50/50], Loss: 0.0199
ANN Training Complete
Fold 2 Accuracy: 0.6667

Training On Fold 3
Training ANN Model
Epoch [10/50], Loss: 0.1689
Epoch [20/50], Loss: 0.0874
Epoch [30/50], Loss: 0.0659
Epoch [40/50], Loss: 0.0520
Epoch [50/50], Loss: 0.0186
ANN Training Complete
Fold 3 Accuracy: 0.6151

Training On Fold 4
Training ANN Model
Epoch [10/50], Loss: 0.1845
Epoch [20/50], Loss: 0.0857
Epoch [30/50], Loss: 0.0686
Epoch [40/50], Loss: 0.0493
Epoch [50/50], Loss: 0.0170
ANN Training Complete
F

In [None]:
    # 5. 2D CNN
    # Cross-validates the 2D CNN on the pre-computed mel spectrograms and stores
    # the results under results['2D CNN'].
    # Requires `metadata`, `X_mel`, `y_mel` and `results` from earlier cells.
    # NOTE(review): the saved output of this cell also contains 'MODEL 6: 1D CNN'
    # lines that this code cannot print - the output appears stale; re-run the
    # cell to refresh it.
    print('MODEL 5: 2D CNN')
    cnn2d_fold_metrics, cnn2d_avg_metrics = cross_validate_cnn2d(
        metadata,
        X_mel,   # Pass the pre-computed Mel Specs
        y_mel,   # Pass the labels
        model_name='2D CNN',
        epochs=50,
        batch_size=32
    )
    results['2D CNN'] = {'fold_metrics': cnn2d_fold_metrics, 'avg_metrics': cnn2d_avg_metrics}


MODEL 5: 2D CNN

Starting Fast Cross Validation For 2D CNN

Training On Fold 1 (Fast Mode)
Fold 1 Accuracy: 0.7342

Training On Fold 2 (Fast Mode)
Fold 2 Accuracy: 0.7759

Training On Fold 3 (Fast Mode)
Fold 3 Accuracy: 0.7081

Training On Fold 4 (Fast Mode)
Fold 4 Accuracy: 0.7859

Training On Fold 5 (Fast Mode)
Fold 5 Accuracy: 0.7543

Training On Fold 6 (Fast Mode)
Fold 6 Accuracy: 0.7521

Training On Fold 7 (Fast Mode)
Fold 7 Accuracy: 0.7100

Training On Fold 8 (Fast Mode)
Fold 8 Accuracy: 0.7134

Training On Fold 9 (Fast Mode)
Fold 9 Accuracy: 0.7868

Training On Fold 10 (Fast Mode)
Fold 10 Accuracy: 0.7431

Average Results Across 10 Folds For 2D CNN
Accuracy: 0.7464 (+/- 0.0286)
Precision: 0.7698 (+/- 0.0258)
Recall: 0.7464 (+/- 0.0286)
F1 Score: 0.7401 (+/- 0.0321)
MCC: 0.7200 (+/- 0.0302)
ROC AUC: 0.9558 (+/- 0.0170)
MODEL 6: 1D CNN

Starting Fast Cross Validation For 1D CNN

Training On Fold 1 (Fast Mode)
Fold 1 Accuracy: 0.5842

Training On Fold 2 (Fast Mode)
Fold 2 Accuracy

In [23]:
from sklearn.preprocessing import LabelEncoder

# Stage the complete mel-spectrogram dataset on `device` once, so the
# EfficientNet runs can slice batches without host-to-device copies.
# Requires `X_mel`, `y_mel` and `device` from earlier cells.
print("1. Encoding labels and moving data to GPU...")

# A. Convert String Labels to Integers
# The encoder is fitted on all labels at once, so the integer ids are the same
# for every fold.
le = LabelEncoder()
y_mel_encoded = le.fit_transform(y_mel)

# B. Move to GPU
# Now we pass the integer-encoded labels instead of the strings
# The whole feature array is placed on the device: with the recorded shape
# [8732, 1, 128, 345] in float32 that is roughly 1.5 GB of device memory.
X_mel_tensor = torch.FloatTensor(X_mel).to(device)
y_mel_tensor = torch.LongTensor(y_mel_encoded).to(device)

# Add channel dimension if needed (N, 1, H, W)
# The ndim guard keeps this cell safe to re-run on an already 4-D tensor.
if X_mel_tensor.ndim == 3:
    X_mel_tensor = X_mel_tensor.unsqueeze(1)

# NOTE(review): the message reports `device`, which may be the CPU when no GPU
# is available - "Success" does not by itself mean the data is on a GPU.
print(f"Success! Data is on {device}.")
print(f"Features: {X_mel_tensor.shape}")
print(f"Labels:   {y_mel_tensor.shape}")

1. Encoding labels and moving data to GPU...
Success! Data is on cuda.
Features: torch.Size([8732, 1, 128, 345])
Labels:   torch.Size([8732])


In [None]:
from torchvision.models import efficientnet_b0
import torch.distributions.beta as beta

# 1. Define the EfficientNet
class SmartAudioEfficientNet(nn.Module):
    """Pre-trained EfficientNet-B0 adapted to single-channel input.

    The stem convolution is swapped for a 1-input-channel convolution whose
    kernel is the channel-wise mean of the pre-trained kernel, and the final
    classifier layer is replaced by dropout plus a linear layer with
    ``num_classes`` outputs.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Pre-trained backbone (torchvision default weights)
        backbone = efficientnet_b0(weights='DEFAULT')

        # Keep the pre-trained stem filters instead of starting from a random
        # layer: collapse the input-channel axis (dim=1) by averaging.
        # Per the original author's note: [32, 3, 3, 3] -> [32, 1, 3, 3].
        stem_conv = backbone.features[0][0]
        mono_kernel = stem_conv.weight.data.mean(dim=1, keepdim=True)

        # Same geometry as the original stem, but a single input channel
        mono_conv = nn.Conv2d(
            in_channels=1,
            out_channels=stem_conv.out_channels,
            kernel_size=stem_conv.kernel_size,
            stride=stem_conv.stride,
            padding=stem_conv.padding,
            bias=False
        )
        mono_conv.weight.data = mono_kernel
        backbone.features[0][0] = mono_conv

        # New classification head sized for this task
        head_inputs = backbone.classifier[1].in_features
        backbone.classifier[1] = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(head_inputs, num_classes)
        )

        self.base_model = backbone

    def forward(self, x):
        """Return the backbone's output for input batch ``x``."""
        return self.base_model(x)

# 2. MixUp Function (The Secret Sauce)
def mixup_data(x, y, alpha=0.2):
    """Blend every sample of a batch with a randomly chosen partner from the same batch (MixUp).

    Args:
        x: Batch tensor; dim 0 is the batch dimension.
        y: Targets, indexable along dim 0 with the same permutation as ``x``.
        alpha: Parameter of the Beta(alpha, alpha) distribution the mixing
            weight is drawn from. ``alpha <= 0`` disables mixing (``lam = 1``).

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and ``lam`` is the mixing weight.
    """
    # One mixing weight for the whole batch
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0

    batch_size = x.size(0)
    # Draw the permutation on the device x already lives on, so the function
    # works for any input placement and does not depend on a notebook global.
    index = torch.randperm(batch_size, device=x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the loss against both MixUp targets, weighted by ``lam`` and ``1 - lam``.

    Args:
        criterion: Callable ``criterion(pred, target)`` returning a loss.
        pred: Model output for the mixed batch.
        y_a: Targets of the original samples.
        y_b: Targets of the mixed-in partner samples.
        lam: Weight given to ``y_a``; ``y_b`` receives ``1 - lam``.

    Returns:
        ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

# 3. The New Training Loop
def cross_validate_smart_mixup(metadata, X_gpu, y_gpu, epochs=20, batch_size=64):
    """Cross-validate ``SmartAudioEfficientNet`` with MixUp training over folds 1-10.

    Batches are cut manually from tensors that already live on ``device``. Each
    fold trains a fresh model with Adam (lr=3e-4) under mixed precision, using
    MixUp on every training batch; evaluation uses the unmixed held-out fold.

    Args:
        metadata: DataFrame with ``fold`` and ``class`` columns. Row ``i`` is
            assumed to describe ``X_gpu[i]`` / ``y_gpu[i]`` (TODO confirm).
        X_gpu: Feature tensor for all clips, indexed along dim 0; expected on
            the global ``device``.
        y_gpu: Integer class targets aligned with ``X_gpu``, on the same device.
        epochs: Training epochs per fold.
        batch_size: Batch size for training and evaluation.

    Returns:
        ``(fold_metrics, avg_metrics)``: per-fold metric dicts and their
        aggregate from ``aggregate_metrics``.
    """
    print(f'\nStarting Smart Transfer + MixUp Run')

    label_encoder = LabelEncoder()
    label_encoder.fit(metadata['class'])
    class_names = label_encoder.classes_
    evaluator = ModelEvaluator(class_names)
    fold_metrics = []

    # Fold id per row, as a plain array. Row *positions* (not DataFrame index
    # labels) are what must be used to index X_gpu / y_gpu; the two only coincide
    # when metadata carries a default 0..N-1 index.
    fold_of_row = metadata['fold'].to_numpy()

    for fold in range(1, 11):
        print(f'Fold {fold}...', end=' ')

        train_idx = torch.as_tensor(np.flatnonzero(fold_of_row != fold), device=device)
        test_idx = torch.as_tensor(np.flatnonzero(fold_of_row == fold), device=device)

        # Advanced indexing copies the selected rows on the device
        X_train, y_train = X_gpu[train_idx], y_gpu[train_idx]
        X_test, y_test = X_gpu[test_idx], y_gpu[test_idx]

        model = SmartAudioEfficientNet(num_classes=len(class_names)).to(device)

        # Slightly higher LR because we are using MixUp (harder task)
        optimizer = optim.Adam(model.parameters(), lr=3e-4)
        criterion = nn.CrossEntropyLoss()
        # One loss scaler per model, so the scale state adapted to a previous
        # fold's network is not carried into this one.
        scaler = GradScaler()

        model.train()
        num_samples = X_train.shape[0]

        for epoch in range(epochs):
            # Fresh shuffle each epoch, generated directly on the device
            perm = torch.randperm(num_samples, device=device)
            for i in range(0, num_samples, batch_size):
                indices = perm[i : i + batch_size]
                batch_X = X_train[indices]
                batch_y = y_train[indices]

                optimizer.zero_grad()
                with autocast():
                    # Apply MixUp: blended inputs, loss weighted over both target sets
                    inputs, targets_a, targets_b, lam = mixup_data(batch_X, batch_y)
                    outputs = model(inputs)
                    loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

        # Evaluation (No MixUp here)
        model.eval()
        with torch.no_grad():
            y_pred = []
            y_proba = []

            for i in range(0, X_test.shape[0], batch_size):
                batch_X = X_test[i : i + batch_size]
                outputs = model(batch_X)
                probs = F.softmax(outputs, dim=1)
                preds = torch.max(probs, 1)[1]
                y_pred.extend(preds.cpu().numpy())
                y_proba.extend(probs.cpu().numpy())

            y_test_cpu = y_test.cpu().numpy()

        metrics = evaluator.compute_metrics(np.array(y_test_cpu), np.array(y_pred), np.array(y_proba))
        fold_metrics.append(metrics)
        print(f"Acc: {metrics['accuracy']:.4f}")

    avg_metrics = aggregate_metrics(fold_metrics)
    print_average_metrics(avg_metrics)
    return fold_metrics, avg_metrics

# --- EXECUTION ---
# Runs the MixUp cross-validation on the device-resident mel tensors and stores
# the outcome under results['Smart_EfficientNet'].
# Requires `metadata`, `X_mel_tensor`, `y_mel_tensor` and `results` from
# earlier cells.
print('MODEL 10: SMART EFFICIENTNET + MIXUP')

smart_fold_metrics, smart_avg_metrics = cross_validate_smart_mixup(
    metadata,
    X_mel_tensor,
    y_mel_tensor,
    epochs=25,     # Increased slightly for MixUp (the function default is 20)
    batch_size=64
)

results['Smart_EfficientNet'] = {'fold_metrics': smart_fold_metrics, 'avg_metrics': smart_avg_metrics}

MODEL 10: SMART EFFICIENTNET + MIXUP

Starting Smart Transfer + MixUp Run
Fold 1... Acc: 0.7640
Fold 2... Acc: 0.7590
Fold 3... Acc: 0.7265
Fold 4... Acc: 0.8374
Fold 5... Acc: 0.8793
Fold 6... Acc: 0.7959
Fold 7... Acc: 0.8484
Fold 8... Acc: 0.7568
Fold 9... Acc: 0.8554
Fold 10... Acc: 0.8160
Accuracy: 0.8039 (+/- 0.0484)
Precision: 0.8157 (+/- 0.0432)
Recall: 0.8039 (+/- 0.0484)
F1 Score: 0.7974 (+/- 0.0518)
MCC: 0.7833 (+/- 0.0524)
ROC AUC: 0.9679 (+/- 0.0141)
