In [46]:
import torch
import torch.optim as optim
import json
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import librosa
import os
import numpy as np

In [51]:
import torch
import torch.nn as nn

class CNNSoundClassifier(nn.Module):
    def __init__(self, input_channels, n_mels, max_len, num_classes):
        super(CNNSoundClassifier, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        
        # Max pooling layer
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        
        # Flatten layer
        self.flatten = nn.Flatten()
        
        # Calculate the size after conv layers and pooling
        self.fc_input_size = 128 * (n_mels // 8) * (max_len // 8)  # Update this based on input size
        
        # Fully connected layers
        self.fc1 = nn.Linear(self.fc_input_size, 512)
        self.fc2 = nn.Linear(512, num_classes)
        
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))  # Apply Conv1 + ReLU + Pooling
        x = self.pool(self.relu(self.conv2(x)))  # Apply Conv2 + ReLU + Pooling
        x = self.pool(self.relu(self.conv3(x)))  # Apply Conv3 + ReLU + Pooling
        
        x = self.flatten(x)  # Flatten the output of the last conv layer
        x = self.fc1(x)  # Fully connected layer
        x = self.relu(x)
        x = self.fc2(x)  # Output layer
        return self.sigmoid(x)  # Sigmoid activation for binary classification


In [48]:
class AudioDataset(Dataset):
    def __init__(self, audio_files, annotations, sr=22050, n_mels=16, hop_length=256, max_len=4000, root_path='audios'):
        self.audio_files = audio_files
        self.annotations = annotations
        self.sr = sr
        self.n_mels = n_mels
        self.hop_length = hop_length
        self.max_len = max_len
        self.root_path = root_path

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        audio_file = self.audio_files[idx]
        audio_path = os.path.join(self.root_path, audio_file)
        y, _ = librosa.load(audio_path, sr=self.sr)

        # Generate Mel-spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=self.sr, n_mels=self.n_mels, hop_length=self.hop_length)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize Mel-spectrogram
        mel_spec = (mel_spec - mel_spec.mean()) / mel_spec.std()

        # Get timestamps and labels for the audio file
        timestamps = self.annotations[audio_file]
        labels = np.zeros(mel_spec.shape[1])
        for start, end in timestamps:
            start_idx = int(start * self.sr / self.hop_length)
            end_idx = int(end * self.sr / self.hop_length)
            labels[start_idx:end_idx] = 1

        # Ensure Mel-spectrogram and labels match the max length
        if mel_spec.shape[1] > self.max_len:
            mel_spec = mel_spec[:, :self.max_len]
            labels = labels[:self.max_len]
        else:
            pad_width = self.max_len - mel_spec.shape[1]
            mel_spec = np.pad(mel_spec, ((0, 0), (0, pad_width)), mode='constant')
            labels = np.pad(labels, (0, pad_width), mode='constant')

        # Convert Mel-spectrogram to the format expected by CNN: (1, n_mels, time_frames)
        mel_spec = torch.tensor(mel_spec, dtype=torch.float32).unsqueeze(0)  # Add channel dimension
        labels = torch.tensor(labels, dtype=torch.float32)  # Label remains 1D

        return mel_spec, labels

In [53]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    model.train()

    for epoch in range(epochs):
        total_train_loss = 0
        train_correct_predictions = 0
        train_total_samples = 0

        for mel_spec, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(mel_spec)
            loss = criterion(outputs.squeeze(), labels.squeeze())
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            predictions = (outputs.squeeze() > 0.5).float()
            train_correct_predictions += (predictions == labels).sum().item()
            train_total_samples += labels.numel()

        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = train_correct_predictions / train_total_samples

        model.eval()
        total_val_loss = 0
        val_correct_predictions = 0
        val_total_samples = 0

        with torch.no_grad():
            for mel_spec, labels in val_loader:
                outputs = model(mel_spec)
                loss = criterion(outputs.squeeze(), labels.squeeze())
                total_val_loss += loss.item()

                predictions = (outputs.squeeze() > 0.5).float()
                val_correct_predictions += (predictions == labels).sum().item()
                val_total_samples += labels.numel()

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = val_correct_predictions / val_total_samples

        print(f"Epoch {epoch+1}/{epochs} -> "
              f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4%}, "
              f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4%}")

In [54]:
with open("annotations.json", "r") as f:
    annotations = json.load(f)

audio_files = list(annotations.keys())

# Split dataset into training and validation subsets
train_files, val_files = train_test_split(audio_files, test_size=0.2, random_state=42)
train_dataset = AudioDataset(train_files, annotations)
val_dataset = AudioDataset(val_files, annotations)

# Create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# CNN Model Parameters
input_channels = 1  # Mel-spectrogram has 1 channel
num_filters = 32    # Number of filters in convolution layers
filter_size = 3     # Size of convolution filters
output_size = 1     # Binary classification (cheering or no cheering)

# Initialize CNN model
model = CNNSoundClassifier(input_channels, num_filters, filter_size, output_size)

# Define loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)

# Train model
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)




RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x128000 and 0x512)