In [None]:
import os
import torch
import pandas as pd
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import shutil
import time
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.notebook import tqdm
from torch.utils.data import random_split, Subset

In [None]:
# --- Dataset location and audio format ---
data_DIR = 'genres'                      # root folder; each sub-directory is treated as one genre
sample_rate = 22050                      # target sampling rate (Hz) clips are resampled to
duration = 30                            # clip length in seconds
total_samples = sample_rate * duration   # samples per clip at the target rate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG so weight initialisation, dropout masks and
# DataLoader shuffling are repeatable from a fresh kernel. The train/val/test
# split uses its own generator seeded with the same value.
SEED = 42
torch.manual_seed(SEED)

# Parameters for Mel Spectrogram
N_MELS = 128       # number of Mel bands
N_FFT = 2048       # FFT window size
HOP_LENGTH = 512   # hop between successive frames, in samples

def log(message, file_path="log.txt"):
    """Append ``message`` as a single line to the text file at ``file_path``.

    Logging is best-effort: a failure to write is reported on stdout and
    never raised, so a logging problem cannot abort a training run.

    Args:
        message: Value to record. It is formatted with ``str()`` semantics,
            so non-string values (numbers, dicts, ...) are accepted.
        file_path: Destination file, opened in append mode.
    """
    try:
        with open(file_path, 'a') as fl:
            # f-string formatting instead of ``message + '\n'`` so that a
            # non-str message is written rather than raising TypeError.
            fl.write(f"{message}\n")
    except Exception as e:
        # Report the real cause (permissions, path is a directory, encoding,
        # ...) instead of a fixed, usually wrong, "File not found".
        print(f"Could not write to log file '{file_path}': {e}")

class Data_Preprocessing(Dataset):
    """Dataset of fixed-length audio clips labelled by their parent folder.

    Every sub-directory of ``data_DIR`` is treated as one genre; genres are
    sorted by name and numbered from 0. Only files ending in ``.au`` are
    indexed. Each item is loaded, peak-normalised, resampled to
    ``sample_rate`` if needed, padded or trimmed to ``sample_rate * duration``
    samples and optionally passed through ``transform``.

    Args:
        data_DIR: Root folder containing one sub-directory per genre.
        sample_rate: Target sampling rate in Hz.
        duration: Target clip length in seconds.
        transform: Optional callable applied to the prepared waveform.
    """

    def __init__(self, data_DIR, sample_rate=22050, duration=30, transform=None):
        self.data_DIR = data_DIR
        self.sample_rate = sample_rate
        self.duration_samples = sample_rate * duration
        self.transform = transform
        self.audio_files = []
        self.labels = []
        self.label_map = {}
        # Resample modules keyed by source rate, built lazily and reused so
        # one is not constructed on every __getitem__ call.
        self._resamplers = {}
        self._load_dataset()

    def _load_dataset(self):
        """Fill ``label_map``, ``audio_files`` and ``labels`` from disk."""
        genres = sorted(
            d for d in os.listdir(self.data_DIR)
            if os.path.isdir(os.path.join(self.data_DIR, d))
        )
        for i, genre in enumerate(genres):
            self.label_map[genre] = i
            genre_path = os.path.join(self.data_DIR, genre)
            # os.listdir order is arbitrary; sorting keeps the item index,
            # and therefore any seeded split of this dataset, stable
            # across machines and runs.
            for audio_file in sorted(os.listdir(genre_path)):
                if audio_file.endswith('.au'):
                    self.audio_files.append(os.path.join(genre_path, audio_file))
                    self.labels.append(i)

    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.audio_files)

    @staticmethod
    def _peak_normalize(waveform):
        """Scale ``waveform`` so its largest absolute sample is 1.

        An empty or all-zero waveform is returned unchanged; dividing by a
        zero peak would otherwise fill the clip with NaN.
        """
        if waveform.numel() == 0:
            return waveform
        peak = waveform.abs().max()
        if peak > 0:
            return waveform / peak
        return waveform

    def _get_resampler(self, orig_freq):
        """Return a cached Resample module for ``orig_freq`` -> ``sample_rate``."""
        if orig_freq not in self._resamplers:
            self._resamplers[orig_freq] = torchaudio.transforms.Resample(
                orig_freq=orig_freq, new_freq=self.sample_rate
            ).to(device)
        return self._resamplers[orig_freq]

    def _fit_length(self, waveform):
        """Zero-pad on the right, or trim, dim 1 to ``duration_samples``."""
        n_samples = waveform.shape[1]
        if n_samples < self.duration_samples:
            return F.pad(waveform, (0, self.duration_samples - n_samples))
        return waveform[:, :self.duration_samples]

    def __getitem__(self, idx):
        """Return ``(features_or_waveform, label)`` for item ``idx``.

        The label is a scalar ``torch.long`` tensor. The first element is
        ``transform(waveform)`` when a transform was supplied, otherwise the
        prepared waveform itself.
        """
        audio_path = self.audio_files[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # normalized to [-1, 1]
        waveform, sr = torchaudio.load(audio_path)
        # NOTE: moving tensors to `device` here matches the on-device
        # transform; it presumably rules out DataLoader workers when
        # `device` is CUDA - confirm before raising num_workers.
        waveform = self._peak_normalize(waveform).to(device)

        # 1. Resampling to the target rate
        if sr != self.sample_rate:
            waveform = self._get_resampler(sr)(waveform)

        # 2. Padding and Trimming
        waveform = self._fit_length(waveform)

        # `is not None` rather than truthiness: container modules such as
        # nn.Sequential define __len__, so an empty one would be falsy.
        if self.transform is not None:
            return self.transform(waveform), label
        return waveform, label


# MelSpectrogram
# Two chained stages: Mel spectrogram (power=2.0) followed by conversion to
# decibels, both placed on `device`.
mel_spectrogram_transform = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    ),
    torchaudio.transforms.AmplitudeToDB(top_db=100.0),
)
mel_spectrogram_transform = mel_spectrogram_transform.to(device)

# Every item of `dataset` is (spectrogram features, label index).
dataset = Data_Preprocessing(
    data_DIR=data_DIR,
    sample_rate=sample_rate,
    duration=duration,
    transform=mel_spectrogram_transform,
)


batch_size = 16 #can be adjusted
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Record a short summary of what was indexed.
log(f"Total number of audio files: {len(dataset)}")
log(f"Number of genres: {len(dataset.label_map)}")
log(f"Genre map: {dataset.label_map}")

spectogram_dir = 'Spectrogram'   # output root; one sub-folder per genre
DPI = 100
Fig_size = (10, 4)

# label_map is filled in ascending index order, so position k of its key
# list is the genre with label k. Built once instead of on every iteration.
idx_to_genre = list(dataset.label_map.keys())

# Render every clip's spectrogram to <spectogram_dir>/<genre>/<clip name>.png
for i in range(len(dataset)):
    features, label_idx = dataset[i]
    # Drop the leading dimension of size 1 (if present) so imshow gets a 2-D array.
    features_np = features.squeeze(0).cpu().numpy()

    genre_name = idx_to_genre[label_idx.item()]

    genre_output_dir = os.path.join(spectogram_dir, genre_name)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(genre_output_dir, exist_ok=True)

    audio_filename = os.path.basename(dataset.audio_files[i])
    spectrogram_filename = f"{os.path.splitext(audio_filename)[0]}.png"

    save_path = os.path.join(genre_output_dir, spectrogram_filename)

    # Create plot and save, using explicit figure/axes handles rather than
    # pyplot's implicit "current figure" state.
    fig, ax = plt.subplots(figsize=Fig_size, dpi=DPI)
    try:
        image = ax.imshow(features_np, origin='lower', aspect='auto', cmap='magma')
        ax.set_title(f"Genre: {genre_name}")
        ax.set_xlabel("Time Frames")
        ax.set_ylabel("Mel Bins")
        fig.colorbar(image, ax=ax, format='%+2.0f dB')
        fig.tight_layout()
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # Always release the figure; this loop creates one per audio file.
        plt.close(fig)

log(f"All Mel spectrograms saved to '{spectogram_dir}' directory.")

'/kaggle/working/spectrograms.zip'

In [None]:
# Fractions of the full dataset assigned to each subset.
train_split = 0.8
validation_split = 0.1
test_split = 0.1

total_size = len(dataset)
train_size = int(train_split * total_size)
validation_size = int(validation_split * total_size)
# The test set takes whatever is left so the three sizes always add up to
# total_size even when the int() truncation above drops samples.
test_size = total_size - train_size - validation_size

# A dedicated, fixed-seed generator keeps the partition identical across runs.
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, validation_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

log(f"Dataset created with sizes: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}")

# Only the training loader is shuffled. Evaluation loaders keep a fixed order
# so the collected label/prediction lists are reproducible between runs.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class AudioCNN_model(nn.Module):
    """Three-stage 2-D CNN followed by a fully connected classifier.

    Each convolutional stage is Conv2d(3x3, padding 1) -> BatchNorm2d -> ReLU
    -> MaxPool2d(2x2), with 32, 64 and 128 output channels respectively. The
    flattened result feeds a 512 -> 256 -> ``num_classes`` classifier that
    uses BatchNorm1d, ReLU and Dropout(0.5) between its linear layers.

    Args:
        num_classes: Number of output logits.
        n_mels: Height of the input (dimension 2 of the input tensor).
        time_frames: Width of the input (dimension 3 of the input tensor).

    Input is expected as ``(batch, 1, n_mels, time_frames)``; ``forward``
    returns raw logits of shape ``(batch, num_classes)``.
    """

    def __init__(self, num_classes, n_mels, time_frames):
        super().__init__()

        # Convolutional 1
        self.conv1 = self._conv_stage(1, 32)
        # Convolutional 2
        self.conv2 = self._conv_stage(32, 64)
        # Convolutional 3
        self.conv3 = self._conv_stage(64, 128)

        # Probe the conv stack once to learn the flattened feature size.
        # The probe runs in eval mode: in training mode BatchNorm would fold
        # the statistics of this meaningless input into its running
        # mean/variance and bump num_batches_tracked before training starts.
        self.eval()
        with torch.no_grad():
            probe = torch.zeros(1, 1, n_mels, time_frames)
            probe = self.conv3(self.conv2(self.conv1(probe)))
            self._to_linear = probe.shape[1] * probe.shape[2] * probe.shape[3]
        self.train()

        self.fc = nn.Sequential(
            nn.Linear(self._to_linear, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(256, num_classes)
        )

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build one Conv -> BatchNorm -> ReLU -> MaxPool stage.

        The layer order inside the Sequential is fixed so state_dict keys
        (``convN.0.weight``, ``convN.1.running_mean``, ...) stay unchanged.
        """
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
        )

    def forward(self, x):
        """Map a ``(batch, 1, n_mels, time_frames)`` tensor to class logits."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        # Collapse channel/height/width into one feature axis per sample.
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

# One output logit per genre discovered on disk.
num_classes = len(dataset.label_map)

# Expected spectrogram width for a full-length clip: one frame per hop plus
# one (presumably matching the transform's default centred framing - confirm
# if the MelSpectrogram settings change).
time_frames = 1 + (sample_rate * duration) // HOP_LENGTH

model = AudioCNN_model(num_classes, N_MELS, time_frames)
model = model.to(device)

In [None]:

# Optimisation hyperparameters
Epochs = 20
LR = 1e-3

# Cross-entropy over raw logits; Adam with a small weight decay.
criterion = nn.CrossEntropyLoss()
optmz = optim.Adam(model.parameters(), weight_decay=1e-5, lr=LR)

# Folder that receives the learning-curve images.
Plot_DIR = 'Plot'

# Per-epoch history, appended to by the training loop.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

def train_model(model, dataloader, criterion, optmz, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Network to optimise; switched to training mode.
        dataloader: Iterable yielding ``(features, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optmz: Optimizer bound to ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_accuracy)``: the sample-weighted mean loss and
        the fraction of correctly classified samples over the pass.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for features, labels in tqdm(dataloader, desc="Training", leave=False):
        features, labels = features.to(device), labels.to(device)

        # Standard step: clear gradients, forward, backward, update.
        optmz.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optmz.step()

        # The loss is scaled by the batch size so a smaller final batch is
        # weighted correctly in the epoch average.
        loss_sum += batch_loss.item() * features.size(0)
        predicted = logits.detach().max(dim=1).indices
        n_seen += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    return loss_sum / n_seen, n_correct / n_seen


# Validation Function
def evaluate_model(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dataloader: Iterable yielding ``(features, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_accuracy, all_labels, all_predictions)``: the
        sample-weighted mean loss, the fraction of correct predictions, and
        two flat lists holding every true label and every predicted label in
        the order they were seen.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    all_labels, all_predictions = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for features, labels in tqdm(dataloader, desc="Evaluating", leave=False):
            features, labels = features.to(device), labels.to(device)

            logits = model(features)
            batch_loss = criterion(logits, labels).item()
            predicted = logits.max(dim=1).indices

            # Weight by batch size so the epoch mean is per-sample.
            loss_sum += batch_loss * features.size(0)
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    return loss_sum / n_seen, n_correct / n_seen, all_labels, all_predictions


log(f"Starting training on {device} for {Epochs} epochs...")
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint. Starting at 0.0 with a strict ">" would save nothing if
# validation accuracy never rose above zero, and the load below would then
# fail or silently pick up a stale file from an earlier run.
best_val_accuracy = float('-inf')

for epoch in range(Epochs):
    start_time = time.time()

    train_loss, train_accuracy = train_model(model, train_dataloader, criterion, optmz, device)
    val_loss, val_accuracy, _, _ = evaluate_model(model, val_dataloader, criterion, device)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_accuracy)
    val_accuracies.append(val_accuracy)
    end_time = time.time()
    epoch_duration = end_time - start_time

    log(f"Epoch {epoch+1}/{Epochs} - Duration: {epoch_duration:.2f}s")
    log(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
    log(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Keep only the weights of the epoch with the best validation accuracy.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model.pth')
        log(f"  --> Saved best model with Val Acc: {best_val_accuracy:.4f}")

log("\nTraining complete!")

log("\nEvaluating on Test Set...")
# map_location lets a checkpoint written on one device (e.g. GPU) be restored
# on whichever device this session is using.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
test_loss, test_accuracy, true_labels, predictions = evaluate_model(model, test_dataloader, criterion, device)

log(f"\nTest Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

log("\nClassification Report:")
target_names = list(dataset.label_map.keys())
# Pass the full label set explicitly: without it, a test split that happens
# to miss a genre makes the number of observed classes differ from
# len(target_names) and classification_report raises.
report_labels = list(range(len(target_names)))
log(classification_report(true_labels, predictions, labels=report_labels, target_names=target_names))




Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:

# Create the plot output folder if needed. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(Plot_DIR, exist_ok=True)
def plot_and_save(train_data, val_data, data_type, filename, ylabel, title):
    """Plot training vs. validation curves and save the figure under ``Plot_DIR``.

    Args:
        train_data: Per-epoch training values; its length sets the x-axis.
        val_data: Per-epoch validation values, plotted against the same epochs.
        data_type: Metric name used in the legend and the log message.
        filename: File name of the image, created inside ``Plot_DIR``.
        ylabel: Label for the y-axis.
        title: Figure title.
    """
    epochs = range(1, len(train_data) + 1)

    # Make sure the destination exists so the function does not depend on a
    # separate statement having created it first.
    os.makedirs(Plot_DIR, exist_ok=True)
    save_path = os.path.join(Plot_DIR, filename)

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(epochs, train_data, label=f'Training {data_type}', marker='o', markersize=4)
        ax.plot(epochs, val_data, label=f'Validation {data_type}', marker='x', markersize=4)

        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True)
        fig.tight_layout()

        fig.savefig(save_path)
        log(f"Saved {data_type} curve to {save_path}")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

# One figure per tracked metric: (training series, validation series, metric name, file name, title).
for train_series, val_series, metric, image_name, heading in (
    (train_losses, val_losses, 'Loss', 'loss_curves.png', 'Training and Validation Loss Over Epochs'),
    (train_accuracies, val_accuracies, 'Accuracy', 'accuracy_curves.png', 'Training and Validation Accuracy Over Epochs'),
):
    # The metric name doubles as the y-axis label.
    plot_and_save(train_series, val_series, metric, image_name, metric, heading)

'/kaggle/working/Plots.zip'