<a href="https://colab.research.google.com/github/anjorisarabhai/OIBSIP/blob/main/CNN_SPECTROGRAM_ARCHITECTURES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os
import numpy as np
from sklearn.metrics import classification_report, f1_score
from google.colab import files
import zipfile
import glob
import sys
import torch.nn.functional as F

# --- CONFIGURATION ---
LOCAL_ZIP_NAME = 'project_spectrogram_data.zip'
SPECTROGRAM_FOLDER_NAME = 'spectrogram'
LOCAL_CSV_NAME = 'consolidated_genres.csv'  # File name user uploads
CLEANED_CSV_NAME = 'cleaned_final_metadata.csv'  # New standardized name for the output
LOCAL_BASE_DIR = './project_data/'
SPECTROGRAM_FILE_EXTENSION = '.jpg'

# Global variables will be set after extraction
SPECTROGRAM_DIR = None
METADATA_FILE = None
NUM_CLASSES = None
train_loader = None
val_loader = None
genre_names = None

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- FILE UPLOAD AND AUTO-CORRECTING EXTRACTION UTILITY ---
def upload_and_extract_data():
    """Prompts for file upload, extracts ZIP, and finds the true paths."""
    global SPECTROGRAM_DIR, METADATA_FILE
    print(f"Please upload your '{LOCAL_ZIP_NAME}' file now:")
    uploaded = files.upload()
    if not uploaded:
        print("❌ ERROR: No file uploaded. Aborting setup.")
        return False
    uploaded_filename = list(uploaded.keys())[0]
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    try:
        with zipfile.ZipFile(uploaded_filename, 'r') as zip_ref:
            zip_ref.extractall(LOCAL_BASE_DIR)
        os.remove(uploaded_filename)
        # Find the correct, deep paths for SPECTROGRAM_DIR and METADATA_FILE
        spectro_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', SPECTROGRAM_FOLDER_NAME), recursive=True)
        csv_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)
        if not spectro_paths or not csv_paths:
            print(f"❌ FATAL ERROR: Could not find required files inside the extracted data.")
            return False

        # Set the global paths
        SPECTROGRAM_DIR = spectro_paths[0]
        # We will point METADATA_FILE to the temporary output of the cleansing script
        METADATA_FILE = os.path.join(os.path.dirname(csv_paths[0]), CLEANED_CSV_NAME)
        print(f"✅ SUCCESS: Data extracted and paths located.")
        return True
    except Exception as e:
        print(f"❌ ERROR: Failed during extraction or path search: {e}")
        return False

# --- DATA CLEANSING UTILITY (Identify all label columns dynamically) ---
def cleanse_metadata_file():
    """Filters the metadata CSV to use all available labels and synchronizes with physical files."""
    original_csv_path = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)[0]
    metadata_df = pd.read_csv(original_csv_path)
    print(f"Original metadata columns: {metadata_df.columns.tolist()}")
    print(f"Original metadata sample:\n{metadata_df.head()}")

    # Sync filenames/extensions
    metadata_df['filename'] = metadata_df['filename'].str.replace(r'\.(png|jpeg|jpg)$', SPECTROGRAM_FILE_EXTENSION, regex=True)
    image_files = set(os.listdir(SPECTROGRAM_DIR))
    print(f"Image files found: {len(image_files)}; Examples: {list(image_files)[:5]}")

    # Filter by filenames matching files
    filtered_df = metadata_df[metadata_df['filename'].isin(image_files)].copy()
    print(f"Rows after filename syncing: {len(filtered_df)}")

    # Detect label columns: assume all except 'filename' are labels
    filename_col = filtered_df.columns[0]
    genre_columns = list(filtered_df.columns[1:])  # treat all other columns as labels
    print(f"Label columns detected: {genre_columns}")

    # Prepare final DataFrame
    final_columns = [filename_col] + genre_columns
    final_df = filtered_df[final_columns].copy()

    # Keep only tracks with at least one label
    final_df['label_sum'] = final_df[genre_columns].sum(axis=1)
    final_df = final_df[final_df['label_sum'] > 0].drop(columns=['label_sum'])

    print(f"Rows after removing tracks with no labels: {len(final_df)}")
    print(f"Final labels count: {len(genre_columns)}")

    # Save cleaned data
    final_df.to_csv(METADATA_FILE, index=False)
    print(f"✅ DATA CLEANSING COMPLETE")
    return True

# --- CUSTOM MULTI-LABEL DATASET CLASS ---
class MultiLabelSpectrogramDataset(Dataset):
    def __init__(self, metadata_path, img_dir, transform=None):
        self.metadata_frame = pd.read_csv(metadata_path)
        self.img_dir = img_dir
        self.transform = transform
        # rename first column to 'filename' (for safety)
        self.metadata_frame.rename(columns={self.metadata_frame.columns[0]: 'filename'}, inplace=True)
        self.label_columns = self.metadata_frame.columns[1:].tolist()
        self.num_classes = len(self.label_columns)

    def __len__(self):
        return len(self.metadata_frame)

    def __getitem__(self, idx):
        filename = self.metadata_frame.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, filename)
        image = Image.open(img_path).convert('RGB')
        label_vector = self.metadata_frame.iloc[idx, 1:].values.astype(np.float32)
        label_tensor = torch.tensor(label_vector, dtype=torch.float32)
        if self.transform:
            image = self.transform(image)
        return image, label_tensor

# --- RUN INITIALIZATION ---
if upload_and_extract_data():
    if cleanse_metadata_file():
        # Data transforms
        train_transform = transforms.Compose([
            transforms.RandomRotation(15),
            transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
            transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
        ])
        val_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
            transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
        ])

        full_dataset = MultiLabelSpectrogramDataset(
            metadata_path=METADATA_FILE,
            img_dir=SPECTROGRAM_DIR
        )

        global NUM_CLASSES, train_loader, val_loader, genre_names
        NUM_CLASSES = full_dataset.num_classes
        train_size = int(0.8 * len(full_dataset))
        val_size = len(full_dataset) - train_size
        train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

        train_dataset.dataset.transform = train_transform
        val_dataset.dataset.transform = val_transform

        batch_size = 16
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        genre_names = full_dataset.label_columns

Using device: cuda
Please upload your 'project_spectrogram_data.zip' file now:


Saving project_spectrogram_data.zip to project_spectrogram_data.zip
✅ SUCCESS: Data extracted and paths located.
Original metadata columns: ['filename', 'Hip hop', 'Rap', 'R&B / Soul', 'Pop', 'Rock', 'Jazz', 'Blues', 'Country', 'Folk', 'Classical', 'Electronic', 'Funk', 'Reggae / Ska / Dub', 'Metal', 'Gospel / Religious', 'Latin', 'Disco / Dance', 'Ambient / Chill / New Age', 'Experimental / Avant-Garde', 'Opera / Musical Theater / Soundtrack', 'Vocal / A cappella', 'Dancehall / Hip House / Club', 'Psychedelic', 'Other / Niche']
Original metadata sample:
                                   filename  Hip hop  Rap  R&B / Soul  Pop  \
0  00b1397d-7f3e-4c59-bb42-ccd7fa17ee10.jpg        0    0           0    1   
1  00c9dcab-4abf-47f5-9755-c5c805b779c7.jpg        1    1           1    0   
2  012e3459-b54d-49e9-b48d-d0922d295c5a.jpg        0    0           0    1   
3  013a7fe3-0113-4604-a295-f74a0b88bf05.jpg        0    0           0    0   
4  0172efb9-b353-4e55-82cd-80136d98069f.jpg      

VGG16

In [None]:
# --- OPTIMIZED TRAINING FUNCTION (VGG TRANSFER LEARNING) ---

def train_and_evaluate_transfer_learning(model_name, model, train_loader, val_loader, NUM_CLASSES, genre_names, device):
    """
    Two-phase fine-tuning (VGG Transfer Learning) for Multi-Label Classification.
    Fixed device mismatch error and added epoch-wise progress.
    """
    # --- Phase 0: Adapt final layer & move to device
    if model_name.startswith('VGG'):
        num_ftrs = model.classifier[6].in_features
        model.classifier[6] = nn.Linear(num_ftrs, NUM_CLASSES).to(device)  # <-- FIXED DEVICE

    model.to(device)
    criterion = nn.BCEWithLogitsLoss()

    # --- Phase 1: Train classifier head only ---
    for param in model.features.parameters():
        param.requires_grad = False

    optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)
    num_epochs_phase1 = 5

    print(f"\n--- Phase 1: Training {model_name} Classifier Head (5 Epochs) ---")
    for epoch in range(num_epochs_phase1):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Phase 1 - Epoch {epoch+1}/{num_epochs_phase1}, Loss: {running_loss/len(train_loader):.4f}")

    # --- Phase 2: Fine-tune last layers ---
    for param in model.features[24:].parameters():
        param.requires_grad = True
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

    num_epochs_phase2 = 40
    print("\nPhase 2: Fine-tuning last layers (40 Epochs)...")
    for epoch in range(num_epochs_phase2):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Phase 2 - Epoch {epoch+1}/{num_epochs_phase2}, Loss: {running_loss/len(train_loader):.4f}")

    # --- FINAL EVALUATION ---
    model.eval()
    y_true_list, y_pred_list = [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predicted_probs = torch.sigmoid(outputs).cpu().numpy()
            predicted_labels = (predicted_probs > 0.5).astype(int)
            y_true_list.extend(labels.cpu().numpy())
            y_pred_list.extend(predicted_labels)

    y_true = np.array(y_true_list)
    y_pred = np.array(y_pred_list)

    exact_match_accuracy = (y_pred == y_true).all(axis=1).mean()
    from sklearn.metrics import f1_score, classification_report
    weighted_f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"\n--- Final Evaluation for {model_name} ---")
    print(f"Overall Weighted F1-score: {weighted_f1:.4f}")
    print(f"Exact Match Accuracy: {exact_match_accuracy:.4f}")
    print(classification_report(y_true, y_pred, target_names=genre_names, zero_division=0))


# --- EXECUTION: VGG-16 TRANSFER LEARNING ---

# Define VGG-16 model with pre-trained weights
vgg16_model = models.vgg16(weights='IMAGENET1K_V1')

# Run VGG-16 (Optimized for 50%+ F1-score)
train_and_evaluate_transfer_learning(
    model_name="VGG-16",
    model=vgg16_model,
    train_loader=train_loader,
    val_loader=val_loader,
    NUM_CLASSES=NUM_CLASSES,
    genre_names=genre_names,
    device=device
)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


100%|██████████| 528M/528M [00:05<00:00, 97.8MB/s]



--- Phase 1: Training VGG-16 Classifier Head (5 Epochs) ---
Phase 1 - Epoch 1/5, Loss: 0.2862
Phase 1 - Epoch 2/5, Loss: 0.2420
Phase 1 - Epoch 3/5, Loss: 0.2259
Phase 1 - Epoch 4/5, Loss: 0.2097
Phase 1 - Epoch 5/5, Loss: 0.2065

Phase 2: Fine-tuning last layers (40 Epochs)...
Phase 2 - Epoch 1/40, Loss: 0.1916
Phase 2 - Epoch 2/40, Loss: 0.1795
Phase 2 - Epoch 3/40, Loss: 0.1758
Phase 2 - Epoch 4/40, Loss: 0.1689
Phase 2 - Epoch 5/40, Loss: 0.1650
Phase 2 - Epoch 6/40, Loss: 0.1619
Phase 2 - Epoch 7/40, Loss: 0.1556
Phase 2 - Epoch 8/40, Loss: 0.1580
Phase 2 - Epoch 9/40, Loss: 0.1523
Phase 2 - Epoch 10/40, Loss: 0.1453
Phase 2 - Epoch 11/40, Loss: 0.1443
Phase 2 - Epoch 12/40, Loss: 0.1384
Phase 2 - Epoch 13/40, Loss: 0.1383
Phase 2 - Epoch 14/40, Loss: 0.1363
Phase 2 - Epoch 15/40, Loss: 0.1344
Phase 2 - Epoch 16/40, Loss: 0.1306
Phase 2 - Epoch 17/40, Loss: 0.1283
Phase 2 - Epoch 18/40, Loss: 0.1270
Phase 2 - Epoch 19/40, Loss: 0.1204
Phase 2 - Epoch 20/40, Loss: 0.1208
Phase 2 -

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os
import numpy as np
from sklearn.metrics import f1_score
from google.colab import files
import zipfile
import glob

# --- CONFIGURATION ---
LOCAL_ZIP_NAME = 'project_spectrogram_data.zip'
LOCAL_BASE_DIR = './project_data/'
SPECTROGRAM_FOLDER_NAME = 'spectrogram'
LOCAL_CSV_NAME = 'consolidated_genres.csv'
CLEANED_CSV_NAME = 'cleaned_final_metadata.csv'
SPECTROGRAM_FILE_EXTENSION = '.jpg'
BATCH_SIZE = 32
EPOCHS_PHASE1 = 5
EPOCHS_PHASE2 = 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

SPECTROGRAM_DIR = None
METADATA_FILE = None

# --- UPLOAD AND EXTRACT ZIP ---
def upload_and_extract_data():
    global SPECTROGRAM_DIR, METADATA_FILE
    print(f"Please upload your '{LOCAL_ZIP_NAME}' file now:")
    uploaded = files.upload()
    if not uploaded:
        print("❌ No file uploaded. Aborting.")
        return False
    zip_filename = list(uploaded.keys())[0]
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_BASE_DIR)
    os.remove(zip_filename)

    # Locate spectrogram folder and CSV file
    spectro_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', SPECTROGRAM_FOLDER_NAME), recursive=True)
    csv_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)
    if not spectro_paths or not csv_paths:
        print("❌ Could not find spectrogram folder or CSV in extracted data.")
        return False

    SPECTROGRAM_DIR = spectro_paths[0]
    METADATA_FILE = os.path.join(os.path.dirname(csv_paths[0]), CLEANED_CSV_NAME)
    print(f"✅ Extraction complete. Spectrogram dir: {SPECTROGRAM_DIR}")
    return True

# --- DATA CLEANSING ---
def cleanse_metadata_file():
    original_csv_path = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)[0]
    metadata_df = pd.read_csv(original_csv_path)

    # Standardize filename extensions for syncing
    metadata_df['filename'] = metadata_df['filename'].str.replace(r'\.(png|jpeg|jpg)$',
                                                                  SPECTROGRAM_FILE_EXTENSION,
                                                                  regex=True)
    image_files = set(os.listdir(SPECTROGRAM_DIR))
    metadata_df = metadata_df[metadata_df['filename'].isin(image_files)].copy()

    filename_col = metadata_df.columns[0]
    label_cols = list(metadata_df.columns[1:])

    final_columns = [filename_col] + label_cols
    final_df = metadata_df[final_columns].copy()

    # Keep only entries with at least one label
    final_df['label_sum'] = final_df[label_cols].sum(axis=1)
    final_df = final_df[final_df['label_sum'] > 0].drop(columns=['label_sum'])

    final_df.to_csv(METADATA_FILE, index=False)
    print(f"✅ Data cleansing complete with {len(final_df)} samples and {len(label_cols)} labels.")
    return label_cols

# --- DATASET DEFINITION ---
class MultiLabelSpectrogramDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform
        self.labels = self.df.columns[1:].tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.df.iloc[idx, 0])
        image = Image.open(img_path).convert('RGB')
        label_vec = self.df.iloc[idx, 1:].values.astype(np.float32)
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(label_vec, dtype=torch.float32)

# --- DATA AUGMENTATIONS ---
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.6, 1.0), ratio=(0.9, 1.1)),
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.1), ratio=(0.3, 3.3)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
])

# --- TRAINING FUNCTION ---
def train_model():
    if not upload_and_extract_data():
        return
    label_cols = cleanse_metadata_file()

    dataset = MultiLabelSpectrogramDataset(METADATA_FILE, SPECTROGRAM_DIR, transform=None)
    dataset_size = len(dataset)
    train_size = int(0.8 * dataset_size)
    val_size = dataset_size - train_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])
    train_ds.dataset.transform = train_transform
    val_ds.dataset.transform = val_transform

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

    model = models.vgg16(pretrained=True)
    model.classifier[6] = nn.Linear(model.classifier[6].in_features, len(label_cols))
    model = model.to(DEVICE)

    criterion = nn.BCEWithLogitsLoss()

    base_params = [p for n, p in model.named_parameters() if 'classifier' not in n]
    head_params = [p for n, p in model.named_parameters() if 'classifier' in n]

    optimizer = optim.AdamW([
        {'params': base_params, 'lr': 1e-5},
        {'params': head_params, 'lr': 1e-3}
    ], weight_decay=1e-4)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_PHASE2)

    # Phase 1: Train head
    print("--- Phase 1: Training classifier head ---")
    model.train()
    for epoch in range(EPOCHS_PHASE1):
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * inputs.size(0)
        print(f"Epoch {epoch + 1}/{EPOCHS_PHASE1} Loss: {total_loss / len(train_loader.dataset):.4f}")

    # Phase 2: Fine-tuning entire model
    print("\n--- Phase 2: Fine-tuning entire model ---")
    for epoch in range(EPOCHS_PHASE2):
        total_loss = 0
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        scheduler.step()
        print(f"Epoch {epoch + 1}/{EPOCHS_PHASE2} Loss: {total_loss / len(train_loader.dataset):.4f}")

    # Evaluation on validation set
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(DEVICE)
            outputs = model(inputs)
            preds = torch.sigmoid(outputs).cpu().numpy()
            all_preds.append(preds)
            all_targets.append(targets.numpy())

    all_preds = np.vstack(all_preds)
    all_targets = np.vstack(all_targets)
    pred_labels = (all_preds > 0.5).astype(int)
    weighted_f1 = f1_score(all_targets, pred_labels, average='weighted', zero_division=0)
    print(f"\nFinal Weighted F1 score on validation data: {weighted_f1:.4f}")

if __name__ == "__main__":
    train_model()

Using device: cuda
Please upload your 'project_spectrogram_data.zip' file now:


Saving project_spectrogram_data.zip to project_spectrogram_data.zip
✅ Extraction complete. Spectrogram dir: ./project_data/project_spectrogram_data/spectrogram
✅ Data cleansing complete with 980 samples and 24 labels.




--- Phase 1: Training classifier head ---
Epoch 1/5 Loss: 0.2971
Epoch 2/5 Loss: 0.2235
Epoch 3/5 Loss: 0.1830
Epoch 4/5 Loss: 0.1433
Epoch 5/5 Loss: 0.1051

--- Phase 2: Fine-tuning entire model ---
Epoch 1/50 Loss: 0.0000
Epoch 2/50 Loss: 0.0000
Epoch 3/50 Loss: 0.0000
Epoch 4/50 Loss: 0.0000
Epoch 5/50 Loss: 0.0000
Epoch 6/50 Loss: 0.0000
Epoch 7/50 Loss: 0.0000
Epoch 8/50 Loss: 0.0000
Epoch 9/50 Loss: 0.0000
Epoch 10/50 Loss: 0.0000
Epoch 11/50 Loss: 0.0000
Epoch 12/50 Loss: 0.0000
Epoch 13/50 Loss: 0.0000
Epoch 14/50 Loss: 0.0000
Epoch 15/50 Loss: 0.0000
Epoch 16/50 Loss: 0.0000
Epoch 17/50 Loss: 0.0000
Epoch 18/50 Loss: 0.0000
Epoch 19/50 Loss: 0.0000
Epoch 20/50 Loss: 0.0000
Epoch 21/50 Loss: 0.0000
Epoch 22/50 Loss: 0.0000
Epoch 23/50 Loss: 0.0000
Epoch 24/50 Loss: 0.0000
Epoch 25/50 Loss: 0.0000
Epoch 26/50 Loss: 0.0000
Epoch 27/50 Loss: 0.0000
Epoch 28/50 Loss: 0.0000
Epoch 29/50 Loss: 0.0000
Epoch 30/50 Loss: 0.0000
Epoch 31/50 Loss: 0.0000
Epoch 32/50 Loss: 0.0000
Epoch 33/

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os
import numpy as np
from sklearn.metrics import f1_score
from google.colab import files
import zipfile
import glob

# --- CONFIG ---
LOCAL_ZIP_NAME = 'project_spectrogram_data.zip'
LOCAL_BASE_DIR = './project_data/'
SPECTROGRAM_FOLDER_NAME = 'spectrogram'
LOCAL_CSV_NAME = 'consolidated_genres.csv'
CLEANED_CSV_NAME = 'cleaned_final_metadata.csv'
SPECTROGRAM_FILE_EXTENSION = '.jpg'
BATCH_SIZE = 32
EPOCHS_PHASE1 = 5
EPOCHS_PHASE2 = 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

SPECTROGRAM_DIR = None
METADATA_FILE = None

# --- UPLOAD AND EXTRACT ---
def upload_and_extract_data():
    global SPECTROGRAM_DIR, METADATA_FILE
    print(f"Please upload your '{LOCAL_ZIP_NAME}' file now:")
    uploaded = files.upload()
    if not uploaded:
        print("❌ No file uploaded. Aborting.")
        return False
    zip_filename = list(uploaded.keys())[0]
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_BASE_DIR)
    os.remove(zip_filename)

    spectro_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', SPECTROGRAM_FOLDER_NAME), recursive=True)
    csv_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)
    if not spectro_paths or not csv_paths:
        print("❌ Could not find spectrogram folder or CSV.")
        return False
    SPECTROGRAM_DIR = spectro_paths[0]
    METADATA_FILE = os.path.join(os.path.dirname(csv_paths[0]), CLEANED_CSV_NAME)
    print(f"✅ Extraction complete. Spectrogram dir: {SPECTROGRAM_DIR}")
    return True

# --- DATA CLEANSING ---
def cleanse_metadata_file():
    original_csv_path = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)[0]
    metadata_df = pd.read_csv(original_csv_path)
    metadata_df['filename'] = metadata_df['filename'].str.replace(r'\.(png|jpeg|jpg)$', SPECTROGRAM_FILE_EXTENSION, regex=True)
    image_files = set(os.listdir(SPECTROGRAM_DIR))
    metadata_df = metadata_df[metadata_df['filename'].isin(image_files)].copy()

    filename_col = metadata_df.columns[0]
    label_cols = list(metadata_df.columns[1:])
    final_columns = [filename_col] + label_cols
    final_df = metadata_df[final_columns].copy()

    # Keep tracks with at least one label
    final_df['label_sum'] = final_df[label_cols].sum(axis=1)
    final_df = final_df[final_df['label_sum'] > 0].drop(columns=['label_sum'])

    final_df.to_csv(METADATA_FILE, index=False)
    print(f"✅ Data cleansing complete with {len(final_df)} samples.")
    return label_cols

# --- Dataset ---
class MultiLabelSpectrogramDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform
        self.labels = self.df.columns[1:].tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filename = self.df.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, filename)
        image = Image.open(img_path).convert('RGB')
        label_vec = self.df.iloc[idx, 1:].values.astype(np.float32)
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(label_vec, dtype=torch.float32)

# --- Data Augmentations ---
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.6, 1.0)),
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.1), ratio=(0.3, 3.3)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
])

# --- Training Function ---
def train_model():
    if not upload_and_extract_data():
        return
    label_cols = cleanse_metadata_file()

    dataset = MultiLabelSpectrogramDataset(METADATA_FILE, SPECTROGRAM_DIR, transform=None)
    dataset_size = len(dataset)
    train_size = int(0.8 * dataset_size)
    val_size = dataset_size - train_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])
    train_ds.dataset.transform = train_transform
    val_ds.dataset.transform = val_transform

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    # Load VGG16
    model = models.vgg16(pretrained=True)
    model.classifier[6] = nn.Linear(model.classifier[6].in_features, len(label_cols))
    model = model.to(DEVICE)

    criterion = nn.BCEWithLogitsLoss()

    base_params = [p for n, p in model.named_parameters() if 'classifier' not in n]
    head_params = [p for n, p in model.named_parameters() if 'classifier' in n]

    optimizer = optim.AdamW([
        {'params': base_params, 'lr': 1e-5},
        {'params': head_params, 'lr': 1e-3}
    ], weight_decay=1e-4)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_PHASE2)

    # Phase 1
    print("--- Phase 1: Training classifier head ---")
    model.train()
    for epoch in range(EPOCHS_PHASE1):
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * inputs.size(0)
        print(f"Epoch {epoch+1}/{EPOCHS_PHASE1} Loss: {total_loss / len(train_loader.dataset):.4f}")

    # Phase 2
    print("\n--- Phase 2: Fine-tuning entire model ---")
    for epoch in range(EPOCHS_PHASE2):
        total_loss = 0
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * inputs.size(0)
        scheduler.step()
        print(f"Epoch {epoch+1}/{EPOCHS_PHASE2} Loss: {total_loss / len(train_loader.dataset):.4f}")

    # Evaluation
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(DEVICE)
            outputs = model(inputs)
            preds = torch.sigmoid(outputs).cpu().numpy()
            all_preds.append(preds)
            all_targets.append(targets.numpy())
    all_preds = np.vstack(all_preds)
    all_targets = np.vstack(all_targets)
    pred_labels = (all_preds > 0.5).astype(int)
    f1 = f1_score(all_targets, pred_labels, average='weighted', zero_division=0)
    print(f"\nFinal Weighted F1 score on validation data: {f1:.4f}")

if __name__ == "__main__":
    train_model()

Using device: cuda
Please upload your 'project_spectrogram_data.zip' file now:


Saving project_spectrogram_data.zip to project_spectrogram_data.zip
✅ Extraction complete. Spectrogram dir: ./project_data/project_spectrogram_data/spectrogram
✅ Data cleansing complete with 980 samples.




--- Phase 1: Training classifier head ---
Epoch 1/5 Loss: 0.2885
Epoch 2/5 Loss: 0.2213
Epoch 3/5 Loss: 0.1784
Epoch 4/5 Loss: 0.1358
Epoch 5/5 Loss: 0.0942

--- Phase 2: Fine-tuning entire model ---
Epoch 1/50 Loss: 0.0697
Epoch 2/50 Loss: 0.0537
Epoch 3/50 Loss: 0.0409
Epoch 4/50 Loss: 0.0305
Epoch 5/50 Loss: 0.0221
Epoch 6/50 Loss: 0.0195
Epoch 7/50 Loss: 0.0147
Epoch 8/50 Loss: 0.0144
Epoch 9/50 Loss: 0.0136
Epoch 10/50 Loss: 0.0111
Epoch 11/50 Loss: 0.0098
Epoch 12/50 Loss: 0.0071
Epoch 13/50 Loss: 0.0074
Epoch 14/50 Loss: 0.0072
Epoch 15/50 Loss: 0.0051
Epoch 16/50 Loss: 0.0045
Epoch 17/50 Loss: 0.0051
Epoch 18/50 Loss: 0.0036
Epoch 19/50 Loss: 0.0057
Epoch 20/50 Loss: 0.0043
Epoch 21/50 Loss: 0.0048
Epoch 22/50 Loss: 0.0034
Epoch 23/50 Loss: 0.0021
Epoch 24/50 Loss: 0.0020
Epoch 25/50 Loss: 0.0023
Epoch 26/50 Loss: 0.0011
Epoch 27/50 Loss: 0.0020
Epoch 28/50 Loss: 0.0019
Epoch 29/50 Loss: 0.0028
Epoch 30/50 Loss: 0.0021
Epoch 31/50 Loss: 0.0011
Epoch 32/50 Loss: 0.0011
Epoch 33/

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix
from google.colab import files
import zipfile
import glob

# --- CONFIG ---
LOCAL_ZIP_NAME = 'project_spectrogram_data.zip'
LOCAL_BASE_DIR = './project_data/'
SPECTROGRAM_FOLDER_NAME = 'spectrogram'
LOCAL_CSV_NAME = 'consolidated_genres.csv'
CLEANED_CSV_NAME = 'cleaned_final_metadata.csv'
SPECTROGRAM_FILE_EXTENSION = '.jpg'
BATCH_SIZE = 32
EPOCHS_PHASE1 = 5
EPOCHS_PHASE2 = 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

SPECTROGRAM_DIR = None
METADATA_FILE = None

# --- UPLOAD AND EXTRACT ---
def upload_and_extract_data():
    global SPECTROGRAM_DIR, METADATA_FILE
    print(f"Please upload your '{LOCAL_ZIP_NAME}' file now:")
    uploaded = files.upload()
    if not uploaded:
        print("❌ No file uploaded. Aborting.")
        return False
    zip_filename = list(uploaded.keys())[0]
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_BASE_DIR)
    os.remove(zip_filename)

    spectro_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', SPECTROGRAM_FOLDER_NAME), recursive=True)
    csv_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)
    if not spectro_paths or not csv_paths:
        print("❌ Could not find spectrogram folder or CSV.")
        return False
    SPECTROGRAM_DIR = spectro_paths[0]
    METADATA_FILE = os.path.join(os.path.dirname(csv_paths[0]), CLEANED_CSV_NAME)
    print(f"✅ Extraction complete. Spectrogram dir: {SPECTROGRAM_DIR}")
    return True

# --- DATA CLEANSING ---
def cleanse_metadata_file():
    original_csv_path = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)[0]
    metadata_df = pd.read_csv(original_csv_path)
    metadata_df['filename'] = metadata_df['filename'].str.replace(r'\.(png|jpeg|jpg)$', SPECTROGRAM_FILE_EXTENSION, regex=True)
    image_files = set(os.listdir(SPECTROGRAM_DIR))
    metadata_df = metadata_df[metadata_df['filename'].isin(image_files)].copy()

    filename_col = metadata_df.columns[0]
    label_cols = list(metadata_df.columns[1:])
    final_columns = [filename_col] + label_cols
    final_df = metadata_df[final_columns].copy()

    final_df['label_sum'] = final_df[label_cols].sum(axis=1)
    final_df = final_df[final_df['label_sum'] > 0].drop(columns=['label_sum'])

    final_df.to_csv(METADATA_FILE, index=False)
    print(f"✅ Data cleansing complete with {len(final_df)} samples.")
    return label_cols

# --- Dataset ---
class MultiLabelSpectrogramDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform
        self.labels = self.df.columns[1:].tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filename = self.df.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, filename)
        image = Image.open(img_path).convert('RGB')
        label_vec = self.df.iloc[idx, 1:].values.astype(np.float32)
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(label_vec, dtype=torch.float32)

# --- Data Augmentations ---
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.6, 1.0)),
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.1), ratio=(0.3, 3.3)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
])

# --- TRAINING AND EVALUATION ---
def train_and_evaluate():
    if not upload_and_extract_data():
        return
    label_cols = cleanse_metadata_file()

    dataset = MultiLabelSpectrogramDataset(METADATA_FILE, SPECTROGRAM_DIR, transform=None)
    dataset_size = len(dataset)
    train_size = int(0.8 * dataset_size)
    val_size = dataset_size - train_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])
    train_ds.dataset.transform = train_transform
    val_ds.dataset.transform = val_transform

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    # Load VGG16
    model = models.vgg16(pretrained=True)
    model.classifier[6] = nn.Linear(model.classifier[6].in_features, len(label_cols))
    model = model.to(DEVICE)

    criterion = nn.BCEWithLogitsLoss()

    base_params = [p for n, p in model.named_parameters() if 'classifier' not in n]
    head_params = [p for n, p in model.named_parameters() if 'classifier' in n]

    optimizer = optim.AdamW([
        {'params': base_params, 'lr': 1e-5},
        {'params': head_params, 'lr': 1e-3}
    ], weight_decay=1e-4)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_PHASE2)

    # --- Phase 1: Train classifier head ---
    print("--- Phase 1: Training classifier head ---")
    for epoch in range(EPOCHS_PHASE1):
        model.train()
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * inputs.size(0)
        print(f"Epoch {epoch+1}/{EPOCHS_PHASE1} Loss: {total_loss/len(train_loader.dataset):.4f}")

    # --- Phase 2: Fine-tune entire model ---
    print("\n--- Phase 2: Fine-tuning entire model ---")
    for epoch in range(EPOCHS_PHASE2):
        model.train()
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * inputs.size(0)
        scheduler.step()
        print(f"Epoch {epoch+1}/{EPOCHS_PHASE2} Loss: {total_loss/len(train_loader.dataset):.4f}")

    # --- Evaluation ---
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(DEVICE)
            outputs = model(inputs)
            preds = torch.sigmoid(outputs).cpu().numpy()
            all_preds.append(preds)
            all_targets.append(targets.numpy())

    all_preds = np.vstack(all_preds)
    all_targets = np.vstack(all_targets)
    pred_labels = (all_preds > 0.5).astype(int)

    # Overall exact-match accuracy (all labels must match)
    overall_accuracy = np.mean(np.all(pred_labels == all_targets, axis=1))
    print(f"\nOverall Exact-Match Accuracy: {overall_accuracy:.4f}")

    # Overall weighted metrics
    weighted_f1 = f1_score(all_targets, pred_labels, average='weighted', zero_division=0)
    weighted_precision = precision_score(all_targets, pred_labels, average='weighted', zero_division=0)
    weighted_recall = recall_score(all_targets, pred_labels, average='weighted', zero_division=0)
    print(f"\nOverall Weighted Metrics:")
    print(f"F1 Score: {weighted_f1:.4f}")
    print(f"Precision: {weighted_precision:.4f}")
    print(f"Recall: {weighted_recall:.4f}")

    # Per-label metrics
    metrics_list = []
    label_names = dataset.labels
    for i, label in enumerate(label_names):
        acc = accuracy_score(all_targets[:, i], pred_labels[:, i])
        prec = precision_score(all_targets[:, i], pred_labels[:, i], zero_division=0)
        rec = recall_score(all_targets[:, i], pred_labels[:, i], zero_division=0)
        f1 = f1_score(all_targets[:, i], pred_labels[:, i], zero_division=0)
        metrics_list.append([label, acc, prec, rec, f1])

    metrics_df = pd.DataFrame(metrics_list, columns=['Label', 'Accuracy', 'Precision', 'Recall', 'F1'])
    print("\nPer-label Metrics:")
    print(metrics_df)

    # Multilabel confusion matrices
    conf_matrices = multilabel_confusion_matrix(all_targets, pred_labels)
    for i, label in enumerate(label_names):
        print(f"\nConfusion Matrix for '{label}':")
        print(conf_matrices[i])

if __name__ == "__main__":
    train_and_evaluate()

Using device: cuda
Please upload your 'project_spectrogram_data.zip' file now:


Saving project_spectrogram_data.zip to project_spectrogram_data.zip
✅ Extraction complete. Spectrogram dir: ./project_data/project_spectrogram_data/spectrogram
✅ Data cleansing complete with 980 samples.




--- Phase 1: Training classifier head ---
Epoch 1/5 Loss: 0.3085
Epoch 2/5 Loss: 0.2238
Epoch 3/5 Loss: 0.1931
Epoch 4/5 Loss: 0.1528
Epoch 5/5 Loss: 0.1112

--- Phase 2: Fine-tuning entire model ---
Epoch 1/50 Loss: 0.0773
Epoch 2/50 Loss: 0.0587
Epoch 3/50 Loss: 0.0431
Epoch 4/50 Loss: 0.0345
Epoch 5/50 Loss: 0.0255
Epoch 6/50 Loss: 0.0224
Epoch 7/50 Loss: 0.0153
Epoch 8/50 Loss: 0.0156
Epoch 9/50 Loss: 0.0129
Epoch 10/50 Loss: 0.0159
Epoch 11/50 Loss: 0.0119
Epoch 12/50 Loss: 0.0073
Epoch 13/50 Loss: 0.0101
Epoch 14/50 Loss: 0.0083
Epoch 15/50 Loss: 0.0059
Epoch 16/50 Loss: 0.0050
Epoch 17/50 Loss: 0.0035
Epoch 18/50 Loss: 0.0046
Epoch 19/50 Loss: 0.0034
Epoch 20/50 Loss: 0.0036
Epoch 21/50 Loss: 0.0020
Epoch 22/50 Loss: 0.0035
Epoch 23/50 Loss: 0.0030
Epoch 24/50 Loss: 0.0031
Epoch 25/50 Loss: 0.0020
Epoch 26/50 Loss: 0.0036
Epoch 27/50 Loss: 0.0024
Epoch 28/50 Loss: 0.0018
Epoch 29/50 Loss: 0.0019
Epoch 30/50 Loss: 0.0007
Epoch 31/50 Loss: 0.0019
Epoch 32/50 Loss: 0.0011
Epoch 33/

EFFICIENT B0

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import numpy as np
from sklearn.metrics import classification_report, f1_score
from google.colab import files
import zipfile
import glob
import os

# --- CONFIG ---
LOCAL_ZIP_NAME = 'project_spectrogram_data.zip'
LOCAL_BASE_DIR = './project_data/'
SPECTROGRAM_FOLDER_NAME = 'spectrogram'
LOCAL_CSV_NAME = 'consolidated_genres.csv'
CLEANED_CSV_NAME = 'cleaned_final_metadata.csv'
SPECTROGRAM_FILE_EXTENSION = '.jpg'
BATCH_SIZE = 32
EPOCHS_HEAD = 5
EPOCHS_FULL = 75
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

SPECTROGRAM_DIR = None
METADATA_FILE = None

def upload_and_extract_data():
    global SPECTROGRAM_DIR, METADATA_FILE
    print(f"Upload your '{LOCAL_ZIP_NAME}':")
    uploaded = files.upload()
    if not uploaded:
        print("❌ No upload detected")
        return False
    zip_filename = list(uploaded.keys())[0]
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_BASE_DIR)
    os.remove(zip_filename)
    # Find paths
    spectro_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', SPECTROGRAM_FOLDER_NAME), recursive=True)
    csv_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)
    if not spectro_paths or not csv_paths:
        print("❌ Required data not found!")
        return False
    SPECTROGRAM_DIR = spectro_paths[0]
    METADATA_FILE = os.path.join(os.path.dirname(csv_paths[0]), CLEANED_CSV_NAME)
    print("✅ Extraction successful.")
    return True

def cleanse_metadata_file():
    original_csv_path = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)[0]
    df = pd.read_csv(original_csv_path)
    # Fix extensions
    df['filename'] = df['filename'].str.replace(r'\.(png|jpeg|jpg)$', SPECTROGRAM_FILE_EXTENSION, regex=True)
    files_set = set(os.listdir(SPECTROGRAM_DIR))
    df = df[df['filename'].isin(files_set)].copy()
    label_cols = df.columns[1:]
    # Remove if no labels
    df['label_sum'] = df[label_cols].sum(axis=1)
    df = df[df['label_sum'] > 0].drop(columns=['label_sum'])
    df.to_csv(METADATA_FILE, index=False)
    print(f"✅ Data cleansed: {len(df)} samples, {len(label_cols)} labels")
    return label_cols

class SpectrogramDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform
        self.labels = self.df.columns[1:].tolist()
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.df.iloc[idx,0])
        img = Image.open(img_path).convert('RGB')
        label = self.df.iloc[idx,1:].values.astype(np.float32)
        if self.transform:
            img = self.transform(img)
        return img, torch.tensor(label)

# Data Augmentations
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.5, 1.0)),
    transforms.RandomRotation(45),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.5, 0.5, 0.5, 0.15),
    transforms.RandomErasing(p=0.7),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3,1,1) if x.size(0) == 1 else x),
])
val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3,1,1) if x.size(0) == 1 else x),
])

def train():
    if not upload_and_extract_data():
        return
    labels = cleanse_metadata_file()
    dataset = SpectrogramDataset(METADATA_FILE, SPECTROGRAM_DIR, transform=None)
    train_len = int(len(dataset)*0.8)
    val_len = len(dataset) - train_len
    train_ds, val_ds = random_split(dataset, [train_len, val_len])
    train_ds.dataset.transform = train_transform
    val_ds.dataset.transform = val_transform
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    # Load model
    model = models.efficientnet_b0(pretrained=True)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, len(labels))
    model = model.to(DEVICE)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3,
                                              steps_per_epoch=len(train_loader), epochs=EPOCHS_HEAD+EPOCHS_FULL)

    # Phase 1: Train only classifier head
    for param in model.features.parameters():
        param.requires_grad = False
    print("Training classifier head...")
    for epoch in range(EPOCHS_HEAD):
        model.train()
        total_loss = 0
        for imgs, targets in train_loader:
            imgs, targets = imgs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item() * imgs.size(0)
        print(f"Head Epoch {epoch+1} Loss: {total_loss/len(train_loader.dataset):.4f}")

    # Phase 2: Fine-tune entire model
    for param in model.features.parameters():
        param.requires_grad = True
    print("Fine-tuning entire model...")
    for epoch in range(EPOCHS_FULL):
        model.train()
        total_loss = 0
        for imgs, targets in train_loader:
            imgs, targets = imgs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item() * imgs.size(0)
        print(f"Fine-tune Epoch {epoch+1} Loss: {total_loss/len(train_loader.dataset):.4f}")

    # Evaluation
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs = imgs.to(DEVICE)
            outputs = model(imgs)
            preds = torch.sigmoid(outputs).cpu().numpy()
            all_preds.append(preds)
            all_targets.append(targets.numpy())
    all_preds = np.vstack(all_preds)
    all_targets = np.vstack(all_targets)
    pred_labels = (all_preds > 0.5).astype(int)

    from sklearn.metrics import classification_report, f1_score
    weighted_f1 = f1_score(all_targets, pred_labels, average='weighted', zero_division=0)
    print(f"\nWeighted F1 score: {weighted_f1:.4f}")
    print("\nClassification Report:\n")
    print(classification_report(all_targets, pred_labels, target_names=labels, zero_division=0))

if __name__ == "__main__":
    train()

Using device: cuda
Upload your 'project_spectrogram_data.zip':


Saving project_spectrogram_data.zip to project_spectrogram_data.zip




✅ Extraction successful.
✅ Data cleansed: 980 samples, 24 labels
Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|██████████| 20.5M/20.5M [00:00<00:00, 114MB/s] 


Training classifier head...
Head Epoch 1 Loss: 0.6583
Head Epoch 2 Loss: 0.5846
Head Epoch 3 Loss: 0.5128
Head Epoch 4 Loss: 0.4410
Head Epoch 5 Loss: 0.3815
Fine-tuning entire model...
Fine-tune Epoch 1 Loss: 0.2761
Fine-tune Epoch 2 Loss: 0.2122
Fine-tune Epoch 3 Loss: 0.1780
Fine-tune Epoch 4 Loss: 0.1509
Fine-tune Epoch 5 Loss: 0.1278
Fine-tune Epoch 6 Loss: 0.1081
Fine-tune Epoch 7 Loss: 0.0953
Fine-tune Epoch 8 Loss: 0.0821
Fine-tune Epoch 9 Loss: 0.0757
Fine-tune Epoch 10 Loss: 0.0691
Fine-tune Epoch 11 Loss: 0.0602
Fine-tune Epoch 12 Loss: 0.0613
Fine-tune Epoch 13 Loss: 0.0595
Fine-tune Epoch 14 Loss: 0.0480
Fine-tune Epoch 15 Loss: 0.0431
Fine-tune Epoch 16 Loss: 0.0337
Fine-tune Epoch 17 Loss: 0.0249
Fine-tune Epoch 18 Loss: 0.0255
Fine-tune Epoch 19 Loss: 0.0241
Fine-tune Epoch 20 Loss: 0.0202
Fine-tune Epoch 21 Loss: 0.0177
Fine-tune Epoch 22 Loss: 0.0142
Fine-tune Epoch 23 Loss: 0.0149
Fine-tune Epoch 24 Loss: 0.0142
Fine-tune Epoch 25 Loss: 0.0145
Fine-tune Epoch 26 Loss

EFFICIENTNET B3

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import os
import numpy as np
from sklearn.metrics import f1_score, classification_report
from google.colab import files
import zipfile
import glob

# -------------- CONFIG --------------
LOCAL_ZIP_NAME = 'project_spectrogram_data.zip'
LOCAL_BASE_DIR = './project_data/'
SPECTROGRAM_FOLDER_NAME = 'spectrogram'
LOCAL_CSV_NAME = 'consolidated_genres.csv'
CLEANED_CSV_NAME = 'cleaned_final_metadata.csv'
SPECTROGRAM_FILE_EXTENSION = '.jpg'
BATCH_SIZE = 32
EPOCHS_HEAD = 8
EPOCHS_FULL = 90
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# -------------- DATA PREP --------------
SPECTROGRAM_DIR = None
METADATA_FILE = None

def upload_and_extract():
    global SPECTROGRAM_DIR, METADATA_FILE
    print(f"Upload '{LOCAL_ZIP_NAME}' now:")
    uploaded = files.upload()
    zip_filename = list(uploaded.keys())[0]
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_BASE_DIR)
    os.remove(zip_filename)
    spectros = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', SPECTROGRAM_FOLDER_NAME), recursive=True)
    csvs = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)
    if not spectros or not csvs:
        raise RuntimeError("Spectrogram folder or CSV not found after extraction!")
    SPECTROGRAM_DIR = spectros[0]
    METADATA_FILE = os.path.join(os.path.dirname(csvs[0]), CLEANED_CSV_NAME)

def cleanse_metadata():
    orig_csv = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)[0]
    df = pd.read_csv(orig_csv)
    df['filename'] = df['filename'].str.replace(r'\.(png|jpeg|jpg)$', SPECTROGRAM_FILE_EXTENSION, regex=True)
    valid_files = set(os.listdir(SPECTROGRAM_DIR))
    df = df[df['filename'].isin(valid_files)].copy()
    label_cols = df.columns[1:]
    df['label_sum'] = df[label_cols].sum(axis=1)
    df = df[df['label_sum'] > 0].drop(columns=['label_sum'])
    df.to_csv(METADATA_FILE, index=False)
    print(f"✅ {len(df)} samples after cleaning. {len(label_cols)} labels.")
    return label_cols

class SpectrogramDS(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform
        self.labels = self.df.columns[1:].tolist()
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        img_fp = os.path.join(self.img_dir, self.df.iloc[idx,0])
        img = Image.open(img_fp).convert('RGB')
        label = self.df.iloc[idx,1:].values.astype(np.float32)
        if self.transform:
            img = self.transform(img)
        return img, torch.tensor(label, dtype=torch.float32)

# -------------- AUGMENTATION --------------
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(300, scale=(0.35, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(45),
    transforms.ColorJitter(.6, .6, .6, .2),
    transforms.RandomErasing(p=0.8, value='random'),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3,1,1) if x.size(0)==1 else x),
])
val_transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3,1,1) if x.size(0)==1 else x)
])

# -------------- ADVANCED MODEL (EfficientNet-B3) --------------
# If not installed: pip install timm
import timm
def get_model(num_classes):
    model = timm.create_model('efficientnet_b3a', pretrained=True)
    model.classifier = nn.Linear(model.classifier.in_features, num_classes)
    return model.to(DEVICE)

# -------------- TRAINING + EVAL --------------
def get_class_weights(ds):
    y = pd.read_csv(METADATA_FILE).iloc[:,1:].values
    pos_weights = 1. / (np.sum(y, axis=0) + 1e-4)
    pos_weights = torch.tensor(pos_weights, dtype=torch.float32).to(DEVICE)
    return pos_weights

def tune_thresholds(y_true, y_pred):
    # Tune per-class threshold to maximize f1
    from sklearn.metrics import f1_score
    thresholds = np.arange(0.3, 0.7, 0.02)
    best_thr = []
    for i in range(y_true.shape[1]):
        best = 0.5; bestf1=0
        for t in thresholds:
            f = f1_score(y_true[:,i], (y_pred[:,i]>=t).astype(int), zero_division=0)
            if f > bestf1:
                bestf1 = f
                best = t
        best_thr.append(best)
    return np.array(best_thr)

def main():
    upload_and_extract()
    label_cols = cleanse_metadata()
    dataset = SpectrogramDS(METADATA_FILE, SPECTROGRAM_DIR, transform=None)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])
    train_ds.dataset.transform = train_transform
    val_ds.dataset.transform = val_transform
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    # Model & optimizer
    model = get_model(len(label_cols))
    weights = get_class_weights(dataset)
    criterion = nn.BCEWithLogitsLoss(pos_weight=weights)
    optimizer = optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-3,
                                              steps_per_epoch=len(train_loader),
                                              epochs=EPOCHS_HEAD+EPOCHS_FULL)

    # Freeze backbone for head train
    for p in model.parameters(): p.requires_grad = False
    for p in model.classifier.parameters(): p.requires_grad = True

    print("Phase 1: Classifier head...")
    for epoch in range(EPOCHS_HEAD):
        model.train(); epoch_loss=0
        for imgs, ys in train_loader:
            imgs, ys = imgs.to(DEVICE), ys.to(DEVICE)
            optimizer.zero_grad()
            outs = model(imgs)
            loss = criterion(outs, ys)
            loss.backward(); optimizer.step(); scheduler.step()
            epoch_loss += loss.item() * imgs.size(0)
        print(f"Epoch {epoch+1}/{EPOCHS_HEAD}, Loss: {epoch_loss/len(train_loader.dataset):.4f}")

    # Unfreeze whole model
    for p in model.parameters(): p.requires_grad = True

    print("Phase 2: Full fine-tuning...")
    for epoch in range(EPOCHS_FULL):
        model.train(); epoch_loss=0
        for imgs, ys in train_loader:
            imgs, ys = imgs.to(DEVICE), ys.to(DEVICE)
            optimizer.zero_grad()
            outs = model(imgs)
            loss = criterion(outs, ys)
            loss.backward(); optimizer.step(); scheduler.step()
            epoch_loss += loss.item() * imgs.size(0)
        print(f"Epoch {epoch+1}/{EPOCHS_FULL}, Loss: {epoch_loss/len(train_loader.dataset):.4f}")

    # Evaluation
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for imgs, ys in val_loader:
            imgs = imgs.to(DEVICE)
            outs = torch.sigmoid(model(imgs)).cpu().numpy()
            all_preds.append(outs)
            all_targets.append(ys.numpy())
    all_preds, all_targets = np.vstack(all_preds), np.vstack(all_targets)

    # ----------- Threshold tuning -----------
    best_thr = tune_thresholds(all_targets, all_preds)
    tuned_pred = (all_preds >= best_thr).astype(int)
    f1 = f1_score(all_targets, tuned_pred, average='weighted', zero_division=0)
    print(f"\nWeighted F1 after threshold tuning: {f1:.4f}")
    print("\nClassification Report:\n")
    print(classification_report(all_targets, tuned_pred, target_names=label_cols, zero_division=0))
    exact_match = np.mean(np.all(tuned_pred == all_targets, axis=1))
    print(f"\nExact Match Accuracy: {exact_match:.4f}")

if __name__ == "__main__":
    main()

Upload 'project_spectrogram_data.zip' now:


Saving project_spectrogram_data.zip to project_spectrogram_data.zip
✅ 980 samples after cleaning. 24 labels.


  model = create_fn(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

Phase 1: Classifier head...
Epoch 1/8, Loss: 0.5486
Epoch 2/8, Loss: 0.3943
Epoch 3/8, Loss: 0.2704
Epoch 4/8, Loss: 0.1816
Epoch 5/8, Loss: 0.1188
Epoch 6/8, Loss: 0.0806
Epoch 7/8, Loss: 0.0576
Epoch 8/8, Loss: 0.0433
Phase 2: Full fine-tuning...
Epoch 1/90, Loss: 0.0110
Epoch 2/90, Loss: 0.0077
Epoch 3/90, Loss: 0.0075
Epoch 4/90, Loss: 0.0078
Epoch 5/90, Loss: 0.0076
Epoch 6/90, Loss: 0.0075
Epoch 7/90, Loss: 0.0079
Epoch 8/90, Loss: 0.0075
Epoch 9/90, Loss: 0.0070
Epoch 10/90, Loss: 0.0069
Epoch 11/90, Loss: 0.0069
Epoch 12/90, Loss: 0.0063
Epoch 13/90, Loss: 0.0067
Epoch 14/90, Loss: 0.0066
Epoch 15/90, Loss: 0.0062
Epoch 16/90, Loss: 0.0050
Epoch 17/90, Loss: 0.0045
Epoch 18/90, Loss: 0.0040
Epoch 19/90, Loss: 0.0044
Epoch 20/90, Loss: 0.0050
Epoch 21/90, Loss: 0.0050
Epoch 22/90, Loss: 0.0043
Epoch 23/90, Loss: 0.0044
Epoch 24/90, Loss: 0.0049
Epoch 25/90, Loss: 0.0058
Epoch 26/90, Loss: 0.0057
Epoch 27/90, Loss: 0.0050
Epoch 28/90, Loss: 0.0042
Epoch 29/90, Loss: 0.0040
Epoch 

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import numpy as np
from sklearn.metrics import f1_score, classification_report
from google.colab import files
import zipfile
import glob
import os

# CONFIG
LOCAL_ZIP_NAME = 'project_spectrogram_data.zip'
LOCAL_BASE_DIR = './project_data/'
SPECTROGRAM_FOLDER_NAME = 'spectrogram'
LOCAL_CSV_NAME = 'consolidated_genres.csv'
CLEANED_CSV_NAME = 'cleaned_final_metadata.csv'
SPECTROGRAM_FILE_EXTENSION = '.jpg'
BATCH_SIZE = 32
EPOCHS_HEAD = 5
EPOCHS_FULL = 100
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SPECTROGRAM_DIR = None
METADATA_FILE = None

class FocalLoss(nn.Module):
    def __init__(self, gamma=2, alpha=0.5):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
    def forward(self, inputs, targets):
        BCE_loss = nn.BCEWithLogitsLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss
        return F_loss.mean()

def upload_and_extract_data():
    global SPECTROGRAM_DIR, METADATA_FILE
    print(f"Upload your '{LOCAL_ZIP_NAME}':")
    uploaded = files.upload()
    if not uploaded:
        print("❌ No upload detected")
        return False
    zip_filename = list(uploaded.keys())[0]
    os.makedirs(LOCAL_BASE_DIR, exist_ok=True)
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(LOCAL_BASE_DIR)
    os.remove(zip_filename)
    spectro_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', SPECTROGRAM_FOLDER_NAME), recursive=True)
    csv_paths = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)
    if not spectro_paths or not csv_paths:
        print("❌ Required data not found!")
        return False
    SPECTROGRAM_DIR = spectro_paths[0]
    METADATA_FILE = os.path.join(os.path.dirname(csv_paths[0]), CLEANED_CSV_NAME)
    print("✅ Extraction successful.")
    return True

def cleanse_metadata_file():
    original_csv_path = glob.glob(os.path.join(LOCAL_BASE_DIR, '**', LOCAL_CSV_NAME), recursive=True)[0]
    df = pd.read_csv(original_csv_path)
    df['filename'] = df['filename'].str.replace(r'\.(png|jpeg|jpg)$', SPECTROGRAM_FILE_EXTENSION, regex=True)
    files_set = set(os.listdir(SPECTROGRAM_DIR))
    df = df[df['filename'].isin(files_set)].copy()
    label_cols = df.columns[1:]
    df['label_sum'] = df[label_cols].sum(axis=1)
    df = df[df['label_sum'] > 0].drop(columns=['label_sum'])
    df.to_csv(METADATA_FILE, index=False)
    print(f"✅ Data cleansed: {len(df)} samples, {len(label_cols)} labels")
    return label_cols

class SpectrogramDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform
        self.labels = self.df.columns[1:].tolist()
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.df.iloc[idx,0])
        img = Image.open(img_path).convert('RGB')
        label = self.df.iloc[idx,1:].values.astype(np.float32)
        if self.transform:
            img = self.transform(img)
        return img, torch.tensor(label)

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(300, scale=(0.4, 1.0), ratio=(0.75, 1.33)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(45),
    transforms.ColorJitter(0.4, 0.4, 0.4, 0.15),
    transforms.GaussianBlur(kernel_size=7, sigma=(0.1, 2.5)),
    transforms.RandomErasing(p=0.8, scale=(0.02, 0.2), ratio=(0.2, 2)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3,1,1) if x.size(0) == 1 else x),
])
val_transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225]),
    transforms.Lambda(lambda x: x.repeat(3,1,1) if x.size(0) == 1 else x),
])

def eval_metrics(val_targets, val_preds, label_names):
    from sklearn.metrics import f1_score, classification_report
    # Threshold sweep
    best_thresholds = []
    best_f1s = []
    for i in range(val_targets.shape[1]):
        best_f1, best_th = 0, 0.5
        for th in np.arange(0.1, 0.91, 0.01):
            f1 = f1_score(val_targets[:,i], (val_preds[:,i]>th).astype(int), zero_division=0)
            if f1 > best_f1:
                best_f1, best_th = f1, th
        best_thresholds.append(best_th)
        best_f1s.append(best_f1)
    print("Best F1/Threshold per class:")
    for n, f, t in zip(label_names, best_f1s, best_thresholds):
        print(f"{n:35} F1={f:.3f}  th={t:.2f}")
    # Apply per-class best thresholds
    final_preds = np.zeros_like(val_preds)
    for i, th in enumerate(best_thresholds):
        final_preds[:,i] = (val_preds[:,i] > th).astype(int)
    exact_match = np.all(final_preds == val_targets, axis=1).mean()
    weighted_f1 = f1_score(val_targets, final_preds, average='weighted', zero_division=0)
    micro_f1 = f1_score(val_targets, final_preds, average='micro', zero_division=0)
    print(f"\nExact-match accuracy: {exact_match:.4f} ({exact_match*100:.2f}%)")
    print(f"Weighted F1: {weighted_f1:.4f}")
    print(f"Micro F1: {micro_f1:.4f}\n")
    print("Classification report:\n")
    print(classification_report(val_targets, final_preds, target_names=label_names, zero_division=0))
    return exact_match, weighted_f1, micro_f1

def train():
    if not upload_and_extract_data():
        return
    labels = cleanse_metadata_file()
    dataset = SpectrogramDataset(METADATA_FILE, SPECTROGRAM_DIR, transform=None)
    train_len = int(len(dataset)*0.8)
    val_len = len(dataset) - train_len
    train_ds, val_ds = random_split(dataset, [train_len, val_len])
    train_ds.dataset.transform = train_transform
    val_ds.dataset.transform = val_transform
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    model = models.efficientnet_b3(pretrained=True)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, len(labels))
    model = model.to(DEVICE)

    criterion = FocalLoss(gamma=2, alpha=0.5)
    optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=2e-4)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=5e-4, steps_per_epoch=len(train_loader), epochs=EPOCHS_HEAD+EPOCHS_FULL)

    # Head training
    for param in model.features.parameters():
        param.requires_grad = False
    print("Training classifier head...")
    for epoch in range(EPOCHS_HEAD):
        model.train()
        total_loss = 0
        for imgs, targets in train_loader:
            imgs, targets = imgs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item() * imgs.size(0)
        print(f"Head Epoch {epoch+1} Loss: {total_loss/len(train_loader.dataset):.4f}")

    # Phase 2: Fine-tune
    for param in model.features.parameters():
        param.requires_grad = True
    best_f1 = 0
    print("Fine-tuning entire model...")
    for epoch in range(EPOCHS_FULL):
        model.train()
        total_loss = 0
        for imgs, targets in train_loader:
            imgs, targets = imgs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item() * imgs.size(0)
        print(f"Fine-tune Epoch {epoch+1} Loss: {total_loss/len(train_loader.dataset):.4f}")

        # Validation epoch
        model.eval()
        all_preds, all_targets = [], []
        with torch.no_grad():
            for imgs, targets in val_loader:
                imgs = imgs.to(DEVICE)
                outputs = model(imgs)
                preds = torch.sigmoid(outputs).cpu().numpy()
                all_preds.append(preds)
                all_targets.append(targets.numpy())
        all_preds = np.vstack(all_preds)
        all_targets = np.vstack(all_targets)
        pred_labels = (all_preds > 0.5).astype(int)
        weighted_f1 = f1_score(all_targets, pred_labels, average='weighted', zero_division=0)
        if weighted_f1 > best_f1:
            best_f1 = weighted_f1
            torch.save(model.state_dict(), 'best_focal_efficientnet_b3.pth')
            print(f"  (New best model saved, val Weighted F1: {weighted_f1:.4f})")

    # Final Evaluation with best thresholds
    print("Evaluating best model with sweeping thresholds:")
    model.load_state_dict(torch.load('best_focal_efficientnet_b3.pth'))
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs = imgs.to(DEVICE)
            outputs = model(imgs)
            preds = torch.sigmoid(outputs).cpu().numpy()
            all_preds.append(preds)
            all_targets.append(targets.numpy())
    all_preds = np.vstack(all_preds)
    all_targets = np.vstack(all_targets)
    eval_metrics(all_targets, all_preds, labels)

if __name__ == "__main__":
    train()

Upload your 'project_spectrogram_data.zip':


Saving project_spectrogram_data.zip to project_spectrogram_data.zip
✅ Extraction successful.
✅ Data cleansed: 980 samples, 24 labels




Downloading: "https://download.pytorch.org/models/efficientnet_b3_rwightman-b3899882.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b3_rwightman-b3899882.pth


100%|██████████| 47.2M/47.2M [00:00<00:00, 210MB/s]


Training classifier head...
Head Epoch 1 Loss: 0.0856
Head Epoch 2 Loss: 0.0758
Head Epoch 3 Loss: 0.0671
Head Epoch 4 Loss: 0.0588
Head Epoch 5 Loss: 0.0517
Fine-tuning entire model...
Fine-tune Epoch 1 Loss: 0.0420
  (New best model saved, val Weighted F1: 0.1731)
Fine-tune Epoch 2 Loss: 0.0343
  (New best model saved, val Weighted F1: 0.1862)
Fine-tune Epoch 3 Loss: 0.0302
  (New best model saved, val Weighted F1: 0.2825)
Fine-tune Epoch 4 Loss: 0.0271
  (New best model saved, val Weighted F1: 0.3761)
Fine-tune Epoch 5 Loss: 0.0241
  (New best model saved, val Weighted F1: 0.4147)
Fine-tune Epoch 6 Loss: 0.0212
  (New best model saved, val Weighted F1: 0.4811)
Fine-tune Epoch 7 Loss: 0.0185
  (New best model saved, val Weighted F1: 0.4851)
Fine-tune Epoch 8 Loss: 0.0164
Fine-tune Epoch 9 Loss: 0.0136
  (New best model saved, val Weighted F1: 0.5401)
Fine-tune Epoch 10 Loss: 0.0116
Fine-tune Epoch 11 Loss: 0.0097
Fine-tune Epoch 12 Loss: 0.0078
Fine-tune Epoch 13 Loss: 0.0070
  (New 

Focal Loss: Helps model learn rare/hard labels and focus on "hard" examples.

EfficientNet-B3: Highly expressive model, superior to VGG/AlexNet/ResNet18.

Aggressive augmentations/training schedule: Maximizes potential on your dataset size.

Threshold sweeping: Finds the best cut-off per label for true maximum F1/accuracy, not limited to fixed 0.5.

Early stopping/checkpointing: Only keeps the best weights.