In [6]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoModelForAudioClassification, AutoProcessor, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
import os
import sys
import ast # For parsing string representations of lists/arrays
import logging
import time
from sklearn.metrics import hamming_loss, jaccard_score, f1_score, roc_auc_score
from tqdm.notebook import tqdm # Use notebook version of tqdm

# --- Project Setup ---
# The project root is taken to be two directory levels above the kernel's
# current working directory; nothing here detects how the code was launched.

cwd = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir)) # NOTE: update this if the directory structure changes

print(f"PROJECT_ROOT detected as: {PROJECT_ROOT}")

# Project-level modules such as `config` live under the root, so the root has
# to be on the module search path before the import below runs.
if sys.path.count(PROJECT_ROOT) == 0:
    print(f"Adding {PROJECT_ROOT} to sys.path")
    sys.path.append(PROJECT_ROOT)

import config # Project configuration module (paths, model and preprocessing parameters)

# --- Setup Logging ---
# Detach any handlers left on the root logger (e.g. from an earlier run of this
# cell) so that the configuration below takes effect and lines are not duplicated.
for handler in tuple(logging.root.handlers):
    logging.root.removeHandler(handler)

# Send INFO-and-above records to stdout so they appear in the cell output.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

print("Imports and basic setup complete.")

/home/zhuoyuan/CSprojects/musicClaGen
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
PROJECT_ROOT detected as: /home/zhuoyuan/CSprojects/musicClaGen
Imports and basic setup complete.


In [7]:
# --- Load Config ---
# Filesystem locations, all read from the project configuration module.
manifest_path = config.PATHS['MANIFEST_PATH']
genre_list_path = config.PATHS['GENRE_LIST_PATH']
model_save_dir = config.PATHS['MODELS_DIR']
features_dir = config.PATHS['FMA_FEATURES_DIR'] # Needed if dataset uses it relative

# Model / optimisation hyper-parameters.
model_checkpoint = config.MODEL_PARAMS['model_checkpoint']
learning_rate = config.MODEL_PARAMS['learning_rate']
batch_size = config.MODEL_PARAMS['batch_size'] # Use the small BS for notebook test
num_epochs = 1 # <<<--- RUN ONLY 1 EPOCH FOR DEBUGGING ---<<<
weight_decay = config.MODEL_PARAMS['weight_decay']
gradient_accumulation_steps = config.MODEL_PARAMS['gradient_accumulation_steps']

# --- Load unified genre list to get num_labels ---
# One genre per line; blank lines are ignored. The number of genres defines the
# size of the multi-label output, so an unreadable or empty list is fatal.
try:
    # Explicit encoding keeps the result independent of the platform locale.
    with open(genre_list_path, 'r', encoding='utf-8') as f:
        unified_genres = [line.strip() for line in f if line.strip()]
    num_labels = len(unified_genres)
    if num_labels == 0:
        # An empty file would otherwise silently configure a model with zero outputs.
        raise ValueError(f"Genre list at {genre_list_path} contains no genres.")
    logging.info(f"Loaded {num_labels} unified genres from {genre_list_path}")
    # print("Unified Genres:", unified_genres) # Uncomment to verify list
except Exception as e:
    logging.error(f"Failed to load or process unified genre list: {e}", exc_info=True)
    # Chain the original error so the root cause stays visible in the traceback.
    raise SystemExit("Cannot proceed without genre list.") from e

# --- Setup Device ---
# Use the configured device only when CUDA is available; otherwise fall back to CPU.
device = torch.device(config.DEVICE if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")
if not torch.cuda.is_available() and config.DEVICE == "cuda":
    logging.warning("CUDA selected but not available, falling back to CPU.")

# --- Create Save Directory ---
os.makedirs(model_save_dir, exist_ok=True)

2025-05-02 12:19:38,392 - INFO - Loaded 22 unified genres from /home/zhuoyuan/CSprojects/musicClaGen/data/processed/unified_genres.txt
2025-05-02 12:19:38,395 - INFO - Using device: cuda


In [8]:
# --- Dataset Class ---
# (Copied from previous response - Ensure this class is defined here or imported)
# Make sure it uses config.PROJECT_ROOT correctly for paths if needed

class FMAFeatureDataset(Dataset):
    """Map-style dataset over pre-computed feature arrays listed in a CSV manifest.

    The manifest is read with pandas and must provide the columns ``track_id``,
    ``feature_path`` (path of a ``.npy`` file, joined onto ``config.PROJECT_ROOT``)
    and ``label_vector`` (a string holding a Python list literal such as
    ``'[0.0, 1.0]'``, parsed with ``ast.literal_eval``).

    Args:
        manifest_path: Path of the manifest CSV.
        processor: Optional callable. When given, it is invoked for every item as
            ``processor(array, sampling_rate=..., return_tensors="pt")`` and must
            return a mapping containing ``'input_features'`` or ``'input_values'``
            and, optionally, ``'attention_mask'``.

    Raises:
        Exception: Any error raised while reading or parsing the manifest is
            logged and re-raised unchanged.
    """
    def __init__(self, manifest_path, processor=None):
        logging.info(f"Loading manifest from: {manifest_path}")
        try:
            self.manifest = pd.read_csv(manifest_path)
            # The label vectors are stored as text in the CSV; turn them back into lists.
            # ast.literal_eval handles plain list strings like '[0.0, 1.0]'. A manifest
            # written as '[np.float32(0.0), ...]' would need a custom parser instead.
            label_parser = ast.literal_eval
            self.manifest['label_vector'] = self.manifest['label_vector'].apply(label_parser)
            logging.info(f"Loaded manifest with {len(self.manifest)} entries.")
        except Exception as e:
            logging.error(f"Error loading or parsing manifest {manifest_path}: {e}", exc_info=True)
            raise
        self.processor = processor

    def __len__(self):
        """Return the number of manifest rows."""
        return len(self.manifest)

    def __getitem__(self, idx):
        """Load one item by positional row index.

        Returns:
            dict with ``"input_values"`` (feature tensor), ``"labels"`` (float32
            tensor built from the row's label vector) and, only when the processor
            produced one, ``"attention_mask"``.

        Raises:
            KeyError: If the processor output has neither ``'input_features'``
                nor ``'input_values'``.
            Exception: Any other load/processing error is logged and re-raised.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self.manifest.iloc[idx]
        track_id = row['track_id']
        # Manifest paths are relative to the project root.
        feature_file_path = os.path.join(config.PROJECT_ROOT, row['feature_path'])

        try:
            spectrogram = np.load(feature_file_path).astype(np.float32) # Load .npy

            # Compare against None explicitly so the decision never depends on
            # the truthiness of the processor object.
            if self.processor is not None:
                processed = self.processor(
                    spectrogram,
                    sampling_rate=config.PREPROCESSING_PARAMS["sample_rate"],
                    return_tensors="pt",
                )
                # Processors differ in the name of their main output; accept either.
                feature_tensor = processed.get('input_features')
                if feature_tensor is None:
                    feature_tensor = processed.get('input_values')
                if feature_tensor is None:
                    # Fail with a descriptive error instead of an AttributeError on None.
                    raise KeyError(
                        "Processor output has neither 'input_features' nor 'input_values'; "
                        f"available keys: {list(processed.keys())}"
                    )
                # Drop the leading dimension of size 1 that the processor adds.
                feature_tensor = feature_tensor.squeeze(0)
                attention_mask = processed.get('attention_mask', None)
                if attention_mask is not None:
                    attention_mask = attention_mask.squeeze(0)
            else:
                # No processor: plain array-to-tensor conversion, no mask.
                feature_tensor = torch.tensor(spectrogram)
                attention_mask = None

            label_tensor = torch.tensor(row['label_vector'], dtype=torch.float32)

            # NOTE(review): features are always returned under "input_values", even
            # when the processor produced "input_features" - confirm this matches the
            # keyword the model's forward() expects.
            inputs = {"input_values": feature_tensor, "labels": label_tensor}
            if attention_mask is not None:
                inputs['attention_mask'] = attention_mask

            return inputs

        except Exception as e:
            logging.error(f"Error loading/processing track {track_id} at {feature_file_path}: {e}", exc_info=True)
            raise

In [10]:
# Sanity check: show which checkpoint name was read from config.MODEL_PARAMS.
print(model_checkpoint)

facebook/w2v-bert-2.0


In [11]:
# --- Load Processor (if needed) ---
# Best effort: if no processor can be loaded for the checkpoint, the dataset
# falls back to returning the stored arrays as plain tensors.
try:
    processor = AutoProcessor.from_pretrained(model_checkpoint)
    logging.info(f"Loaded processor: {model_checkpoint}")
except Exception as e:
    logging.warning(f"Could not load processor. Proceeding without. Error: {e}")
    processor = None

# --- Create Full Dataset ---
try:
    full_dataset = FMAFeatureDataset(manifest_path, processor)
    manifest_df = full_dataset.manifest
except Exception as e:
    logging.error("Failed to instantiate FMAFeatureDataset.", exc_info=True)
    # Chain the cause so the original error is not lost.
    raise SystemExit("Cannot proceed without the dataset.") from e

# --- Create SMALLER DEBUG Datasets ---
# Number of samples taken from each split for the quick debug run.
DEBUG_TRAIN_SAMPLES = 16
DEBUG_VAL_SAMPLES = 8
DEBUG_TEST_SAMPLES = 8


def _first_split_positions(frame, split_name, limit):
    """Return the positions of the first `limit` rows of `frame` whose 'split' equals `split_name`.

    Positions (not index labels) are returned because Subset forwards them to
    FMAFeatureDataset.__getitem__, which looks rows up with `.iloc`.
    """
    in_split = (frame['split'] == split_name).to_numpy()
    return np.flatnonzero(in_split)[:limit].tolist()


logging.info("Creating DEBUG DataLoaders with small subsets...")
try:
    # Get small number of samples from each split
    train_indices = _first_split_positions(manifest_df, 'training', DEBUG_TRAIN_SAMPLES)
    val_indices = _first_split_positions(manifest_df, 'validation', DEBUG_VAL_SAMPLES)
    test_indices = _first_split_positions(manifest_df, 'test', DEBUG_TEST_SAMPLES)

    # An empty split would otherwise only surface later as a confusing failure
    # (or, for validation, as a silently empty loader).
    if not train_indices or not val_indices:
        raise ValueError(
            f"Empty debug split: training={len(train_indices)}, validation={len(val_indices)}"
        )

    debug_train_dataset = Subset(full_dataset, train_indices)
    debug_val_dataset = Subset(full_dataset, val_indices)
    # debug_test_dataset = Subset(full_dataset, test_indices) # Can create if needed

    # Use configured batch size, even if small
    debug_train_dataloader = DataLoader(debug_train_dataset, batch_size=batch_size, shuffle=True)
    debug_val_dataloader = DataLoader(debug_val_dataset, batch_size=batch_size, shuffle=False)
    logging.info(f"DEBUG Dataset sizes: Train={len(debug_train_dataset)}, Val={len(debug_val_dataset)}")
    logging.info("DEBUG DataLoaders created.")
except Exception as e:
    logging.error(f"Failed to create DEBUG datasets/dataloaders: {e}", exc_info=True)
    raise SystemExit("Cannot proceed without the debug DataLoaders.") from e

2025-05-02 12:21:17,469 - INFO - Loading manifest from: /home/zhuoyuan/CSprojects/musicClaGen/data/processed/final_feature_manifest.csv




2025-05-02 12:21:17,612 - INFO - Loaded manifest with 7994 entries.
2025-05-02 12:21:17,613 - INFO - Creating DEBUG DataLoaders with small subsets...
2025-05-02 12:21:17,618 - INFO - DEBUG Dataset sizes: Train=16, Val=8
2025-05-02 12:21:17,618 - INFO - DEBUG DataLoaders created.


# **!!!! OH NO !!!! Although the documentation says that the input is a mel spectrogram, the model actually takes raw audio as input. So we would need to convert the mel spectrograms back to raw audio!!!! Need to re-clean the data...**

# **Let's pass on the Wav2Vec2-BERT model because of the 24-hour time constraint. Switch to AST (Audio Spectrogram Transformer)! Come back to this when we have time.**