# Project:
This notebook is being written by DPAG as part of the DFG-funded research project "Resolving the cognitive and neural basis of affective sound-meaning associations", under the supervision of Dr. Arash Aryani, postdoctoral researcher at FU Berlin.

Goal: We aim to fine-tune existing auditory DNNs in order to model and predict arousal & valence ratings from non-words.

## Models:
The model currently being tested is the XLSR-53 version of the wav2vec2 large model.

Huggingface: https://huggingface.co/facebook/wav2vec2-large-xlsr-53

Short description: This model is a transformer-based model that learned speech representations on unlabeled data.

Why it's fitting for the project:

+ It is pre-trained on speech units shorter than phonemes, which should make it better at recognizing non-words than other models.
+ There's literature on how the model layers effectively encode acoustic and phonetic information.

## Dataset:
Data utilized was gathered and consists of (data Arash sent me) - more TBA

### 1. Import libraries

In [None]:
import torch
import transformers
import torchaudio
import librosa
from torch import nn
import optuna
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from tqdm import tqdm
from datasets import load_dataset
import copy
import pickle

import os
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm  # Library for progress bars
from sklearn.model_selection import train_test_split # For dataloader split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy import stats

# Local functions from src
import sys
from pathlib import Path
# The project root is the parent of the current working directory; put it on
# the import path (once) so that the local `src` package can be imported.
root = os.path.abspath(os.pardir)

already_importable = root in sys.path
if not already_importable:
    sys.path.append(root)

from src.load_data import load_data
from src.task_utils import collate_fn, split_data
from src.train_test import train_with_validation, test_model, test_stats, model_comparison

In [None]:
# Optional, for colab: enable CUDA debugging and more detailed CUDA error
# reporting. These are exported before the first CUDA query below so that they
# are already in the environment when CUDA starts up.
# NOTE(review): TORCH_USE_CUDA_DSA may only have an effect on PyTorch builds
# compiled with device-side assertions - confirm for the runtime in use.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Set device = GPU // only needed for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("CUDA debugging enabled!")

# Only query GPU properties when a GPU is actually present; asking for the
# properties of device 0 on a CPU-only runtime raises instead of returning.
if device.type == "cuda":
    print(f"Using device: {torch.cuda.get_device_name()}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("Using device: CPU")
    print("CUDA memory: n/a (no CUDA device available)")

In [None]:
# The dataset is stored on Google Drive and accessed through Colab; change the
# directory below if your copy of the data lives somewhere else.

from google.colab import drive
drive.mount('/content/drive')

# Base directory containing the data files; passed to load_data() when the
# dataset is loaded.
base_dir = "/content/drive/MyDrive/Arash Projects"

In [None]:
# Load the dataset through the project's load_data helper, asking for audio
# at the sampling rate used for the model.

# Target sampling rate in Hz; also handed to the training / evaluation helpers.
model_sr = 16000

data = load_data(base_dir, target_sr=model_sr, batch_size=1000)

# Pull the three entries used by this notebook out of the returned dictionary.
waveforms, valences, arousals = (
    data[key] for key in ("waveforms", "valences", "arousals")
)

### Define dataset and model classes

In [None]:
class SingleLabelDataset(torch.utils.data.Dataset):
    """
    Dataset pairing each waveform with a single regression target
    (either valence or arousal).

    Args:
        waveforms: Sized, indexable collection of waveforms.
        targets: Sized, indexable collection of numeric targets, aligned
            with ``waveforms`` by position (single label per item).

    Raises:
        ValueError: If ``waveforms`` and ``targets`` differ in length.
    """
    def __init__(self, waveforms, targets):
        # Fail early: a length mismatch would otherwise surface only as an
        # IndexError mid-epoch, or silently leave some labels unused.
        if len(waveforms) != len(targets):
            raise ValueError(
                f"waveforms and targets must have the same length, "
                f"got {len(waveforms)} and {len(targets)}"
            )
        self.waveforms = waveforms
        self.targets = targets  # Single label (valence or arousal)

    def __len__(self):
        """Return the number of (waveform, target) pairs."""
        return len(self.waveforms)

    def __getitem__(self, idx):
        """Return ``(waveform, target)`` with the target as a float32 tensor."""
        target = self.targets[idx]
        if isinstance(target, torch.Tensor):
            # Copy tensors explicitly; torch.tensor(tensor) warns about
            # copy-construction. The clone keeps the stored target untouched.
            target = target.detach().clone().to(torch.float32)
        else:
            target = torch.tensor(target, dtype=torch.float32)
        return self.waveforms[idx], target

In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from datasets import load_dataset

# Identifier of the pretrained checkpoint, and the feature extractor that goes
# with it. Only the feature extractor is loaded here; the model weights are
# loaded later, when the regression model is built from the same model_name.
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

In [None]:
# Regressor head with flexible architecture

class OptimizedWav2Vec2Regression(nn.Module):
    """
    Wav2Vec2 backbone followed by a configurable MLP regression head.

    The backbone's frame-level hidden states are mean-pooled over time and
    passed through the head, which outputs one scalar per input clip.

    Args:
        base_model: Backbone module. It must expose ``config.hidden_size`` and
            return an object with a ``last_hidden_state`` attribute when called
            as ``base_model(input_values, attention_mask=...)``.
        hidden_size: Width of each hidden layer in the regression head.
        dropout_rate: Dropout probability applied after each hidden layer.
        num_layers: Total number of linear layers in the head. ``num_layers - 1``
            hidden blocks (Linear -> ReLU -> Dropout) precede the output layer;
            any value below 2 yields a single linear output layer.
    """
    def __init__(self, base_model, hidden_size, dropout_rate, num_layers=2):
        super().__init__()
        self.wav2vec2 = base_model

        # Build dynamic regressor based on trial parameters
        layers = []
        input_size = self.wav2vec2.config.hidden_size

        for _ in range(num_layers - 1):
            layers.extend([
                nn.Linear(input_size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            input_size = hidden_size

        # Final output layer
        layers.append(nn.Linear(input_size, 1))

        self.regressor = nn.Sequential(*layers)

    def _pool(self, hidden_states, attention_mask):
        """Mean-pool hidden states over time, skipping padded frames when possible."""
        to_frame_mask = getattr(
            self.wav2vec2, "_get_feature_vector_attention_mask", None
        )
        if attention_mask is None or to_frame_mask is None:
            # No padding information (or a backbone that cannot translate it
            # to frame resolution): plain mean over all frames.
            return hidden_states.mean(dim=1)

        # The attention mask is given per input sample, while the hidden states
        # are per frame; the backbone helper converts between the two.
        frame_mask = to_frame_mask(hidden_states.shape[1], attention_mask)
        weights = frame_mask.unsqueeze(-1).to(hidden_states.dtype)
        # Guard against division by zero for a fully masked row.
        valid_frames = weights.sum(dim=1).clamp(min=1.0)
        return (hidden_states * weights).sum(dim=1) / valid_frames

    def forward(self, input_values, attention_mask=None):
        """
        Predict one scalar per clip.

        Args:
            input_values: Batch of (padded) inputs for the backbone.
            attention_mask: Optional mask marking real (non-padded) input
                positions. When given, padded frames are excluded from the
                pooled average so that a clip's prediction does not depend on
                how much padding its batch required.

        Returns:
            Tensor with the singleton output dimension squeezed away, i.e. one
            prediction per batch element.
        """
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)
        pooled = self._pool(outputs.last_hidden_state, attention_mask)
        return self.regressor(pooled).squeeze(1)

### Training

In [None]:
# Hyperparameters for the arousal model.
# These are the best values found for arousal in the earlier optimization
# run (V2, trial 5).
aro_params = dict(
    learning_rate=1.0351315184578603e-05,
    batch_size=4,
    hidden_size=128,
    dropout_rate=0.27555511385430487,
    num_layers=2,
    weight_decay=1.425475704402744e-05,
    optimizer="Adam",
    scheduler="step",
    criterion="SmoothL1Loss",
    normalize_targets=True,
    grad_clip=1.5393422419156506,
)

# Data loaders for the arousal target (indexed below by "train" / "val" / "test")
arousal_loaders = split_data(
    waveforms,
    arousals,
    dataset_class=SingleLabelDataset,
    collate_fn=collate_fn,
    batch_size=aro_params["batch_size"],
    target_name="Arousal",
)

# Pretrained backbone wrapped with the regression head, moved to the device
optimized_arousal_model = OptimizedWav2Vec2Regression(
    base_model=Wav2Vec2Model.from_pretrained(model_name),
    **{key: aro_params[key] for key in ("hidden_size", "dropout_rate", "num_layers")},
).to(device)

# Loss function
criterion = torch.nn.SmoothL1Loss()

# Optimizer (Adam takes weight_decay directly in PyTorch)
optimizer = torch.optim.Adam(
    optimized_arousal_model.parameters(),
    weight_decay=aro_params["weight_decay"],
    lr=aro_params["learning_rate"],
)

# Step scheduler: multiply the learning rate by 0.7 every 3 scheduler steps
# (values from the same best arousal configuration)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

In [None]:
# Fit the arousal model; the helper returns the training and validation
# loss histories.
# NOTE(review): the exact meaning of patience / min_delta / variance_reg_coeff /
# freeze_backbone_epochs is defined in src.train_test - confirm there.

aro_train_losses, aro_val_losses = train_with_validation(
    # What to train and on which data
    model=optimized_arousal_model,
    train_dataloader=arousal_loaders["train"],
    val_dataloader=arousal_loaders["val"],
    feature_extractor=feature_extractor,
    sampling_rate=model_sr,
    device=device,
    # Optimization setup
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    grad_clip=aro_params["grad_clip"],
    normalize_targets=aro_params["normalize_targets"],
    variance_reg_coeff=0.1,
    freeze_backbone_epochs=2,
    # Run length and stopping
    num_epochs=40,
    patience=3,
    min_delta=1e-4,
    # No hyperparameter-search trial is attached to this run
    trial=None,
    verbose=True,
)


### Testing

In [None]:
# Evaluate the trained arousal model on the held-out test loader.

# Fresh loss instance for evaluation
criterion = torch.nn.SmoothL1Loss()

(
    aro_test_loss,
    aro_r2,
    aro_mse,
    aro_mae,
    aro_preds,
    aro_targets,
) = test_stats(
    model=optimized_arousal_model,
    test_dataloader=arousal_loaders["test"],
    feature_extractor=feature_extractor,
    sampling_rate=model_sr,
    criterion=criterion,
    device=device,
)

In [None]:
# Compare test-set targets with the model's predictions, passing along the
# metrics computed during evaluation.
model_comparison(
    aro_targets,
    aro_preds,
    r2=aro_r2,
    mse=aro_mse,
    mae=aro_mae,
)

### Save model

In [None]:
# # Save final model - Optional
# aro_full_model = "/content/drive/MyDrive/Arash Projects/aro_full_model.pth"
# torch.save(optimized_arousal_model.state_dict(), aro_full_model)
# print(f"Retrained best arousal model saved at {aro_full_model}")