# Config

In [15]:
# Paths and hyperparameters
UNPROCESSED_DIR = "/kaggle/input/cadenza/cadenza_clip1_data.train.v1.0/cadenza_data/train/unprocessed"
PROCESSED_DIR = "/kaggle/input/cadenza/cadenza_clip1_data.train.v1.0/cadenza_data/train/signals"
METADATA_FILE = "/kaggle/input/cadenza/cadenza_clip1_data.train.v1.0/cadenza_data/metadata/train_metadata.json"
OUTPUT_DIR = "/kaggle/working/output"

WAV2VEC_MODEL = "patrickvonplaten/tiny-wav2vec2-no-tokenizer"

MAX_AUDIO_LEN = 16000 * 10   # 15 sec max
BATCH_SIZE = 32
LR = 1e-4
EPOCHS = 5
VAL_SPLIT = 0.2

DEVICE = "cuda"


# Dataset

In [16]:
import json
import torch
import torchaudio
from torch.utils.data import Dataset

class CadenzaDataset(Dataset):
    def __init__(self, meta_file, clean_dir, processed_dir, max_len):
        with open(meta_file, "r") as f:
            self.records = json.load(f)
        self.clean_dir = clean_dir
        self.processed_dir = processed_dir
        self.max_len = max_len

    def _load_audio(self, path):
        wav, sr = torchaudio.load(path)
        wav = wav.mean(dim=0)  # mono
        # pad or crop
        if wav.shape[0] > self.max_len:
            wav = wav[:self.max_len]
        else:
            wav = torch.nn.functional.pad(wav, (0, self.max_len - wav.shape[0]))
        return wav

    def __getitem__(self, idx):
        r = self.records[idx]
        signal = r["signal"]
        correctness = torch.tensor(r["correctness"], dtype=torch.float32)

        # unprocessed file has _unproc.flac
        unproc_path = os.path.join(self.clean_dir, f"{signal}_unproc.flac")
        clean = self._load_audio(unproc_path)

        # processed file has just signal.flac
        proc_path = os.path.join(self.processed_dir, f"{signal}.flac")
        processed = self._load_audio(proc_path)

        return {
            "signal": signal,
            "clean": clean,
            "processed": processed,
            "correctness": correctness,
        }


    def __len__(self):
        return len(self.records)


# Model

In [17]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model

class SiameseWav2Vec2(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.encoder = Wav2Vec2Model.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size
        self.reg_head = nn.Sequential(
            nn.Linear(hidden, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()  # output 0–1 intelligibility
        )

    def encode(self, wav):
        out = self.encoder(wav).last_hidden_state
        pooled = out.mean(dim=1)
        return pooled

    def forward(self, clean, processed):
        c = self.encode(clean)
        p = self.encode(processed)
        diff = torch.abs(c - p)
        pred = self.reg_head(diff)
        return pred.squeeze(1)


# Training

In [18]:
import os
import random
from torch.utils.data import DataLoader, random_split, Subset
from tqdm import tqdm
import torch

# Split dataset into 80% train / 20% eval
dataset = CadenzaDataset(METADATA_FILE, UNPROCESSED_DIR, PROCESSED_DIR, MAX_AUDIO_LEN)
total_samples = len(dataset)
indices = list(range(total_samples))
random.shuffle(indices)

train_cutoff = int(0.8 * total_samples)
train_indices = indices[:train_cutoff]
eval_indices  = indices[train_cutoff:]

train_ds = Subset(dataset, train_indices)
eval_ds  = Subset(dataset, eval_indices)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
eval_loader  = DataLoader(eval_ds, batch_size=BATCH_SIZE, shuffle=False)

# Model
model = SiameseWav2Vec2(WAV2VEC_MODEL).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = torch.nn.MSELoss()

# Training loop
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    model.train()
    for batch in tqdm(train_loader, desc="Training"):
        clean = batch["clean"].to(DEVICE)
        processed = batch["processed"].to(DEVICE)
        y = batch["correctness"].to(DEVICE)

        optimizer.zero_grad()
        preds = model(clean, processed)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

# Save trained model
os.makedirs(OUTPUT_DIR, exist_ok=True)
torch.save(model.state_dict(), f"{OUTPUT_DIR}/siamese_wav2vec2.pt")
print("Model saved.")



Epoch 1/5


Training: 100%|██████████| 221/221 [10:07<00:00,  2.75s/it]



Epoch 2/5


Training: 100%|██████████| 221/221 [08:44<00:00,  2.37s/it]



Epoch 3/5


Training: 100%|██████████| 221/221 [08:41<00:00,  2.36s/it]



Epoch 4/5


Training: 100%|██████████| 221/221 [08:43<00:00,  2.37s/it]



Epoch 5/5


Training: 100%|██████████| 221/221 [08:46<00:00,  2.38s/it]

Model saved.





# Predictions

In [19]:
import csv

# Load trained model
model = SiameseWav2Vec2(WAV2VEC_MODEL).to(DEVICE)
model.load_state_dict(torch.load(f"{OUTPUT_DIR}/siamese_wav2vec2.pt"))
model.eval()

pred_file = f"{OUTPUT_DIR}/predictions.csv"
with open(pred_file, "w", newline="") as f:
    writer = csv.writer(f)
    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="Predicting"):
            clean = batch["clean"].to(DEVICE)
            processed = batch["processed"].to(DEVICE)
            preds = model(clean, processed).cpu().numpy()
            for signal, p in zip(batch["signal"], preds):
                writer.writerow([signal, float(p)])

print("Predictions saved:", pred_file)


Predicting: 100%|██████████| 56/56 [01:03<00:00,  1.14s/it]

Predictions saved: /kaggle/working/output/predictions.csv





# Evaluation

In [20]:
import json
import pandas as pd
from scipy.stats import kendalltau, pearsonr
import numpy as np

def rmse(a, b):
    return np.sqrt(np.mean((a - b) ** 2))

def stderr(a, b):
    return np.std(a - b) / np.sqrt(len(a))

def ncc(a, b):
    return pearsonr(a, b)[0]

def kt(a, b):
    return kendalltau(a, b)[0]

# Load metadata
with open(METADATA_FILE, "r") as f:
    rec = json.load(f)
gt = {r["signal"]: r["correctness"] for r in rec}

# Load predictions
df = pd.read_csv(f"{OUTPUT_DIR}/predictions.csv", names=["signal","pred"])
y_true = np.array([gt[s] for s in df.signal])
y_pred = df.pred.values

scores = {
    "RMSE": rmse(y_pred, y_true),
    "Std": stderr(y_pred, y_true),
    "NCC": ncc(y_pred, y_true),
    "KT": kt(y_pred, y_true),
}

print(json.dumps(scores, indent=2))

with open(f"{OUTPUT_DIR}/evaluation.json", "w") as f:
    json.dump(scores, f, indent=2)

print("Evaluation saved.")


{
  "RMSE": 0.3369441922418843,
  "Std": 0.008028022958211305,
  "NCC": 0.32272109534203897,
  "KT": 0.23866481732366823
}
Evaluation saved.
