In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # Use only the first GPU
os.environ['NCCL_DEBUG']      = 'INFO'
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_SHM_DISABLE'] = '1'

import torch
import torch.distributed as dist



In [19]:
import numpy as np
import torchcodec
from datasets import load_dataset, Audio

# Preprocess

In [3]:
ravdess = load_dataset('amnesiackid/ravdess-emotion-intensity', name='default', split="train")

In [4]:
def decode_to_array(example):
    decoder = example["audio"]
    samples = decoder.get_all_samples()
    # Average the stereo channels to create a mono (1D) array
    waveform_mono = samples.data.numpy().mean(axis=0)
    return {
        "waveform": waveform_mono,          # NumPy array
        "sampling_rate": samples.sample_rate       # int
    }

ravdess = ravdess.map(
    decode_to_array,
    remove_columns=["audio"],                      # drop the original AudioDecoder
)

In [5]:
label2id = {"neutral": 0,
    "calm": 1,
    "happy": 2,
    "sad": 3,
    "angry": 4,
    "fearful": 5,
    "disgust": 6,
    "surprised": 7}
id2label = {v: k for k, v in label2id.items()}
def convert_labels(examples):
    # Convert string labels to numeric IDs
    n_emotion = [label2id[label] for label in examples["emotion_labels"]]
    return {"emotion_labels": n_emotion}
# First convert labels to numeric IDs
ravdess = ravdess.map(convert_labels, batched=True)

In [6]:
# ravdess = ravdess.train_test_split(test_size=0.2)

inspect data structure

In [7]:

ravdess[0]

{'emotion_labels': 5,
 'intensity': 'normal',
 'waveform': [-3.081597969867289e-05,
  -5.0026585086015984e-05,
  -4.7578698286088184e-05,
  -6.788996688555926e-05,
  -7.087022822815925e-05,
  -7.911308784969151e-05,
  -7.171534525696188e-05,
  -6.173657311592251e-05,
  -5.617045098915696e-05,
  -6.138924800325185e-05,
  -5.667846562573686e-05,
  -3.4311880881432444e-05,
  -5.049442552262917e-05,
  -5.1147089834557846e-05,
  -4.055574390804395e-05,
  -4.7103516408242285e-05,
  -5.125629832036793e-05,
  -6.474318070104346e-05,
  -5.636497007799335e-05,
  -5.0436105084372684e-05,
  -5.12894730491098e-05,
  -4.536351843853481e-05,
  -2.6783382054418325e-05,
  -4.044246452394873e-05,
  -4.3105825170641765e-05,
  -4.139556403970346e-05,
  -2.8001590180792846e-05,
  -3.693495818879455e-05,
  -4.306523624109104e-05,
  -4.0051250834949315e-05,
  -6.237836350919679e-05,
  -1.9999744836241007e-05,
  -4.3458028812892735e-05,
  -5.8324734709458426e-05,
  -4.049299604957923e-05,
  -3.762193227885291

In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)
wav2vec2.eval()  # eval mode



Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [9]:
import os
os.makedirs("checkpoints", exist_ok=True)


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [None]:
for i, item in enumerate(ravdess):
    waveform = torch.tensor(item['waveform'], dtype=torch.float32)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        feat = wav2vec2(inputs.input_values.to(device)).last_hidden_state.squeeze(0).cpu()
    torch.save(feat, f"features/{i}.pt")


RuntimeError: Parent directory features does not exist.

# Model training

In [None]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import Wav2Vec2Model, Wav2Vec2Processor
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import LambdaLR




# 1. load wav2vec2
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)
wav2vec2.eval()  

# 2. custom dataset
class EmotionDataset(Dataset):
    def __init__(self, data, processor, wav2vec2):
        self.data = data
        self.processor = processor
        self.wav2vec2 = wav2vec2

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        waveform = torch.tensor(item['waveform'], dtype=torch.float32)
        inputs = self.processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
        with torch.no_grad():
            feats = self.wav2vec2(inputs.input_values.to(device)).last_hidden_state.squeeze(0)
        label = torch.tensor(item['emotion_labels'], dtype=torch.long)
        return feats.cpu(), label

def collate_fn(batch):
    feats = [x[0] for x in batch]
    labels = torch.stack([x[1] for x in batch])
    feats_padded = pad_sequence(feats, batch_first=True)  # (B, T_max, D)
    return feats_padded, labels

# 3. split data
dataset = EmotionDataset(ravdess, processor, wav2vec2)
n_total = len(dataset)
n_val = int(n_total * 0.2)      
n_train = n_total - n_val
train_ds, val_ds = random_split(dataset, [n_train, n_val])

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=8, shuffle=False, collate_fn=collate_fn)
steps_per_epoch = len(train_loader)
# 4. define CNN classifier
class CNNClassifier(nn.Module):
    def __init__(self, feat_dim, n_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(feat_dim, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Dropout(0.1),
            nn.Conv1d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(256, n_classes)
        )

    def forward(self, x):
        x = x.transpose(1, 2)  # (B, T, D) → (B, D, T)
        return self.net(x)

n_classes = len({item['emotion_labels'] for item in ravdess})
model = CNNClassifier(feat_dim=wav2vec2.config.hidden_size, n_classes=n_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss()
global_step = 0
epochs = 32
total_steps = epochs * steps_per_epoch
warmup_steps = int(0.05 * total_steps)

def lr_lambda(current_step):
    if current_step < warmup_steps:
        # linear warmup
        return current_step / float(max(1, warmup_steps))
    # cosine decay
    progress = (current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
    return 0.5 * (1.0 + math.cos(math.pi * progress))

scheduler = LambdaLR(optimizer, lr_lambda)

# 5. train and valudate

for epoch in range(epochs):
    # ——— train ———
    model.train()
    train_loss = 0.0
    train_correct = 0
    for feats, labels in train_loader:
        feats, labels = feats.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(feats)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        global_step += 1
        train_loss += loss.item() * feats.size(0)
        preds = logits.argmax(dim=1)
        train_correct += (preds == labels).sum().item()
    train_loss /= n_train
    train_acc = train_correct / n_train

    # ——— validate ———
    model.eval()
    val_loss = 0.0
    val_correct = 0
    with torch.no_grad():
        for feats, labels in val_loader:
            feats, labels = feats.to(device), labels.to(device)
            logits = model(feats)
            loss = criterion(logits, labels)
            val_loss += loss.item() * feats.size(0)
            preds = logits.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
    val_loss /= n_val
    val_acc = val_correct / n_val

    print(
        f"Epoch {epoch}/{epochs} — "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"
    )


Epoch 0/32 — Train Loss: 1.9802, Train Acc: 0.2153 | Val Loss: 1.9244, Val Acc: 0.2465
Epoch 1/32 — Train Loss: 1.7963, Train Acc: 0.2891 | Val Loss: 1.7586, Val Acc: 0.2986
Epoch 2/32 — Train Loss: 1.5999, Train Acc: 0.3863 | Val Loss: 1.6855, Val Acc: 0.3403
Epoch 3/32 — Train Loss: 1.4355, Train Acc: 0.4497 | Val Loss: 1.5952, Val Acc: 0.3264
Epoch 4/32 — Train Loss: 1.3297, Train Acc: 0.4818 | Val Loss: 1.2691, Val Acc: 0.4931
Epoch 5/32 — Train Loss: 1.2034, Train Acc: 0.5382 | Val Loss: 1.2965, Val Acc: 0.4583
Epoch 6/32 — Train Loss: 1.1256, Train Acc: 0.5790 | Val Loss: 1.2778, Val Acc: 0.5069
Epoch 7/32 — Train Loss: 1.0260, Train Acc: 0.6241 | Val Loss: 1.2204, Val Acc: 0.5104
Epoch 8/32 — Train Loss: 0.9663, Train Acc: 0.6354 | Val Loss: 1.2355, Val Acc: 0.5312
Epoch 9/32 — Train Loss: 0.8852, Train Acc: 0.6606 | Val Loss: 1.1608, Val Acc: 0.5660
Epoch 10/32 — Train Loss: 0.8193, Train Acc: 0.6936 | Val Loss: 1.1818, Val Acc: 0.5451
Epoch 11/32 — Train Loss: 0.7709, Train Ac

In [None]:
# 6. save model
os.makedirs("checkpoints", exist_ok=True)
torch.save(model.state_dict(), "checkpoints/cnn_classifier.pth")   