In [1]:
import pandas as pd
from glob import glob
import os


data_dir = "AudioWAV"


all_audio_files = glob(os.path.join(data_dir, "*.wav"))

# Define emotions based on filename (CREMA-D format)
emotion_dict = {
    "SAD": 0, "ANG": 1, "DIS": 2, "FEA": 3, "HAP": 4, "NEU": 5
}

# Extract labels from filenames (e.g., "1001_IEO_SAD_HI.wav")
data = []
for file in all_audio_files:
    filename = os.path.basename(file)
    parts = filename.split("_")
    if len(parts) > 2:
        emotion_label = emotion_dict.get(parts[2])  # Extracting emotion
        if emotion_label is not None:
            data.append({"path": file, "emotion": emotion_label})

# Convert to a DataFrame
df = pd.DataFrame(data)


In [2]:
df

Unnamed: 0,path,emotion
0,AudioWAV\1001_DFA_ANG_XX.wav,1
1,AudioWAV\1001_DFA_DIS_XX.wav,2
2,AudioWAV\1001_DFA_FEA_XX.wav,3
3,AudioWAV\1001_DFA_HAP_XX.wav,4
4,AudioWAV\1001_DFA_NEU_XX.wav,5
...,...,...
7437,AudioWAV\1091_WSI_DIS_XX.wav,2
7438,AudioWAV\1091_WSI_FEA_XX.wav,3
7439,AudioWAV\1091_WSI_HAP_XX.wav,4
7440,AudioWAV\1091_WSI_NEU_XX.wav,5


In [5]:
!pip install transformers datasets librosa soundfile


Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-3.5.0-py3-none-any.whl (491 kB)
Installing collected packages: datasets
Successfully installed datasets-3.5.0



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading transformers-4.50.3-py3-none-any.whl (10.2 MB)
   ---------------------------------------- 0.0/10.2 MB ? eta -:--:--
    --------------------------------------- 0.2/10.2 MB 5.3 MB/s eta 0:00:02
   -- ------------------------------------- 0.5/10.2 MB 8.0 MB/s eta 0:00:02
   --- ------------------------------------ 0.8/10.2 MB 8.5 MB/s eta 0:00:02
   --- ------------------------------------ 0.9/10.2 MB 5.8 MB/s eta 0:00:02
   ------ --------------------------


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [38]:
import librosa

def load_audio(examples):
    audio, sr = librosa.load(examples, sr=16000)  
    return audio




In [39]:
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/wav2vec2-base"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    label2id=emotion_dict,
    id2label={v: k for k, v in emotion_dict.items()},
).to(device)  # Move model to GPU


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
def preprocess_function(examples):
    audio = load_audio(examples["path"])
    inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt", padding=True)

    return {
        "input_values": torch.tensor(inputs["input_values"].squeeze(0)),  
        "labels": torch.tensor(examples["emotion"], dtype=torch.int32) 
    }



# Apply preprocessing
dataset = dataset.map(preprocess_function,remove_columns=["path"])

Map:   0%|          | 0/7442 [00:00<?, ? examples/s]

  "input_values": torch.tensor(inputs["input_values"].squeeze(0)),


In [41]:
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test["train"]
test_dataset = train_test["test"]


In [17]:
!pip install transformers[torch]

Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
   ---------------------------------------- 0.0/354.7 kB ? eta -:--:--
   ------------------------- -------------- 225.3/354.7 kB 6.9 MB/s eta 0:00:01
   ---------------------------------------- 354.7/354.7 kB 5.6 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
pip install --upgrade accelerate


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import torch
import transformers

print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Using device:", "GPU" if torch.cuda.is_available() else "CPU")


Torch version: 2.6.0+cu126
Transformers version: 4.50.3
Using device: GPU


In [42]:
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score


def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)  
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}


training_args = TrainingArguments(
    output_dir="./wav2vec2-emotion",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=True,  
)




In [43]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [44]:
trainer.train()

RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'