In [1]:
!pip install datasets==1.14
!pip install transformers==4.11.3



In [2]:
import pathlib 
import subprocess
import IPython
import pandas as pd
import librosa

In [3]:
# Root directory of the AFEW data (a Google Drive path under /content/drive).
base_dir = pathlib.Path('/content/drive/MyDrive/sum_paper/AFEW')

In [4]:
# Per-split audio directories; relative clip paths are resolved against these.
train_path = base_dir.joinpath('Train_AFEW_audio')
val_path = base_dir.joinpath('Val_AFEW_audio')

# Audio extraction 

In [5]:
# The emotion classes; a class id is the position of its name in this list.
emotions_list = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
# Lookup tables in both directions, derived from the list above.
class_to_idx = {name: idx for idx, name in enumerate(emotions_list)}
idx_to_class = dict(enumerate(emotions_list))

In [6]:
# IPython.display.Audio('/content/test.wav')

In [7]:
# for emotion_name in emotions_list:
#   val_videos_path = val_path/emotion_name
#   audio_path = val_path.with_name('Val_AFEW_audio')/emotion_name
#   # print(audio_path)
#   for video_path in val_videos_path.iterdir():
#     subprocess.run(f'ffmpeg -i {video_path} -f wav -ab 192000 -ar 16000 -vn {audio_path/pathlib.Path(video_path.name).with_suffix(".wav")}', shell=True, check=True)
#     # print(audio_path/pathlib.Path(video_path.name).with_suffix('.wav'))

In [8]:
# dict_to_df_val = {'audio': [] ,'label': []} 
# for emotion_name in emotions_list: 
#   test_videos_path = val_path/emotion_name
#   audio_path = val_path.with_name('Val_AFEW_audio')/emotion_name
#   curr_emotion = audio_path.name
#   for audio in audio_path.iterdir():
#     dict_to_df_val['audio'].append(curr_emotion + '/' + audio.name)
#     dict_to_df_val['label'].append(class_to_idx.get(curr_emotion))  

In [9]:
# dict_to_df_train = {'audio': [] ,'label': []} 
# for emotion_name in emotions_list: 
#   train_videos_path = train_path/emotion_name
#   audio_path = train_path.with_name('Train_AFEW_audio')/emotion_name
#   curr_emotion = audio_path.name
#   for audio in audio_path.iterdir():
#     dict_to_df_train['audio'].append(curr_emotion + '/' + audio.name)
#     dict_to_df_train['label'].append(class_to_idx.get(curr_emotion))  

In [10]:
# train_data = pd.DataFrame.from_dict(dict_to_df_train)
# train_data.to_csv('/content/drive/MyDrive/sum_paper/AFEW/Train_AFEW_audio/train.csv', index=False)

In [11]:
# test_data = pd.DataFrame.from_dict(dict_to_df_val)
# test_data.to_csv('/content/drive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/test.csv', index=False)

# Wav2Vec2

In [12]:
import torchaudio
import torch
from datasets import load_dataset, load_metric
from transformers import AutoConfig, Wav2Vec2Processor

In [13]:
# CSV manifests, one per split; each row carries an `audio` path and a `label`.
data_files = dict(
    train="/content/drive/MyDrive/sum_paper/AFEW/Train_AFEW_audio/train.csv",
    validation="/content/drive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/test.csv",
)

dataset = load_dataset("csv", data_files=data_files, delimiter=",")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

# Show the columns and row count of each split.
print(train_dataset, eval_dataset, sep="\n")

Using custom data configuration default-dde72b2a7e211cb5
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-dde72b2a7e211cb5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['audio', 'label'],
    num_rows: 773
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 383
})


In [14]:
# Preview the validation manifest (columns: relative audio path and integer label).
pd.read_csv('/content/drive/MyDrive/sum_paper/AFEW/Val_AFEW_audio/test.csv')

Unnamed: 0,audio,label
0,Angry/000149120.wav,0
1,Angry/000316560.wav,0
2,Angry/000929080.wav,0
3,Angry/000551374.wav,0
4,Angry/001108440.wav,0
...,...,...
378,Surprise/011329120.wav,6
379,Surprise/012246840.wav,6
380,Surprise/015646720.wav,6
381,Surprise/014527960.wav,6


In [15]:
# Names of the dataset columns that hold the model input (the audio path)
# and the prediction target (the class label).
input_column = "audio"
output_column = "label"

In [16]:
# Collect the distinct labels that occur in the training split.
# They are sorted so the label order is deterministic across runs.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 7 classes: [0, 1, 2, 3, 4, 5, 6]


In [17]:
# Pretrained checkpoint that the config, the processor and the classifier are loaded from.
model_name_or_path = "facebook/wav2vec2-base-960h"
# Pooling strategy name; in the cells visible here it is only stored on the config object.
pooling_mode = "mean"

In [18]:
# Load the checkpoint's configuration and attach the emotion label mapping to it.
# NOTE(review): in the cells visible here `config` is never handed to the model
# that is built later — confirm whether it is still needed.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=class_to_idx,
    id2label=idx_to_class,
    finetuning_task="wav2vec2_clf",
)
# Record the pooling strategy as an extra attribute on the config.
config.pooling_mode = pooling_mode

In [19]:
# Load the processor that belongs to the chosen checkpoint.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
# Sampling rate reported by the processor's feature extractor; it is passed back
# to the processor when the audio is featurized.
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [20]:
def speech_file_to_array_fn(path, sampling_rate=16000):
    """Load one audio clip as a waveform array at ``sampling_rate`` Hz.

    Args:
        path: Clip path relative to a split directory (for example
            ``Angry/000149120.wav``). It is looked up under ``train_path``
            first and under ``val_path`` second.
        sampling_rate: Rate passed to ``librosa.load`` as ``sr``. Defaults to
            16000, the value this function always used.

    Returns:
        The waveform returned by ``librosa.load``.

    Raises:
        FileNotFoundError: If the clip exists in neither split directory.
    """
    # Probe the split folders explicitly instead of waiting for the audio
    # backend to fail: which exception a missing file produces is a backend
    # detail, whereas an existence check always behaves the same way.
    # NOTE(review): a relative path present in BOTH splits always resolves to
    # the training copy — confirm the two splits never share file names.
    for split_dir in (train_path, val_path):
        candidate = split_dir / path
        if candidate.is_file():
            speech, _ = librosa.load(str(candidate), sr=sampling_rate)
            return speech
    raise FileNotFoundError(
        f"Audio file {path!r} not found under {train_path} or {val_path}"
    )

def label_to_id(label, label_list):
    """Map ``label`` to its position in ``label_list``.

    Returns ``label`` unchanged when ``label_list`` is empty, the index of
    ``label`` when it is present, and ``-1`` when it is not.
    """
    # Without a known label set there is nothing to map against.
    if len(label_list) == 0:
        return label

    try:
        return label_list.index(label)
    except ValueError:
        # Unknown label.
        return -1

def preprocess_function(examples):
    """Convert a batch of manifest rows into processor features plus labels.

    Args:
        examples: Batch indexed by column name; ``examples[input_column]`` and
            ``examples[output_column]`` are iterated in parallel row order.

    Returns:
        The processor output with an added ``"labels"`` entry holding one
        class id per row.
    """
    waveforms = []
    for rel_path in examples[input_column]:
        waveforms.append(speech_file_to_array_fn(rel_path))

    class_ids = [label_to_id(raw_label, label_list) for raw_label in examples[output_column]]

    # Padding to max_length together with truncation is requested, so every
    # clip is presumably brought to a fixed 60000-sample length — TODO confirm.
    # NOTE(review): 'longest_first' is a tokenizer-style truncation strategy
    # name; confirm how the audio feature extractor interprets it.
    features = processor(
        waveforms,
        sampling_rate=target_sampling_rate,
        max_length=60000,
        padding="max_length",
        truncation='longest_first',
        return_attention_mask=True,
    )
    features["labels"] = class_ids
    return features

In [21]:
# preprocess_function(dataset['train'][:2])

In [22]:
# Featurize both splits with identical settings: batched calls of 100 rows,
# spread over 4 worker processes.
_map_options = dict(batch_size=100, batched=True, num_proc=4)

train_dataset = train_dataset.map(preprocess_function, **_map_options)
eval_dataset = eval_dataset.map(preprocess_function, **_map_options)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-dde72b2a7e211cb5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-222a9c94e8fc9f70.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-dde72b2a7e211cb5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-e806eef011835dd7.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-dde72b2a7e211cb5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-f5900cc11cbce33d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-dde72b2a7e211cb5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-9abba206c0526700.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-dde72b2a7e211cb5/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-bb7474a5fc388476.arrow
Loadi

In [23]:
# idx = 0
# print(f"Training input_values: {train_dataset[idx]['input_values']}")
# print(f"Training attention_mask: {train_dataset[idx]['attention_mask']}")
# print(f"Training labels: {train_dataset[idx]['labels']} - {train_dataset[idx]['label']}")

In [24]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping itself so it always
# matches the seven emotion classes; a separately hard-coded count can drift
# out of sync with `class_to_idx` / `idx_to_class` and with the labels in the data.
model = AutoModelForAudioClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(idx_to_class),
    label2id=class_to_idx,
    id2label=idx_to_class,
)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed', 'projector.weight', 'projector.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be 

In [25]:
# Bare checkpoint name: the part of the identifier after the last "/".
model_name = model_name_or_path.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ks",
    # Optimisation schedule.
    num_train_epochs=20,
    learning_rate=1e-4,
    warmup_ratio=0.5,
    # Batching: 20 examples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    gradient_accumulation_steps=2,
    # Evaluate and checkpoint once per epoch; reload the most accurate checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=10,
    # push_to_hub=True,
)

ValueError: ignored

In [None]:
# Accuracy metric object; compute_metrics calls its .compute() on predictions vs. references.
metric = load_metric("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level ``metric``.

    The predicted class of each row is the index of the largest value along
    axis 1 of ``eval_pred.predictions``; those indices are compared against
    ``eval_pred.label_ids``.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    predicted = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted, references=references)

In [None]:
# Bundle the model, the training configuration, both featurized splits and the
# metric callback into one Trainer. The processor is supplied through the
# `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()