In [1]:
!pip install datasets
!pip install transformers

[0m

In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoConfig, Wav2Vec2Processor
import torchaudio
import torch

In [3]:
import pathlib 
import subprocess
import IPython
import pandas as pd
import librosa

In [4]:
# Root folder of the VGAF input dataset; the audio folders are resolved relative to it.
base_dir = pathlib.Path('../input/vgaf-dataset')

In [5]:
# Folders holding the training and validation clips, both located under base_dir.
train_path = base_dir/'Train_VGAF_audio-20220823T161233Z-001/Train_VGAF_audio'
val_path = base_dir/'Val_VGAF_audio-20220823T161237Z-001/Val_VGAF_audio'

In [6]:
# Emotion classes in label-id order. Ids are zero-based so that they line up with the
# classifier's output indices (3 labels -> ids 0..2); ids 1..3 would leave output 0
# without a name and map id 3 to a non-existent output.
# NOTE(review): assumes the label files encode Positive/Neutral/Negative as 0/1/2
# (the former 1/2/3 ids shifted down by one) — confirm against the label files.
classes = ['Positive', 'Neutral', 'Negative']
class_to_idx = {name: idx for idx, name in enumerate(classes)}
idx_to_class = {idx: name for name, idx in class_to_idx.items()}

# Audio extraction 

In [None]:
# Convert every file in the training folder to a 16 kHz WAV with ffmpeg.
audio_path = train_path.with_name('Train_VGAF_audio')
print(audio_path)
# Snapshot the listing before converting: audio_path is a sibling of train_path with a
# fixed name and can resolve to the very folder being iterated, in which case freshly
# written .wav files must not be picked up by this loop.
for video_path in list(train_path.iterdir()):
    #print(video_path)
    wav_path = audio_path / pathlib.Path(video_path.name).with_suffix(".wav")
    # Pass an argument list instead of a shell string so that paths containing spaces
    # or shell metacharacters reach ffmpeg verbatim.
    subprocess.run(
        ['ffmpeg', '-i', str(video_path), '-f', 'wav', '-ab', '192000',
         '-ar', '16000', '-vn', str(wav_path)],
        check=True,  # abort on the first failed conversion
    )
    # print(audio_path/pathlib.Path(video_path.name).with_suffix('.wav'))

In [None]:
# Convert every file in the validation folder to a 16 kHz WAV with ffmpeg.
audio_path = val_path.with_name('Val_VGAF_audio')
print(audio_path)
# Snapshot the listing before converting: audio_path is a sibling of val_path with a
# fixed name and can resolve to the very folder being iterated, in which case freshly
# written .wav files must not be picked up by this loop.
for video_path in list(val_path.iterdir()):
    print(video_path)
    wav_path = audio_path / pathlib.Path(video_path.name).with_suffix(".wav")
    # Pass an argument list instead of a shell string so that paths containing spaces
    # or shell metacharacters reach ffmpeg verbatim.
    subprocess.run(
        ['ffmpeg', '-i', str(video_path), '-f', 'wav', '-ab', '192000',
         '-ar', '16000', '-vn', str(wav_path)],
        check=True,  # abort on the first failed conversion
    )
    # print(audio_path/pathlib.Path(video_path.name).with_suffix('.wav'))

In [None]:
# Collect, per emotion folder, the relative audio paths ('<emotion>/<file name>') and
# the matching class ids into two parallel lists.
# NOTE(review): `emotions_list` is not defined in any visible cell, and the folder name
# 'Val_AFEW_audio' differs from the 'Val_VGAF_audio' name used by the other cells —
# this looks like a leftover from another dataset; presumably it raises NameError on a
# fresh kernel. Confirm before running.
dict_to_df_val = {'audio': [] ,'label': []} 
for emotion_name in emotions_list: 
    # NOTE(review): test_videos_path is assigned but never read in this cell.
    test_videos_path = val_path/emotion_name
    audio_path = val_path.with_name('Val_AFEW_audio')/emotion_name
    # Same value as emotion_name: the last path component of audio_path.
    curr_emotion = audio_path.name
    for audio in audio_path.iterdir():
        dict_to_df_val['audio'].append(curr_emotion + '/' + audio.name)
        # .get() yields None when the folder name is not a key of class_to_idx.
        dict_to_df_val['label'].append(class_to_idx.get(curr_emotion))  

# Wav2Vec2

In [7]:
# Space-delimited label files, one per split: "train"/"test" are the two halves of
# fold 5, "validation" is the label file shipped with the dataset.
data_files = dict(
    train="../input/5-folds-vgaf/5_train_fold.txt",
    test="../input/5-folds-vgaf/5_test_fold.txt",
    validation="../input/vgaf-dataset/Val_labels.txt",
)

dataset = load_dataset("csv", data_files=data_files, delimiter=" ")
train_dataset, test_dataset, eval_dataset = (
    dataset["train"],
    dataset["test"],
    dataset["validation"],
)

# Show the schema and row count of each split.
print(train_dataset)
print(test_dataset)
print(eval_dataset)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-097bf2f1ed0d3fc7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-097bf2f1ed0d3fc7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['Vid_name', 'Label'],
    num_rows: 2129
})
Dataset({
    features: ['Vid_name', 'Label'],
    num_rows: 532
})
Dataset({
    features: ['Vid_name', 'Label'],
    num_rows: 766
})


In [8]:
# Names of the dataset columns holding the clip identifier (model input) and the
# class label (model target).
input_column, output_column = "Vid_name", "Label"

In [9]:
# Distinct label values found in the training split; sorted so the label -> index
# mapping does not depend on row order.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 3 classes: [0, 1, 2]


In [10]:
# Pooling strategy name and the pretrained checkpoint that is fine-tuned below.
pooling_mode = "mean"
model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

In [11]:
# Checkpoint configuration extended with the classification label mappings.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    finetuning_task="wav2vec2_clf",
    id2label=idx_to_class,
    label2id=class_to_idx,
    num_labels=num_labels,
)
# Record the pooling strategy on the config object as an extra attribute.
config.pooling_mode = pooling_mode

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

In [12]:
# Load the processor that belongs to the checkpoint and read the sampling rate its
# feature extractor is configured for; audio is prepared at this rate.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/300 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The target sampling rate: 16000


In [13]:
def speech_file_to_array_fn(path):
    """Load one clip as a waveform at the processor's sampling rate.

    Args:
        path: Clip name without extension; '.wav' is appended. The file is looked
            up in `train_path` first and in `val_path` as a fallback.

    Returns:
        The waveform array returned by `librosa.load`, resampled to
        `target_sampling_rate`.

    Raises:
        FileNotFoundError: If the clip exists in neither folder.
    """
    file_name = path + '.wav'
    # Check for the file explicitly instead of relying on the audio backend to raise
    # FileNotFoundError: which exception a missing file produces depends on the
    # decoder librosa ends up using.
    for folder in (train_path, val_path):
        candidate = folder / file_name
        if candidate.is_file():
            # Resample to the rate the processor is later told the audio has.
            speech, _ = librosa.load(candidate, sr=target_sampling_rate)
            return speech
    raise FileNotFoundError(
        f"{file_name!r} not found in {train_path} or {val_path}"
    )

def label_to_id(label, label_list):
    """Translate a raw label into its position within `label_list`.

    Args:
        label: Raw label value.
        label_list: Sequence of known labels.

    Returns:
        The index of `label` in `label_list`, or -1 when it is not present.
        When `label_list` is empty, `label` is returned unchanged.
    """
    if len(label_list) == 0:
        return label

    try:
        return label_list.index(label)
    except ValueError:
        # Unknown label.
        return -1

def preprocess_function(examples):
    """Turn a batch of dataset rows into model inputs.

    Args:
        examples: Batch mapping column names to lists; the `input_column` entry holds
            clip names and the `output_column` entry holds raw labels.

    Returns:
        The processor output for the batch (padded to a max_length of 60000, with
        truncation requested and an attention mask), plus a "labels" entry holding
        the label indices.
    """
    waveforms = list(map(speech_file_to_array_fn, examples[input_column]))
    class_ids = [label_to_id(raw_label, label_list) for raw_label in examples[output_column]]

    batch = processor(
        waveforms,
        sampling_rate=target_sampling_rate,
        max_length=60000,
        padding="max_length",
        truncation='longest_first',
        return_attention_mask=True,
    )
    batch["labels"] = class_ids
    return batch

In [14]:
# Featurize the training split: batches of 30 rows, spread over 4 worker processes.
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=30,
    num_proc=4,
)


      

#0:   0%|          | 0/18 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/18 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/18 [00:00<?, ?ba/s]

#3:   0%|          | 0/18 [00:00<?, ?ba/s]

In [None]:
# idx = 0
# print(f"Training input_values: {train_dataset[idx]['input_values']}")
# print(f"Training attention_mask: {train_dataset[idx]['attention_mask']}")
# print(f"Training labels: {train_dataset[idx]['labels']} - {train_dataset[idx]['Label']}")

In [15]:
# Featurize the held-out fold: batches of 30 rows, spread over 4 worker processes.
test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=30,
    num_proc=4,
)


      

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/5 [00:00<?, ?ba/s]

#3:   0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Featurize the validation split: batches of 30 rows, spread over 4 worker processes.
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=30,
    num_proc=4,
)

In [16]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with an audio-classification head sized for our labels.
model = AutoModelForAudioClassification.from_pretrained(
    model_name_or_path,
    id2label=idx_to_class,
    label2id=class_to_idx,
    num_labels=num_labels,
)

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['projector.bias', 'classifier.weight', 'projector.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task 

In [17]:
# Short checkpoint name (text after the last "/"), used to name the output folder.
model_name = model_name_or_path.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ks",
    # Schedule and optimisation
    num_train_epochs=8,
    learning_rate=1e-4,
    warmup_ratio=0.5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    gradient_accumulation_steps=2,
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints on disk
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=10,
    # Reload the checkpoint with the best accuracy when training ends
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)

In [18]:
# Accuracy metric object; its .compute() is called from compute_metrics.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("accuracy")

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [19]:
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (reference labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max predictions.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    # Predicted class = index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [20]:
# Wire model, arguments, data and metric together. Note that the per-epoch evaluation
# runs on `test_dataset` (the held-out fold), not on `eval_dataset`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

In [21]:
# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSequenceClassification.forward` and have been ignored: Label, Vid_name. If Label, Vid_name are not expected by `Wav2Vec2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2129
  Num Epochs = 8
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 2
  Total optimization steps = 848
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
0,1.0455,1.027216,0.477444
1,0.9366,1.006037,0.526316
2,0.7713,1.009316,0.537594
3,0.7768,1.143799,0.522556
4,0.6123,1.423424,0.509398
5,0.4885,1.611376,0.528195
6,0.2448,1.826059,0.488722
7,0.1496,1.937706,0.481203


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceClassification.forward` and have been ignored: Label, Vid_name. If Label, Vid_name are not expected by `Wav2Vec2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 532
  Batch size = 10
Saving model checkpoint to wav2vec2-large-xlsr-53-english-finetuned-ks/checkpoint-106
Configuration saved in wav2vec2-large-xlsr-53-english-finetuned-ks/checkpoint-106/config.json
Model weights saved in wav2vec2-large-xlsr-53-english-finetuned-ks/checkpoint-106/pytorch_model.bin
Feature extractor saved in wav2vec2-large-xlsr-53-english-finetuned-ks/checkpoint-106/preprocessor_config.json
tokenizer config file saved in wav2vec2-large-xlsr-53-english-finetuned-ks/checkpoint-106/tokenizer_config.json
Special tokens file saved in wav2vec2-large-xlsr-53-english-finetuned-ks/checkpoint-106/special_tokens_map.json
The following columns in th

TrainOutput(global_step=848, training_loss=0.6026811416981355, metrics={'train_runtime': 2894.9706, 'train_samples_per_second': 5.883, 'train_steps_per_second': 0.293, 'total_flos': 1.9347095845774802e+18, 'train_loss': 0.6026811416981355, 'epoch': 8.0})

In [22]:
import shutil

# Zip the checkpoint the Trainer tracked as best rather than a hard-coded
# 'checkpoint-<step>' folder: the step number changes with the data, batch size or
# epoch count, and older checkpoints are pruned by save_total_limit.
best_checkpoint_dir = trainer.state.best_model_checkpoint
if best_checkpoint_dir is None:
    raise RuntimeError(
        "No best checkpoint recorded; run trainer.train() with "
        "load_best_model_at_end=True before archiving."
    )
# NOTE(review): the archive name still hard-codes the fold and accuracy of one run.
shutil.make_archive('w2v2_l_5f_0.53', 'zip', best_checkpoint_dir)

'/kaggle/working/w2v2_l_5f_0.53.zip'