In [1]:
import numpy as np
from pathlib import Path

import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import Dataset, Audio, load_from_disk
import evaluate

from scripts.tools import load_pickle

import matplotlib.pyplot as plt
from typing import Tuple, List, Dict
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_PATH = Path('../data/')
random_seed = 54

In [2]:
target_dict = load_pickle(DATA_PATH / "humor_label_sdk.pkl")
audio = [f'../data/urfunny2_audio/{str(idx)}.mp3' for idx in target_dict.keys()]
target_dict = load_pickle(DATA_PATH / "humor_label_sdk.pkl")
target = list(target_dict.values())

audio_ds = Dataset.from_dict({'audio' : audio, 'label': target}).cast_column("audio", Audio(sampling_rate=16_000))

In [3]:
audio_ds = audio_ds.train_test_split(test_size=0.3, seed=random_seed)

In [4]:
audio_ds['train'][0]

{'audio': {'path': '../data/urfunny2_audio/10927.mp3',
  'array': array([-2.36541428e-06, -1.08365850e-06,  6.93597394e-06, ...,
         -9.15424898e-03, -1.50176473e-02, -1.05204303e-02]),
  'sampling_rate': 16000},
 'label': 1}

In [5]:
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

def preprocess_function(wave):
    audio_arrays = [x["array"] for x in wave["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
        padding=True,
    )
    return inputs



In [6]:
encoded_audio_ds = audio_ds.map(preprocess_function, remove_columns="audio", batched=True, batch_size=10)

Map:   0%|          | 0/7116 [00:00<?, ? examples/s]

Map:   0%|          | 0/3050 [00:00<?, ? examples/s]

In [6]:
#encoded_audio_ds.save_to_disk('../data/encoded_audio_ds')
encoded_audio_ds = load_from_disk('../data/encoded_audio_ds')

In [7]:
label2id = {"HUMOR": 1, "NONHUMOR": 0}
id2label = {0: "NONHUMOR", 1: "HUMOR"}

In [8]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

In [9]:
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=2,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'project_q.bias', 'project_q.weight', 'project_hid.bias', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.weight', 'projecto

In [12]:
training_args = TrainingArguments(
    output_dir="../models/audio_wav2vec",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_audio_ds["train"],
    eval_dataset=encoded_audio_ds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss


TrainOutput(global_step=3330, training_loss=0.6934748755560981, metrics={'train_runtime': 1570.4944, 'train_samples_per_second': 67.966, 'train_steps_per_second': 2.12, 'total_flos': 9.6691069069056e+17, 'train_loss': 0.6934748755560981, 'epoch': 14.97})

In [23]:
predicts = trainer.predict(encoded_audio_ds['test'])

In [24]:
predicts

PredictionOutput(predictions=array([[-0.00502865,  0.03169612],
       [-0.0061147 ,  0.03137627],
       [-0.00586031,  0.03213147],
       ...,
       [-0.00605541,  0.03173213],
       [-0.00599126,  0.03191714],
       [-0.00565508,  0.03182648]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.6929964423179626, 'test_accuracy': 0.5088524590163934, 'test_runtime': 18.0048, 'test_samples_per_second': 169.4, 'test_steps_per_second': 10.608})

In [27]:
torch.save(predicts[1], 'true_labels.pt')

In [15]:
labels = predicts[1]

In [16]:
predicted_probs = torch.softmax(torch.tensor(predicts[0]), -1)
predicted_classes = torch.argmax(predicted_probs, -1)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

print(f"Accuracy: {accuracy_score(labels, predicted_classes)}\n")
print(f"Precision: {precision_score(labels, predicted_classes)}\n")
print(f"Recall: {recall_score(labels, predicted_classes)}\n")
cnf = confusion_matrix(labels, predicted_classes)
cnf_disp = ConfusionMatrixDisplay(confusion_matrix=cnf, display_labels=['No humor', 'Humor'])
cnf_disp.plot()

In [18]:
predicted_probs

tensor([[0.4908, 0.5092],
        [0.4906, 0.5094],
        [0.4905, 0.5095],
        ...,
        [0.4906, 0.5094],
        [0.4905, 0.5095],
        [0.4906, 0.5094]])

In [19]:
torch.save(predicted_probs, 'predicted_probs_audio.pt')

In [20]:
model.save_pretrained("../models/audio_wav2vec_train/best_model")