In [1]:
from datasets import Dataset, Audio

In [2]:
import pickle
from typing import Dict

def load_pickle(pickle_file) -> Dict:
    """Deserialize and return the object stored in ``pickle_file``.

    The file is first read with pickle's default settings. If that attempt
    raises ``UnicodeDecodeError``, the file is reopened and read again with
    ``encoding='latin1'``.

    Any other exception from the first attempt is reported with ``print``
    and then re-raised unchanged. Exceptions raised by the latin1 retry
    propagate without being printed.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing; only call this on files from a trusted source.
    """
    try:
        with open(pickle_file, 'rb') as stream:
            return pickle.load(stream)
    except UnicodeDecodeError:
        # Default decoding failed; retry with a byte-preserving encoding.
        with open(pickle_file, 'rb') as stream:
            return pickle.load(stream, encoding='latin1')
    except Exception as err:
        print('Unable to load data ', pickle_file, ':', err)
        raise

In [3]:
# Load the humor-label mapping, then split it into two parallel lists that
# follow the mapping's iteration order.
idxs = load_pickle('../data/humor_label_sdk.pkl')
keys = [*idxs.keys()]
labels = [*idxs.values()]

In [4]:
# Keep only the first 16 labels (one per audio clip used in this notebook).
# NOTE(review): this assumes the first 16 entries of the label mapping, in
# iteration order, belong to the 16 clip ids listed for `audio` — confirm.
labels = labels[:16]

In [5]:
# Relative paths of the 16 mp3 clips used in this notebook, one per clip id.
audio = [
    f"../data/urfunny2_audio/{idx}.mp3"
    for idx in (1, 3, 4, 5, 7, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20)
]

In [6]:
# Every clip path needs exactly one label; fail early with a readable message
# instead of a low-level error from the dataset constructor.
assert len(audio) == len(labels), (
    f"audio/label length mismatch: {len(audio)} paths vs {len(labels)} labels"
)

# Two-column dataset: the clip path and its label.
audio_ds = Dataset.from_dict({'audio' : audio, 'label': labels})

# Hold out 30% of the rows for evaluation. The seed is fixed so that
# "Restart Kernel -> Run All" reproduces the same train/test partition.
audio_ds = audio_ds.train_test_split(test_size=0.3, seed=42)

In [7]:
# Re-declare the "audio" column as an Audio feature with sampling_rate=16_000.
# NOTE(review): presumably chosen to match the sampling rate the feature
# extractor expects — confirm against feature_extractor.sampling_rate.
audio_ds = audio_ds.cast_column("audio", Audio(sampling_rate=16_000))

In [8]:
from transformers import AutoFeatureExtractor

# Feature extractor loaded from the "facebook/wav2vec2-base" checkpoint.
# It is called inside preprocess_function and later handed to the Trainer.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

def preprocess_function(examples):
    """Run the module-level ``feature_extractor`` over a batch of audio.

    Args:
        examples: a batch where ``examples["audio"]`` is an iterable of
            items that each expose their samples under the ``"array"`` key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch. Inputs are
        truncated to at most 16000 samples (``max_length=16000`` with
        ``truncation=True``); presumably one second at 16 kHz — TODO confirm.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )



In [9]:
# Apply preprocess_function in batches; the recorded output below shows the
# result keeps 'audio' and 'label' and gains an 'input_values' column.
encoded_audio_ds = audio_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [10]:
# Display the encoded dataset to check its splits, columns and row counts.
encoded_audio_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'label', 'input_values'],
        num_rows: 11
    })
    test: Dataset({
        features: ['audio', 'label', 'input_values'],
        num_rows: 5
    })
})

In [11]:
# Class name -> integer id used in the 'label' column.
# NOTE(review): assumes a label value of 1 in the pickle means humor — confirm.
label2id = {"HUMOR" : 1, "NONHUMOR" : 0}

# Integer id -> class name, built by inverting label2id so that id 1 maps to
# "HUMOR" and the two mappings can never disagree.
id2label = {class_id: name for name, class_id in label2id.items()}

In [12]:
import evaluate
import numpy as np

# Accuracy metric object loaded through `evaluate`; compute_metrics calls
# its .compute(...) method.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a set of model outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: object exposing ``.predictions`` and ``.label_ids``.
            Assumes ``predictions`` is 2-D with one row per example and one
            column per class — TODO confirm.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        compared against ``eval_pred.label_ids``.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
    )

In [13]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Number of output classes, taken from the label mapping.
num_labels = len(id2label)

# Audio-classification model initialised from the "facebook/wav2vec2-base"
# checkpoint, configured with this notebook's label mappings.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.weight', 'project_q.weight', 'quantizer.codevectors', 'project_hid.bias', 'project_q.bias', 'quantizer.weight_proj.bias', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier

In [14]:
# Training configuration for fine-tuning `model`.
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    # Evaluation and checkpoint saving are both set to the "epoch" schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    # Batch size 1 with 4 accumulation steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    warmup_ratio=0.1,
    # NOTE(review): the recorded run below ended at global_step=4, so a
    # 10-step logging interval may never fire on a dataset this small — confirm.
    logging_steps=10,
    # "accuracy" is presumably the key produced by compute_metrics — confirm.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Trainer wired to the encoded train/test splits; the feature extractor is
# passed through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_audio_ds["train"],
    eval_dataset=encoded_audio_ds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Start fine-tuning; the cell output is the returned TrainOutput.
trainer.train()



Epoch,Training Loss,Validation Loss


TrainOutput(global_step=4, training_loss=0.6854737401008606, metrics={'train_runtime': 23.763, 'train_samples_per_second': 0.926, 'train_steps_per_second': 0.168, 'total_flos': 145258122240000.0, 'train_loss': 0.6854737401008606, 'epoch': 1.45})

In [16]:
# Run the trained model on the held-out split; the displayed PredictionOutput
# carries the raw predictions, the label ids and the test metrics.
trainer.predict(encoded_audio_ds['test'])

PredictionOutput(predictions=array([[ 0.02476062, -0.07539974],
       [ 0.05651618, -0.03700653],
       [ 0.04202554, -0.05559085],
       [ 0.04119868, -0.08987164],
       [ 0.03025232, -0.06811029]], dtype=float32), label_ids=array([1, 1, 1, 1, 1]), metrics={'test_loss': 0.7465988397598267, 'test_accuracy': 0.0, 'test_runtime': 0.762, 'test_samples_per_second': 6.562, 'test_steps_per_second': 6.562})