<a href="https://colab.research.google.com/github/abhisinghh/ml4audio/blob/main/fine_tuning_ml_models_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torchaudio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
pip install transformers datasets evaluate



In [3]:
from datasets import load_dataset, Audio

# Load the "train" split of PolyAI/minds14 for the en-US configuration only.
# NOTE(review): the notebook later uses `lang_id` as the classification target, but a
# single-locale configuration presumably contains only one `lang_id` value, which would
# make that task trivial — confirm whether a multi-locale configuration or a different
# label column was intended.
minds = load_dataset("PolyAI/minds14", name="en-US", split="train")



In [4]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the shuffle,
# and therefore every metric computed on the held-out split, reproducible across runs.
minds = minds.train_test_split(test_size=0.2, seed=42)

In [5]:
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

In [6]:
# Drop the columns that are not used further on, leaving only `audio` and `lang_id`.
unused_columns = ["path", "transcription", "english_transcription", "intent_class"]
minds = minds.remove_columns(unused_columns)

In [7]:
# Class names of the `lang_id` label feature; the position in this list is the integer label id.
labels = minds['train'].features['lang_id'].names

In [8]:
# Two-way lookup between class names and their integer ids (ids stored as strings).
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [9]:
id2label, label2id

({'0': 'cs-CZ',
  '1': 'de-DE',
  '2': 'en-AU',
  '3': 'en-GB',
  '4': 'en-US',
  '5': 'es-ES',
  '6': 'fr-FR',
  '7': 'it-IT',
  '8': 'ko-KR',
  '9': 'nl-NL',
  '10': 'pl-PL',
  '11': 'pt-PT',
  '12': 'ru-RU',
  '13': 'zh-CN'},
 {'cs-CZ': '0',
  'de-DE': '1',
  'en-AU': '2',
  'en-GB': '3',
  'en-US': '4',
  'es-ES': '5',
  'fr-FR': '6',
  'it-IT': '7',
  'ko-KR': '8',
  'nl-NL': '9',
  'pl-PL': '10',
  'pt-PT': '11',
  'ru-RU': '12',
  'zh-CN': '13'})

In [10]:
from transformers import AutoFeatureExtractor

# Feature extractor published with the same "facebook/wav2vec2-base" checkpoint
# that the classification model is initialised from later in the notebook.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [11]:
# Resample the audio to the rate the feature extractor expects. Reading the rate from
# the extractor (instead of hard-coding 16 kHz) keeps this cell consistent with
# `preprocess_function`, which passes `feature_extractor.sampling_rate` as well.
minds = minds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))


In [12]:
minds["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-US~LATEST_TRANSACTIONS/602baeea963e11ccd901ce14.wav',
  'array': array([-1.80856150e-05,  8.24753806e-06,  2.10441049e-05, ...,
         -2.89130141e-03,  1.26329833e-04,  1.91958412e-03]),
  'sampling_rate': 16000},
 'lang_id': 4}

In [13]:
def preprocess_function(examples):
    """Turn a batch of decoded audio examples into feature-extractor outputs.

    Args:
        examples: A batch whose ``"audio"`` entry is a sequence of dicts, each
            carrying the raw waveform under the ``"array"`` key.

    Returns:
        The result of calling the module-level ``feature_extractor`` on the
        batch of waveforms, with truncation enabled and ``max_length=16000``.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [14]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping: one output per entry in `id2label`.
num_labels = len(id2label)

# Start from the pretrained wav2vec2-base checkpoint and attach both label mappings
# to the model configuration.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.bias', 'project_hid.bias', 'quantizer.weight_proj.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'projector.bias', 'classifier.

In [15]:
# Run the feature extractor over both splits in batches, dropping the raw `audio`
# column, then rename the target column from `lang_id` to `label`.
encoded_minds = minds.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
).rename_column("lang_id", "label")

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [16]:
! pip install -U accelerate
! pip install -U transformers



In [17]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.30.2', '0.21.0')

In [18]:
import evaluate

# Accuracy metric loaded through the `evaluate` library; `compute_metrics` calls its `.compute()`.
accuracy = evaluate.load("accuracy")

In [19]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions.
    """
    scores = eval_pred.predictions
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    references = eval_pred.label_ids
    return accuracy.compute(predictions=predicted_ids, references=references)

In [20]:
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed to the Hub.
    output_dir="my_model",
    push_to_hub=False,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    # Per-device batches of 32, with gradients accumulated over 4 batches per update.
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, then reload the most accurate checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=10,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=feature_extractor,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    compute_metrics=compute_metrics,
)

# Left as the last expression so the notebook displays the returned training summary.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,No log,2.523622,0.716814
1,No log,2.074497,1.0
2,2.311100,1.7001,1.0
4,2.311100,1.456949,1.0
4,2.311100,1.337486,1.0
5,1.517600,1.240804,1.0
6,1.517600,1.193649,1.0
8,1.253900,1.171089,1.0


TrainOutput(global_step=30, training_loss=1.6941734631856282, metrics={'train_runtime': 64.8842, 'train_samples_per_second': 69.354, 'train_steps_per_second': 0.462, 'total_flos': 3.26841433344e+16, 'train_loss': 1.6941734631856282, 'epoch': 8.0})

In [None]:
from transformers import AutoModelForAudioClassification

# Run inference with the model fine-tuned above (the Trainer is configured to reload
# the best checkpoint when training ends) instead of replacing it with an unrelated
# checkpoint from the Hub whose labels do not match this notebook's `lang_id` classes.
model = trainer.model
model.eval()

# `inputs` has to be built here: no earlier cell defines it. Use one held-out clip,
# and move the tensors to the model's device because training may have moved the
# model to a GPU.
sample_audio = minds["test"][0]["audio"]
inputs = feature_extractor(
    sample_audio["array"],
    sampling_rate=sample_audio["sampling_rate"],
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

# Map the highest-scoring class index back to its language label for display.
model.config.id2label[logits.argmax(dim=-1).item()]

In [22]:
model

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
