# Initialisation

In [None]:
!pip install datasets>=1.18.3
!pip install librosa
!pip install seaborn
!pip install jiwer

!pip install -U accelerate
!pip install -U transformers

In [None]:
! pip install Levenshtein

In [2]:
!pip install -U -q PyDrive

In [4]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [5]:
zip_file_id = '1HjvCba3D2474LcRflTQhAOQyyb1ohrr2'

zip_file = drive.CreateFile({'id': zip_file_id})

zip_file.GetContentFile('timid_2.zip', mimetype='application/zip')

In [None]:
! unzip -u /content/timid_2.zip

# Dataset

In [7]:
from datasets import load_dataset, load_metric, Audio, Dataset
from tqdm.auto import tqdm

import os
import numpy as np
import pandas as pd
import torchaudio

os.environ["WANDB_DISABLED"] = "true"

timit_path = '/content/TIMIT/'
data_path = '/content/TIMIT/data'

In [8]:
df_train = pd.read_csv(os.path.join(timit_path, 'train_data.csv'))
df_test = pd.read_csv(os.path.join(timit_path, 'test_data.csv'))

df_train = df_train[df_train['is_converted_audio'] == False]
df_test = df_test[df_test['is_converted_audio'] == False]

In [9]:
data_train = {}

for idx, row in tqdm(df_train.iterrows()):
    path = row['path_from_data_dir']
    entry_id = path.split('.')[0]

    if entry_id not in data_train:
        data_train[entry_id] = {}

    if row['is_audio'] is True:
        data_train[entry_id]['audio_file'] = os.path.join(data_path, path)
    elif row['is_word_file'] is True:
        data_train[entry_id]['word_file'] = os.path.join(data_path, path)
    elif row['is_phonetic_file'] is True:
        data_train[entry_id]['phonetic_file'] = os.path.join(data_path, path)

data_test = {}
for idx, row in tqdm(df_test.iterrows()):
    path = row['path_from_data_dir']
    entry_id = path.split('.')[0]

    if entry_id not in data_test:
        data_test[entry_id] = {}

    if row['is_audio'] is True:
        data_test[entry_id]['audio_file'] = os.path.join(data_path, path)
    elif row['is_word_file'] is True:
        data_test[entry_id]['word_file'] = os.path.join(data_path, path)
    elif row['is_phonetic_file'] is True:
        data_test[entry_id]['phonetic_file'] = os.path.join(data_path, path)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [52]:
import random
print(len(data_train))
print(len(data_test))

train_keys = [key for key in data_train.keys() if len(data_train[key]) == 3]
test_keys = [key for key in data_test.keys() if len(data_test[key]) == 3]



validation_percentage = 0.2
random.shuffle(train_keys)
num_validation_keys = int(len(train_keys) * validation_percentage)
valid_keys = train_keys[:num_validation_keys]
train_keys = train_keys[num_validation_keys:]

train = {key: data_train[key] for key in train_keys}
valid = {key: data_train[key] for key in valid_keys}
test = {key: data_test[key] for key in test_keys}

print(len(train))
print(len(test))

4620
1680
1344
1680


In [11]:
import librosa

def get_durations(dict_data):
    total_durations = 0

    for entry in dict_data.values():
        audio_data, _ = librosa.load(entry['audio_file'], sr=16_000)
        duration = len(audio_data) / 16_000
        total_durations += duration

    return int(total_durations)

print(f"Duration of Train: {get_durations(train) // 60} mns")
print(f"Duration of Test : {get_durations(test) // 60} mns")

Duration of Train: 85 mns
Duration of Test : 86 mns


In [53]:
import json

with open("/content/working/custom_train.json", "w") as f:
    json.dump(train, f)
with open("/content/working/custom_valid.json", "w") as f:
    json.dump(valid, f)
with open("/content/working/custom_test.json", "w") as f:
    json.dump(test, f)

## Convert to Dataset Format

In [54]:
from sklearn.model_selection import train_test_split

def convert_to_feature_dict(data_dict):
    audio_files = []
    word_files = []
    phonetic_files = []
    for key, value in data_dict.items():
        audio_files.append(value['audio_file'])
        word_files.append(value['word_file'])
        phonetic_files.append(value['phonetic_file'])

    return {
        'audio_file': audio_files,
        'word_file': word_files,
        'phonetic_file': phonetic_files
    }


train_dataset = Dataset.from_dict(convert_to_feature_dict(train))
valid_dataset = Dataset.from_dict(convert_to_feature_dict(valid))
test_dataset = Dataset.from_dict(convert_to_feature_dict(test))

print(train_dataset)

Dataset({
    features: ['audio_file', 'word_file', 'phonetic_file'],
    num_rows: 1344
})


In [55]:
def read_text_file(filepath):
    with open(filepath) as f:
        tokens = [line.split()[-1] for line in f]
        return " ".join(tokens)

def prepare_text_data(item):
    item['text'] = read_text_file(item['word_file'])
    item['phonetic'] = read_text_file(item['phonetic_file'])
    return item


train_dataset = (train_dataset
                 .map(prepare_text_data)
                 .remove_columns(["word_file", "phonetic_file"]))
valid_dataset = (valid_dataset
                 .map(prepare_text_data)
                 .remove_columns(["word_file", "phonetic_file"]))
test_dataset  = (test_dataset
                 .map(prepare_text_data)
                 .remove_columns(["word_file", "phonetic_file"]))

Map:   0%|          | 0/1344 [00:00<?, ? examples/s]

Map:   0%|          | 0/336 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

## Normalize the Phonetics

In [56]:
train_phonetics = [phone for x in train_dataset for phone in x['phonetic'].split()]
print("num of train phones:\t", len(set(train_phonetics)))

num of train phones:	 61


In [57]:
phon61_map39 = {
    'iy':'iy',  'ih':'ih',   'eh':'eh',  'ae':'ae',    'ix':'ih',  'ax':'ah',   'ah':'ah',  'uw':'uw',
    'ux':'uw',  'uh':'uh',   'ao':'aa',  'aa':'aa',    'ey':'ey',  'ay':'ay',   'oy':'oy',  'aw':'aw',
    'ow':'ow',  'l':'l',     'el':'l',  'r':'r',      'y':'y',    'w':'w',     'er':'er',  'axr':'er',
    'm':'m',    'em':'m',     'n':'n',    'nx':'n',     'en':'n',  'ng':'ng',   'eng':'ng', 'ch':'ch',
    'jh':'jh',  'dh':'dh',   'b':'b',    'd':'d',      'dx':'dx',  'g':'g',     'p':'p',    't':'t',
    'k':'k',    'z':'z',     'zh':'sh',  'v':'v',      'f':'f',    'th':'th',   's':'s',    'sh':'sh',
    'hh':'hh',  'hv':'hh',   'pcl':'h#', 'tcl':'h#', 'kcl':'h#', 'qcl':'h#','bcl':'h#','dcl':'h#',
    'gcl':'h#','h#':'h#',  '#h':'h#',  'pau':'h#', 'epi': 'h#','nx':'n',   'ax-h':'ah','q':'h#'
}

def convert_phon61_to_phon39(sentence):
    tokens = [phon61_map39[x] for x in sentence.split()]
    return " ".join(tokens)

def normalize_phones(item):
    item['phonetic'] = convert_phon61_to_phon39(item['phonetic'])
    return item

In [58]:
train_dataset = train_dataset.map(normalize_phones)
valid_dataset = train_dataset.map(normalize_phones)
test_dataset = test_dataset.map(normalize_phones)

Map:   0%|          | 0/1344 [00:00<?, ? examples/s]

Map:   0%|          | 0/1344 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

In [59]:
train_phonetics = [phone for x in train_dataset for phone in x['phonetic'].split()]
valid_phonetics = [phone for x in valid_dataset for phone in x['phonetic'].split()]
test_phonetics = [phone for x in test_dataset for phone in x['phonetic'].split()]

print("num of train phones:\t", len(set(train_phonetics)))
print("num of test phones:\t", len(set(test_phonetics)))

num of train phones:	 39
num of test phones:	 39


## Load Audio

In [60]:
train_dataset = (train_dataset
                 .cast_column("audio_file", Audio(sampling_rate=16_000))
                 .rename_column('audio_file', 'audio'))
valid_dataset = (valid_dataset
                 .cast_column("audio_file", Audio(sampling_rate=16_000))
                 .rename_column('audio_file', 'audio'))
test_dataset = (test_dataset
                 .cast_column("audio_file", Audio(sampling_rate=16_000))
                 .rename_column('audio_file', 'audio'))

In [61]:
vocab_train = list(set(train_phonetics)) + [' ']
vocab_valid = list(set(valid_phonetics)) + [' ']
vocab_test  = list(set(test_phonetics)) + [' ']

vocab_list = list(set(vocab_train + vocab_valid + vocab_test))
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}

print(vocab_dict)

{' ': 0, 'aa': 1, 'ae': 2, 'ah': 3, 'aw': 4, 'ay': 5, 'b': 6, 'ch': 7, 'd': 8, 'dh': 9, 'dx': 10, 'eh': 11, 'er': 12, 'ey': 13, 'f': 14, 'g': 15, 'h#': 16, 'hh': 17, 'ih': 18, 'iy': 19, 'jh': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'ng': 25, 'ow': 26, 'oy': 27, 'p': 28, 'r': 29, 's': 30, 'sh': 31, 't': 32, 'th': 33, 'uh': 34, 'uw': 35, 'v': 36, 'w': 37, 'y': 38, 'z': 39}


In [62]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

In [63]:
import json
with open('/content/working/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

# Modeling

Wav2Vec


## Input

In [64]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from tokenizers.processors import TemplateProcessing

In [65]:
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "/content/working/", unk_token="[UNK]", pad_token="[PAD]",
    word_delimiter_token="|", )
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [66]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["phonetic"]).input_ids
    return batch

train_dataset = train_dataset.map(prepare_dataset)
valid_dataset = valid_dataset.map(prepare_dataset)
test_dataset = test_dataset.map(prepare_dataset)

Map:   0%|          | 0/1344 [00:00<?, ? examples/s]



Map:   0%|          | 0/1344 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

In [73]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

In [74]:
cer_metric = load_metric("cer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids)
    label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {
        "cer": cer
    }

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


# Model & Training

In [75]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.1,
    layerdrop=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.75,
    mask_time_length=10,
    mask_feature_prob=0.25,
    mask_feature_length=64,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
model.freeze_feature_encoder()

**Let's limit ourselves to just 50 steps. While we might not be able to observe a clear trend, it would be more manageable for our own learning purposes and save us from waiting for hours.**

In [82]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='/kaggle/working/',
    group_by_length=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    evaluation_strategy="steps",
    gradient_checkpointing=True,
    fp16=True,
    max_steps=50,
    save_steps=10,  #100,
    eval_steps=10,
    logging_steps=10,
    learning_rate=3e-5,
    warmup_steps=20,
    save_total_limit=3,
    load_best_model_at_end=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [109]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=processor.feature_extractor
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [84]:
os.environ["WANDB_DISABLED"] = "true"
trainer.train()

Step,Training Loss,Validation Loss,Cer
10,10.7074,10.571594,0.961402
20,9.0879,8.536736,1.0
30,6.6484,5.760906,1.0
40,5.611,5.079889,1.0
50,5.2903,4.917608,1.0




TrainOutput(global_step=50, training_loss=7.469026184082031, metrics={'train_runtime': 546.4793, 'train_samples_per_second': 2.928, 'train_steps_per_second': 0.091, 'total_flos': 1.5388155911913984e+17, 'train_loss': 7.469026184082031, 'epoch': 1.19})

In [85]:
trainer.evaluate(test_dataset)



{'eval_loss': 4.943404197692871,
 'eval_cer': 1.0,
 'eval_runtime': 74.7005,
 'eval_samples_per_second': 22.49,
 'eval_steps_per_second': 2.811,
 'epoch': 1.19}

In [86]:
trainer.evaluate(train_dataset)

{'eval_loss': 4.917608261108398,
 'eval_cer': 1.0,
 'eval_runtime': 57.7432,
 'eval_samples_per_second': 23.275,
 'eval_steps_per_second': 2.909,
 'epoch': 1.19}

# Fine-tuning using different heads

In [111]:
import torch.nn as nn

class LinearHead(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearHead, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)

class MLPHead(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLPHead, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [88]:
# Linear head
linear_head = LinearHead(input_size=model.config.hidden_size, output_size=len(processor.tokenizer))
model.lm_head = linear_head
trainer.train()



Step,Training Loss,Validation Loss,Cer
10,11.049,11.148504,0.893231
20,11.2075,11.14569,0.899522
30,11.2461,11.143713,0.902335
40,11.1462,11.143122,0.903288
50,11.1077,11.143026,0.903231


Checkpoint destination directory /kaggle/working/checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /kaggle/working/checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=50, training_loss=11.151270904541015, metrics={'train_runtime': 567.0713, 'train_samples_per_second': 2.822, 'train_steps_per_second': 0.088, 'total_flos': 1.5392947227785088e+17, 'train_loss': 11.151270904541015, 'epoch': 1.19})

In [89]:
trainer.evaluate(test_dataset)



{'eval_loss': 11.231651306152344,
 'eval_cer': 0.9025978211822343,
 'eval_runtime': 73.9974,
 'eval_samples_per_second': 22.704,
 'eval_steps_per_second': 2.838,
 'epoch': 1.19}

In [132]:
# MLP head
mlp_head = MLPHead(input_size=model.config.hidden_size, hidden_size=512, output_size=len(processor.tokenizer))
model.lm_head = mlp_head
trainer.train()



Step,Training Loss,Validation Loss,Cer
10,11.1063,11.154658,0.868479
20,11.3492,11.081032,0.895906
30,10.9309,10.992148,0.963928
40,11.0043,10.937734,0.999105
50,10.9083,10.918483,0.999931


Checkpoint destination directory /kaggle/working/checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /kaggle/working/checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=50, training_loss=11.059790344238282, metrics={'train_runtime': 729.4765, 'train_samples_per_second': 2.193, 'train_steps_per_second': 0.069, 'total_flos': 1.5449764829651578e+17, 'train_loss': 11.059790344238282, 'epoch': 1.19})

In [91]:
trainer.evaluate(test_dataset)



{'eval_loss': 11.324196815490723,
 'eval_cer': 0.9737086867235775,
 'eval_runtime': 73.7695,
 'eval_samples_per_second': 22.774,
 'eval_steps_per_second': 2.847,
 'epoch': 1.19}

# Visualise

In [126]:
test_dataset[4]['audio']

{'path': None,
 'array': array([ 3.96728516e-04, -9.15527344e-05,  3.05175781e-05, ...,
         3.05175781e-05,  1.52587891e-04,  3.05175781e-05]),
 'sampling_rate': 16000}

In [135]:
predictions = trainer.predict(test_dataset)



In [None]:
model_output = predictions.predictions

In [None]:
with torch.no_grad():
    model_output = model(test_dataset['input_values'])

In [None]:
speaker_w2v_10 = model_output.last_hidden_state.cpu()

In [None]:
import matplotlib.pyplot as plt

plt.title("Padded W2V")
plt.imshow(speaker_w2v_10[0].T)
plt.show()