## Solution Summary
We started off by fine-tuning Facebook's wav2vec2-xlsr-53 on the Wolof data for our acoustic model (AM) with the help of Hugging Face. After looking at both the training transcriptions and the test predictions generated by the AM, we noticed a very large overlap in sentences. This prompted us to create a "sentence autocorrect" algorithm utilising Levenshtein distance, which was subsequently used to post-process the AM predictions.

Expected notebook runtime (training on an RTX 2070 Super): 8-9 hrs

## Imports and environment
We use `Anaconda 4.10.1` as our Python distribution. Please use the included `enviroment.yml` file to ensure all the correct packages and their respective versions are installed.

In [None]:
from pathlib import Path
import os
import warnings
import random

import re
import json
import pandas as pd
from datasets import ClassLabel, Dataset, DatasetDict
import librosa
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union



import torch
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2Processor

## Data preparation
- Load in Train.csv as pandas dataframes
- Remove bad transcriptions from training data
- Remove unnecessary columns
- Create path dictionary: `{ID:Path_to_recording}`
- Add audio file paths
- Convert both dataframes to a DatasetDict object (makes working with Hugging Face easier)
- Preprocess training transcriptions (remove some special tokens that are hard for the AM to learn)
- Create char vocab dict and corresponding JSON file for the Wav2Vec2Processor

In [None]:
# Training metadata (clip IDs, transcriptions, votes and speaker info).
wolof_train = pd.read_csv('Train.csv')
# The test metadata is loaded here as well: the data-preparation cells below
# attach audio paths to `wolof_test`, which would otherwise be undefined when
# the notebook is run from top to bottom.
wolof_test = pd.read_csv('Test.csv')

In [None]:
# Keep only the rows that received no down votes (removes bad transcriptions).
has_no_down_votes = wolof_train["down_votes"] == 0
wolof_train = wolof_train[has_no_down_votes]

In [None]:
# Vote counts and speaker metadata are not used by the rest of the notebook.
unused_columns = ["up_votes", "down_votes", "age", "gender"]
wolof_train = wolof_train.drop(columns=unused_columns)

In [None]:
# Create dict of ID's and file paths: every .mp3 found under `path` is keyed by
# its file name up to the first "." (the clip ID).
paths = {}
path = 'clips/'
for root, _dirs, file_names in os.walk(path):
    for file_name in file_names:
        if file_name.endswith(".mp3"):
            # Named `clip_id` rather than `id` so the builtin `id` is not
            # rebound for the rest of the notebook session.
            clip_id = file_name.split('.')[0]
            paths[clip_id] = os.path.join(root, file_name)

In [None]:
# Look up the audio path of every clip by its ID, then drop the ID column from
# both frames now that it has served as the lookup key.
wolof_train['file'] = [paths[clip_id] for clip_id in wolof_train['ID']]
wolof_test['file'] = [paths[clip_id] for clip_id in wolof_test['ID']]
wolof_test = wolof_test.drop(columns=["ID"])
wolof_train = wolof_train.drop(columns=["ID"])

In [None]:
def show_random_elements(dataset, num_examples=10):
    """Render distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Sized container that can be indexed with a list of integer
            positions (``dataset[picks]``); the result is passed to
            ``pd.DataFrame``.
        num_examples: Number of rows to display; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If more rows are requested than the dataset holds.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # Imported locally because nothing else in the notebook brings `HTML`
    # (or `display`) into scope.
    from IPython.display import HTML, display

    # random.sample draws without replacement, so no retry loop is needed to
    # keep the picks distinct.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
# The first 6000 rows are used for training; the remaining rows are stored
# under the 'test' key and serve as the evaluation split.
training_rows = wolof_train[:6000]
held_out_rows = wolof_train[6000:]
wolof = DatasetDict({
    'train': Dataset.from_pandas(training_rows),
    'test': Dataset.from_pandas(held_out_rows),
})

In [None]:
# Wrap the test frame in a DatasetDict with a single 'test' split.
test_split = Dataset.from_pandas(wolof_test)
wolof_test = DatasetDict({'test': test_split})

In [None]:
# Punctuation stripped from every transcription. Written as a raw string so
# the backslashes reach `re` untouched; in a normal string literal "\?", "\."
# etc. are invalid escape sequences and trigger a warning on recent Pythons.
chars_to_ignore_regex = r'[\"\?\.\!\-\;\:\(\)\,]'


def remove_special_characters(batch):
    """Normalise the transcription of one example.

    Removes the characters matched by `chars_to_ignore_regex`, lower-cases
    the text and appends a single trailing space.

    Args:
        batch: Mapping with a string under the "transcription" key. It is
            modified in place.

    Returns:
        The same `batch` object with its "transcription" replaced.
    """
    batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
    return batch

In [None]:
# Run the transcription clean-up over the dataset and keep the result.
wolof = wolof.map(remove_special_characters)

In [None]:
# Eyeball 20 random training rows to check the cleaned transcriptions.
show_random_elements(wolof['train'], num_examples=20)

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used in a batch of transcriptions.

    Args:
        batch: Mapping whose "transcription" entry is a sequence of strings.

    Returns:
        A dict with two single-element lists: "vocab" holds the list of
        distinct characters (in no particular order) and "all_text" holds the
        transcriptions joined with single spaces.
    """
    joined_text = " ".join(batch["transcription"])
    distinct_chars = list(set(joined_text))

    summary = {}
    summary["vocab"] = [distinct_chars]
    summary["all_text"] = [joined_text]
    return summary

In [None]:
# Run `extract_all_chars` in batched mode and keep only its output columns:
# every original column (taken from the 'train' split) is removed.
vocabs = wolof.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=wolof.column_names["train"],
)

In [None]:
# Distinct characters of the training split. Sorted so that every character
# receives the same token id on every run; iterating a set of strings directly
# gives an order that can differ from one Python session to the next.
vocab_list = sorted(set(vocabs["train"]["vocab"][0]))

In [None]:
# Each character's token id is its position in `vocab_list`.
vocab_dict = dict(zip(vocab_list, range(len(vocab_list))))
vocab_dict

In [None]:
# Re-key the space character as "|" (the word delimiter token the tokenizer is
# later configured with), keeping its id.
vocab_dict["|"] = vocab_dict.pop(" ")

In [None]:
# Append the unknown and padding tokens, each taking the next free id.
for special_token in ("[UNK]", "[PAD]"):
    vocab_dict[special_token] = len(vocab_dict)
len(vocab_dict)

In [None]:
# Persist the vocabulary as JSON; the tokenizer is built from this file.
serialized_vocab = json.dumps(vocab_dict)
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(serialized_vocab)

## Initializing for wav2vec2
- Create Wav2Vec2CTCTokenizer using the recently created vocab JSON file
- Create Wav2Vec2FeatureExtractor
- Read in audio files and add them to the DatasetDict
- Process inputs and targets 
- Define Data Collator
- Load and define WER metric
- Load the base wav2vec2-large-xlsr-53 model and freeze the feature extractor

In [None]:
# Character-level CTC tokenizer built from the vocabulary file written above.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    word_delimiter_token="|",
    unk_token="[UNK]",
    pad_token="[PAD]",
)

In [None]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor for single-channel audio at 16 kHz, with normalisation
# enabled and attention masks returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

In [None]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor and tokenizer into a single processor object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
import warnings

import librosa

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")


def speech_file_to_array_fn(batch):
    """Load one example's audio and stage its transcription as the target.

    Args:
        batch: Mapping with the audio path under "file" and the reference text
            under "transcription". It is modified in place.

    Returns:
        The same `batch` with three added keys: "speech" (the waveform loaded
        at 16 kHz and cast to float16), "sampling_rate" and "target_text".
    """
    waveform, rate = librosa.load(batch["file"], sr=16000)
    half_precision_waveform = waveform.astype('float16')

    batch["speech"] = half_precision_waveform
    batch["sampling_rate"] = rate
    batch["target_text"] = batch["transcription"]
    return batch

In [None]:
# Load the audio for every example; the original columns (taken from the
# 'train' split) are removed so only the keys added by the function remain.
wolof = wolof.map(speech_file_to_array_fn, remove_columns=wolof.column_names["train"], num_proc=1)

In [None]:
def prepare_dataset(batch):
    """Turn a batch of waveforms and target texts into model inputs and labels.

    Args:
        batch: Batched mapping with "speech", "sampling_rate" and
            "target_text" lists. It is modified in place.

    Returns:
        The same `batch` with "input_values" (from the processor applied to
        the audio) and "labels" (token ids of the target text) added.

    Raises:
        AssertionError: If the batch does not contain exactly one distinct
            sampling rate.
    """
    distinct_rates = set(batch["sampling_rate"])
    # check that all files have the correct sampling rate
    assert (
        len(distinct_rates) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch_rate = batch["sampling_rate"][0]
    audio_features = processor(batch["speech"], sampling_rate=batch_rate)
    batch["input_values"] = audio_features.input_values

    # Inside this context the processor encodes text instead of audio.
    with processor.as_target_processor():
        encoded_targets = processor(batch["target_text"])
    batch["labels"] = encoded_targets.input_ids
    return batch

In [None]:
# Build model inputs and labels in batches of 8; the columns present before
# the call (taken from the 'train' split) are removed afterwards.
wolof_prepared = wolof.map(prepare_dataset, remove_columns=wolof.column_names["train"], batch_size=8, batched=True)

In [None]:
# Inspect 20 random rows of the prepared training split.
show_random_elements(wolof_prepared['train'], num_examples=20)

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of examples into one padded batch for CTC training.

    Audio inputs and label ids are padded separately, since they have
    different lengths and need different padding methods. Padded label
    positions are set to -100 so that they are ignored by the loss.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data; its ``pad`` method does the padding.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad`` for both inputs and labels:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
              single sequence is provided).
            * :obj:`'max_length'`: Pad to the length given by :obj:`max_length` (or to the maximum acceptable
              input length for the model if that argument is not provided).
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (the batch can then contain sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, the ``input_values`` are padded to a multiple of this value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set, the ``labels`` are padded to a multiple of this value.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate the audio inputs from the label ids: the two are padded
        # independently of each other.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Inside this context the processor pads with its tokenizer side.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Every position whose attention mask is not 1 is padding; mark it
        # with -100 so the loss ignores it.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
# Collator instance that pads each batch dynamically (padding=True).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
from datasets import load_metric
# Word error rate metric, used both during training and for the final report.
wer_metric = load_metric("wer")

In [None]:
import numpy as np


def compute_metrics(pred):
    """Compute the word error rate for a set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.
            ``label_ids`` is modified in place: entries equal to -100 are
            replaced by the tokenizer's pad token id so they can be decoded.

    Returns:
        A dict with the single key "wer".
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    ignored_positions = pred.label_ids == -100
    pred.label_ids[ignored_positions] = processor.tokenizer.pad_token_id

    decoded_predictions = processor.batch_decode(predicted_ids)
    # we do not want to group tokens when computing the metrics
    decoded_references = processor.batch_decode(pred.label_ids, group_tokens=False)

    return {
        "wer": wer_metric.compute(
            predictions=decoded_predictions,
            references=decoded_references,
        )
    }

In [None]:
# Base model
from transformers import Wav2Vec2ForCTC

# The output vocabulary size and padding id are taken from the tokenizer
# built earlier in the notebook.
ctc_tokenizer = processor.tokenizer
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    vocab_size=len(ctc_tokenizer),
    pad_token_id=ctc_tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
    gradient_checkpointing=True,
)

In [None]:
# Freeze the model's feature extractor before fine-tuning.
model.freeze_feature_extractor()

In [None]:
# Sanity check: the cells below move the model and inputs to "cuda".
torch.cuda.is_available()

## Train model
- Initiate training args
- Define trainer and train model
- Save AM model and processor for later

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./wav2vec2-wolof",
    # Batching and precision
    group_by_length=True,
    per_device_train_batch_size=3,
    fp16=True,
    # Schedule and optimisation
    num_train_epochs=25,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    # Evaluate, log and checkpoint every 500 steps, keeping at most 2 checkpoints
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_steps=500,
    save_total_limit=2,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    # The CTC collator built earlier must be passed explicitly; otherwise the
    # Trainer falls back to a default collator and the separate input/label
    # padding (with -100 on padded label positions) is never applied.
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=wolof_prepared["train"],
    eval_dataset=wolof_prepared["test"],
    tokenizer=processor.feature_extractor,
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

In [None]:
# Save the fine-tuned model and the processor so the prediction sections can
# reload them without retraining.
model.save_pretrained('wav2vec-wolof-model')
processor.save_pretrained('wav2vec-wolof-processor')

## Make validation set predictions
You don't need to load in the model or processor if this notebook is being run start to finish, but we leave the code just in case.

In [None]:
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor

# Reload the fine-tuned model onto the GPU, together with its processor.
model = Wav2Vec2ForCTC.from_pretrained('./wav2vec-wolof-model')
model = model.to('cuda')
processor = Wav2Vec2Processor.from_pretrained("./wav2vec-wolof-processor")

In [None]:
def map_to_result(batch):
    """Transcribe one example with the acoustic model.

    Args:
        batch: Mapping with the waveform under "speech" and its
            "sampling_rate". It is modified in place.

    Returns:
        The same `batch` with the decoded prediction stored under "pred_str".
    """
    model.to("cuda")
    encoded = processor(
        batch["speech"],
        sampling_rate=batch["sampling_rate"],
        return_tensors="pt",
    )
    input_values = encoded.input_values.to("cuda")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits
        # Greedy decoding: take the highest-scoring token at every frame.
        best_ids = logits.argmax(dim=-1)
        decoded = processor.batch_decode(best_ids)

    batch["pred_str"] = decoded[0]
    return batch

In [None]:
# Transcribe every example of the held-out ('test') split.
results = wolof["test"].map(map_to_result)

In [None]:
# Word error rate of the predictions against the reference transcriptions.
held_out_wer = wer_metric.compute(
    predictions=results["pred_str"],
    references=results["target_text"],
)
print("Test WER: {:.3f}".format(held_out_wer))

In [None]:
# Leave the audio columns out of the preview so only the text columns are shown.
text_only_results = results.remove_columns(["speech", "sampling_rate"])
show_random_elements(text_only_results)

# Make Test set AM model predictions
You don't need to load in the model or processor if this notebook is being run start to finish, but we leave the code just in case.
- Load in Test.csv as pandas dataframe
- Do same data prep as before
- Map the test predictions
- Save AM model predictions as a csv

In [None]:
# Reload from the directories written by the `save_pretrained` calls after
# training. The paths used here before ('./wav2vec-wolof-trained-model-large'
# and './wav2vec-wolof-trained-processor-large') are never created anywhere in
# this notebook, so loading from them fails on a start-to-finish run.
model = Wav2Vec2ForCTC.from_pretrained('./wav2vec-wolof-model').to('cuda')
processor = Wav2Vec2Processor.from_pretrained("./wav2vec-wolof-processor")

In [None]:
# Load the test metadata and drop the vote / speaker columns.
wolof_test = pd.read_csv('Test.csv')
test_metadata_columns = ["up_votes", "down_votes", "age", "gender"]
wolof_test = wolof_test.drop(columns=test_metadata_columns)

In [None]:
# Look up the audio path of every test clip by its ID.
wolof_test['file'] = [paths[clip_id] for clip_id in wolof_test['ID']]

In [None]:
# From here on `wolof` holds only the competition test set.
competition_test_split = Dataset.from_pandas(wolof_test)
wolof = DatasetDict({'test': competition_test_split})

In [None]:
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")


def speech_file_to_array_fn(batch):
    """Load one test example's audio (no target text is set here).

    Args:
        batch: Mapping with the audio path under "file". It is modified in
            place.

    Returns:
        The same `batch` with "speech" (the waveform loaded at 16 kHz and cast
        to float16) and "sampling_rate" added.
    """
    waveform, rate = librosa.load(batch["file"], sr=16000)
    half_precision_waveform = waveform.astype('float16')

    batch["speech"] = half_precision_waveform
    batch["sampling_rate"] = rate
    return batch

In [None]:
# Load the audio for every test example; existing columns are kept.
wolof = wolof.map(speech_file_to_array_fn, num_proc=1)

In [None]:
def map_to_result(batch):
    """Transcribe one example with the acoustic model.

    Args:
        batch: Mapping with the waveform under "speech" and its
            "sampling_rate". It is modified in place.

    Returns:
        The same `batch` with the decoded prediction stored under "pred_str".
    """
    model.to("cuda")
    encoded = processor(
        batch["speech"],
        sampling_rate=batch["sampling_rate"],
        return_tensors="pt",
    )
    input_values = encoded.input_values.to("cuda")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits
        # Greedy decoding: take the highest-scoring token at every frame.
        best_ids = logits.argmax(dim=-1)
        decoded = processor.batch_decode(best_ids)

    batch["pred_str"] = decoded[0]
    return batch

In [None]:
# Transcribe every clip of the competition test set.
results = wolof["test"].map(map_to_result)

In [None]:
# Preview random predictions, leaving out every column except the prediction text.
prediction_preview = results.remove_columns(["speech", "sampling_rate", "file", "ID"])
show_random_elements(prediction_preview)

In [None]:
# Submission table: everything in `results` except the audio and path columns.
sub = results.remove_columns(["speech", "sampling_rate", "file"])

In [None]:
# NOTE(review): the return value is not assigned, so `sub` is not rebound here.
# Later cells still read the predictions as `pred_str` (the manually appended
# row and the autocorrect step), so presumably the column keeps that name --
# confirm before assigning the result or removing this line.
sub.rename_column("pred_str", "Transcription")

In [None]:
# Convert to a pandas DataFrame for the manual row fix-up and CSV export below.
sub_df = sub.to_pandas()

In [None]:
# This ID wasn't in our Test.csv so we had to add it manually. It was a blank audio file anyway.
# pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0;
# the result is the same: one extra row at the end with a fresh 0..n-1 index.
missing_row = pd.DataFrame([{"ID":"e3a74a8998f03c320f5a4923272247485832b1cd803528f5eb5a50aef3d29a78b436b3ea37c47763e9b9be8b3ee53435b51d3466345217ce5d6fcb9b48a53c63",  "pred_str":" "}])
sub_df = pd.concat([sub_df, missing_row], ignore_index=True)

In [None]:
# Write the raw acoustic-model predictions; the autocorrect section reads this file back.
sub_df.to_csv("AM_model_preds.csv", index=False)

## Sentence-level autocorrect
- Load in Train.csv and extract unique sentences
- Define autocorrect function
- Get corrected sentences
- Update and save prediction csv

In [None]:
import editdistance
import numpy as np
import pandas as pd

In [None]:
train_df = pd.read_csv('Train.csv')

# Unique training sentences (lower-cased, surrounding whitespace stripped) that
# predictions are matched against. Sorted so the candidate order -- and with it
# the sentence that wins when several are equally close in `autocorrect` -- is
# the same on every run; plain set order can change between Python sessions.
sens = train_df.transcription.values
sens = sorted({str(s).lower().strip() for s in sens})
len(sens)

def autocorrect(preds, dist_cutoff=10):
    """Snap each prediction to its closest known training sentence.

    Every prediction is stripped and lower-cased, then compared by edit
    distance against each entry of the module-level `sens` list. When the
    smallest distance is strictly below `dist_cutoff`, the prediction is
    replaced by the first sentence at that distance; otherwise the normalised
    prediction is kept.

    Args:
        preds: Iterable of predicted transcriptions. Missing values (None or
            NaN) are treated as empty predictions.
        dist_cutoff: Exclusive upper bound on the edit distance for a
            replacement to happen.

    Returns:
        A list with one string per prediction, in input order.
    """
    new_p = []
    for p in preds:
        # Empty CSV cells are read back by pandas as NaN; str(NaN) would turn
        # a blank prediction into the literal text "nan".
        p = "" if pd.isna(p) else str(p).strip().lower()

        if not sens:
            # No reference sentences to compare against: keep the prediction.
            new_p.append(p)
            continue

        dists = [editdistance.distance(p, u) for u in sens]
        min_dist = min(dists)

        if min_dist < dist_cutoff:
            # index() returns the first position of the minimum, so ties go to
            # the earliest sentence in `sens`.
            new_p.append(sens[dists.index(min_dist)])
        else:
            new_p.append(p)
    return new_p

In [None]:
# Reload the raw acoustic-model predictions written earlier in the notebook.
df = pd.read_csv('AM_model_preds.csv')

In [None]:
# Replace predictions that are fewer than 10 edits away from a training sentence.
new_preds = autocorrect(df.pred_str.values, dist_cutoff = 10)

In [None]:
# Overwrite the raw predictions with their autocorrected versions.
df['pred_str'] = new_preds

In [None]:
# Save the final, autocorrected predictions.
df.to_csv("final_preds.csv", index=False)