## GPU Configurations used 


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Tue May 25 13:35:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Loading the necessary libraries and datasets

Here we have saved the entire data in a zipped folder - "AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF.zip"
<br>
This zipped folder contains-
* Train and test csv
* A zipped archive of the audio clips for both the train and the test data, which needs to be unzipped

<br>
We will also create a folder "model_asr" where model checkpoints will be saved.

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!mkdir datasets
!unzip '/content/drive/MyDrive/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF.zip' -d datasets
!unzip '/content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF/clips.zip' -d '/content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF'

!mkdir model_asr

!pip install --upgrade torch
!pip install --upgrade datasets
!pip install --upgrade transformers
!pip install --upgrade torchaudio
!pip install --upgrade librosa
!pip install --upgrade jiwer
!pip install --upgrade audiomentations
!pip install --upgrade fuzzywuzzy


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF/clips/501ec5ade65f06ac7eeaa5e1843d054aaf2e2d010a422b50387c9bf5367cbc7b03131f392b3a4c129966470d58ab01a21c48b6c54d95c351835ee746e4697f5b.mp3  
  inflating: /content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF/clips/bc4f81694d79fc1dacd665a8cc95e82899f32c21ff57ddcce0049b1779fb523654f125555ed16b09a2712112039133803b6bfb89e47489d15ffaafbda4315465.mp3  
  inflating: /content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF/clips/890343656db76dc7134d640ef92fe53b2065c024d5f8f365044d120a72679158fc4a253a13ccc81a9f4df1c2685b169a7bc9416d5d3cbd08a53b025fd458e08b.mp3  
  inflating: /content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF/clips/373f142b40729362862b20fb360a05d6362c0ad4917a8059e36a4b8002b45a43fb5ccfac6582ef4984eb5a84e55728f01d16e1748776d39bc18dac4bf86dcb2c.m

In [None]:
import pandas as pd
import numpy as np
import re
import json
import random

from fuzzywuzzy import fuzz

import torch
import torchaudio
import librosa

from sklearn.model_selection import train_test_split

from datasets import Dataset, load_metric
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor


from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer




def seed_all(seed_value):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch's CPU
    generator. When CUDA is available it also seeds the CUDA generators and
    switches cuDNN to deterministic mode with benchmarking turned off.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # The remaining settings only apply when a GPU is present.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_all(13)

## Loading the train and test datasets.
<br> 
1. __Oversampling of low-frequency data points__ :- We oversample every transcription that occurs 8 times or fewer in the training data, drawing extra rows with replacement until each such transcription appears 9 times.
2. __Train-test split__ :- We will now use stratified train test split with train-test split being (90-10) on the oversampled data 

In [None]:
import numpy as np

# Seed NumPy's global generator: the `.sample(...)` calls below draw from it.
np.random.seed(13)

# Folder that holds Train.csv, Test.csv and the extracted `clips/` directory.
data_dir = '/content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF'

train_df               = pd.read_csv(f'{data_dir}/Train.csv')
train_df['audio_path'] = f'{data_dir}/clips/' + train_df['ID'] + '.mp3'

# Transcriptions that occur 8 times or fewer are oversampled up to 9 occurrences.
val_count    = train_df['transcription'].value_counts()
cols2sample  = val_count[val_count<=8].index.tolist()

# Collect the extra rows and concatenate once: `DataFrame.append` was removed in
# pandas 2.0, and growing a frame inside a loop copies it on every iteration.
oversampled_parts = [train_df]
for target_transcript in cols2sample:
  count = val_count[target_transcript]
  oversampled_parts.append(
      train_df[train_df['transcription']==target_transcript].sample(9-count,replace=True)
  )
train_df = pd.concat(oversampled_parts)


test_df                = pd.read_csv(f'{data_dir}/Test.csv')
test_df['audio_path']  = f'{data_dir}/clips/' + test_df['ID'] + '.mp3'

print(f'The shape of train data :- {train_df.shape} and shape of test data :- {test_df.shape}')
print(f'Columns in train data:- {train_df.columns.tolist()}')
print(f'Columns in test data:- {test_df.columns.tolist()}')

# Test.csv has no 'transcription' column: that is what the model has to predict.


# Only the clip path and its transcription are needed from here on.
train_df_sub = train_df[['audio_path','transcription']]

# Stratified hold-out split: every transcription is represented in both parts.
split = 0.1
train_data_audio,test_data_audio,train_data_target,test_data_target = train_test_split(train_df_sub['audio_path'],train_df_sub['transcription'],
                                                                                       random_state = 42,test_size=split,stratify=train_df_sub['transcription'])

train_data = pd.concat({'audio_path':train_data_audio,'transcription':train_data_target},axis=1)
train_data = train_data.reset_index(drop=True)

# `test_data` is the held-out validation part of the training set, not the competition test set.
test_data  = pd.concat({'audio_path':test_data_audio,'transcription':test_data_target},axis=1)
test_data  = test_data.reset_index(drop=True)

print(f'The shape of train data and validation data after train-test split of {int((1-split)*100)}%-{int(split*100)}%  :- {train_data.shape,test_data.shape}')

The shape of train data :- (7520, 7) and shape of test data :- (1564, 6)
Columns in train data:- ['ID', 'up_votes', 'down_votes', 'age', 'gender', 'transcription', 'audio_path']
Columns in test data:- ['ID', 'up_votes', 'down_votes', 'age', 'gender', 'audio_path']
The shape of train data and validation data after train-test split of 90%-10%  :- ((6768, 2), (752, 2))


In [None]:
# Wrap both pandas frames as `datasets.Dataset` objects so `.map(...)` can be applied to them.
train_data, test_data = [Dataset.from_pandas(frame) for frame in (train_data, test_data)]

## Creating and saving the Vocabulary
We will save the character based vocabulary based on train transcriptions as vocab.json file in the model_asr folder.
<br>
Important points:- 
1. We select some characters to remove and also convert entire batch of transcriptions to lowercase (to ensure uniformity) in both train and test dataset. 
2. We also remove other column names like Downvotes, Gender, upvotes  because they were not needed as of now for analysis (even though they can be used for data augmentation while training but to reduce the runtime and get reasonable results early, we dropped those columns)
3. We eventually saved the characters in vocab.json in model_asr folder where it will be used by the model we will be calling in later stages


In [None]:
# Characters stripped from every transcription. Written as a raw string: inside a
# character class only `-` needs escaping, and a non-raw '\,' / '\?' is an invalid
# string escape that newer Python versions warn about.
chars_to_ignore_regex = r'[,?.!\-;:"“%‘”)(�]'
def remove_special_characters(batch):
  '''
      Normalise the transcription of one example.

      Removes every character matched by `chars_to_ignore_regex`, lower-cases the
      result and appends a single trailing space.

      Args:
          batch: mapping with a string under the key "transcription".

      Returns:
          The same mapping, with "transcription" updated in place.
  '''
  batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
  return batch

# Normalise the transcriptions of both splits, one example at a time.
train_data, test_data = [
    split_ds.map(remove_special_characters) for split_ds in (train_data, test_data)
]

def extract_all_chars(batch):
  '''
      Collect the distinct characters of a batch of transcriptions.

      Args:
          batch: mapping whose "transcription" entry is a list of strings.

      Returns:
          Dict with "vocab" (a one-element list holding the list of distinct
          characters) and "all_text" (a one-element list holding all
          transcriptions joined with single spaces).
  '''
  joined_text = " ".join(batch["transcription"])
  return {"vocab": [list(set(joined_text))], "all_text": [joined_text]}

# batch_size=-1 hands the whole split to `extract_all_chars` in a single call,
# so each result holds one character list for the entire split.
vocab_map_kwargs = dict(batched=True, batch_size=-1, keep_in_memory=True)

vocab_train = train_data.map(extract_all_chars, remove_columns=train_data.column_names, **vocab_map_kwargs)
vocab_test  = test_data.map(extract_all_chars, remove_columns=test_data.column_names, **vocab_map_kwargs)

HBox(children=(FloatProgress(value=0.0, max=6768.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=752.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
## Character vocabulary: union of the characters seen in the training split and in
## the held-out validation split (`test_data`).
## Sorted so that every character gets the same id on every run; iterating a set of
## strings directly gives an order that can change between Python processes, which
## would make a regenerated vocab.json incompatible with earlier checkpoints.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}

# The tokenizer is created with word_delimiter_token="|", so the space is stored under "|".
vocab_dict["|"] = vocab_dict.pop(" ")

# Special tokens take the next free ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print('The total vocabulary present in train transcriptions is ',len(vocab_dict))

The total vocabulary present in train transcriptions is  39


In [None]:
# Persist the character -> id mapping; the tokenizer is later built from this file.
vocab_path = '/content/model_asr/vocab.json'
with open(vocab_path, 'w') as handle:
    handle.write(json.dumps(vocab_dict))

## Tokenizing and Feature Extraction
1. We use the vocabulary extracted in the previous step and feed it into the Wav2Vec2CTCTokenizer and also specify some basic tokens to be used while training 
2. The feature extraction steps will include the usage of feature_extractor with default values which enables us to get a float array of raw waveform of the clips. <br>
One important thing to note here is that we use a sampling rate of 16,000 Hz (16 kHz), because the pretrained model was trained on input waveforms sampled at 16 kHz only.
3. We eventually save our processor which includes our tokenizer (from point1 and feature extractor from point2) into the model_asr folder

In [None]:
# Character-level CTC tokenizer built from the vocabulary file written above.
tokenizer = Wav2Vec2CTCTokenizer(
    "/content/model_asr/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for mono 16 kHz waveforms: normalises the input and returns an attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# The processor bundles both objects; saving it lets the inference cells reload it from disk.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained("/content/model_asr")

### Here we convert the speech file(audio clips) to array 

In [None]:
def speech_file_to_array_fn(batch):
    """Load one audio clip and resample it to 16 kHz.

    Args:
        batch: a single example with the keys "audio_path" and "transcription".

    Returns:
        The same example with three added keys: "speech" (first channel of the
        clip, resampled to 16 kHz), "sampling_rate" (always 16_000) and
        "target_text" (copy of "transcription").
    """
    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
    # Resample from the rate the file was actually decoded at instead of assuming
    # 48 kHz. The rates are passed by keyword because newer librosa releases no
    # longer accept them positionally.
    batch["speech"] = librosa.resample(
        np.asarray(speech_array[0].numpy()), orig_sr=sampling_rate, target_sr=16_000
    )
    batch["sampling_rate"] = 16_000
    batch["target_text"] = batch["transcription"]
    return batch

# Decode and resample every clip; the original columns are dropped, so only the
# keys added by `speech_file_to_array_fn` remain.
train_data, test_data = [
    split_ds.map(speech_file_to_array_fn, remove_columns=split_ds.column_names)
    for split_ds in (train_data, test_data)
]

HBox(children=(FloatProgress(value=0.0, max=6768.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=752.0), HTML(value='')))




In [None]:
def prepare_dataset(batch):
    """Turn a batch of waveforms and texts into model inputs and label ids.

    Args:
        batch: batched examples with the keys "speech", "sampling_rate" and
            "target_text".

    Returns:
        The same batch with "input_values" (output of the processor on the
        waveforms) and "labels" (token ids of the target texts) added.
    """
    # Every example of the batch must share one sampling rate.
    distinct_rates = set(batch["sampling_rate"])
    assert (
        len(distinct_rates) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    audio_features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0])
    batch["input_values"] = audio_features.input_values

    # Inside this context the processor is applied to the text side.
    with processor.as_target_processor():
        encoded_text = processor(batch["target_text"])
    batch["labels"] = encoded_text.input_ids
    return batch

# Process 8 examples per call; afterwards only `input_values` and `labels` are kept.
prepare_map_kwargs = dict(batch_size=8, batched=True)

train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, **prepare_map_kwargs)
test_data = test_data.map(prepare_dataset, remove_columns=test_data.column_names, **prepare_map_kwargs)

HBox(children=(FloatProgress(value=0.0, max=846.0), HTML(value='')))

  return array(a, dtype, copy=False, order=order)





HBox(children=(FloatProgress(value=0.0, max=94.0), HTML(value='')))




## We create a Data Collator class here. 
The entire code is taken here - https://github.com/huggingface/transformers/blob/9a06b6b11bdfc42eea08fa91d0c737d1863c99e3/examples/research_projects/wav2vec2/run_asr.py#L81
<br>
The arguments corresponding to the class are explained in the docstring below. We tried not to experiment much with this class because a small tweak(like changing max_length) was giving really bizarre results and hence we took the class as it is without any changes from the repo specified above.
<br>
Also, we initialise the metric to be used for calculations which is Word Error Rate.



In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and the labels of a batch.

    Inputs and labels are padded separately, and padded label positions are set
    to -100 so that they are ignored by the loss.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of``, applied when padding the ``labels``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of PyTorch tensors.

        Args:
            features: examples that each provide "input_values" and "labels".

        Returns:
            The padded input batch with an added "labels" tensor in which
            padded positions hold -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # pad the label ids with the text side of the processor
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
# padding=True: each batch is padded to its own longest example.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
# Word Error Rate metric, used by `compute_metrics` during evaluation.
wer_metric = load_metric("wer")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1947.0, style=ProgressStyle(description…




In [None]:
def compute_metrics(pred):
    """Compute the word error rate of one evaluation pass.

    Args:
        pred: prediction object exposing `predictions` (logits) and
            `label_ids`. `label_ids` is modified in place: every -100 is
            replaced by the tokenizer's pad token id.

    Returns:
        Dict with the single key "wer".
    """
    # Most likely token at every position.
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # -100 is what the data collator writes into padded label positions; put the
    # pad token id back so the labels can be decoded.
    padded_positions = pred.label_ids == -100
    pred.label_ids[padded_positions] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(predicted_ids)
    # group_tokens=False: presumably so that repeated characters in the reference
    # text are not merged the way repeated CTC predictions are (TODO confirm).
    references = processor.batch_decode(pred.label_ids, group_tokens=False)

    return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

## Setting up model parameters

1. Model used is facebook's Wav2vec2-large-xlsr-53 which is a wav2vec model trained over 53 languages and we can finetune it to the language we need. 
2. The parameters selected in below steps are obtained by running WandB sweeps. With default parameters we were not getting good results but the sweeps helped us a lot in getting a lift in performance. 
3. The overall runtime of the trainer is about 12 hours on the GPU specified above and the checkpoints are saved in the folder model_asr as specified.

In [None]:
# Dropout and masking values taken from the WandB sweeps.
regularisation_kwargs = dict(
    attention_dropout=0.015715766711072065,
    feat_proj_dropout=0.07570439532163029,
    activation_dropout=0.09145432252955588,
    hidden_dropout=0.0006515376406130203,
    mask_time_prob=0.05353409500178331,
    layerdrop=0.018085056635857365,
)

# Pretrained multilingual checkpoint; the CTC head is sized to our character vocabulary.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    **regularisation_kwargs,
)

# Freeze the feature extractor part of the model before fine-tuning.
model.freeze_feature_extractor()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1451.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1261920069.0, style=ProgressStyle(descr…




Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Training configuration; checkpoints are written to /content/model_asr.
training_args = TrainingArguments(
  output_dir="/content/model_asr",
  
  group_by_length=True,
  # 4 examples per device x 2 accumulation steps = 8 examples per optimizer step per device.
  per_device_train_batch_size=4,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=80,
  dataloader_num_workers= 32, 
  # The best checkpoint is the one with the lowest WER (greater_is_better=False).
  load_best_model_at_end=True,
  metric_for_best_model='wer',
  greater_is_better=False,
  fp16=True,
  seed=13,
  # NOTE(review): checkpoints are saved every 100 steps but evaluated only every 500,
  # so most checkpoints have no WER of their own. Newer transformers releases appear
  # to require save_steps to be a multiple of eval_steps when load_best_model_at_end
  # is set; confirm against the installed version.
  save_steps=100,
  eval_steps=500,
  logging_steps=500,
  learning_rate=0.000095637994662983496,
  lr_scheduler_type = 'cosine_with_restarts',
  warmup_steps=500,
  save_total_limit=1,
)

In [None]:
from transformers import Trainer

# `eval_dataset` is the 10% hold-out split of the training data, not the competition test set.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
    # NOTE(review): the feature extractor is passed as `tokenizer`, presumably so its
    # configuration is saved together with each checkpoint (TODO confirm).
    tokenizer=processor.feature_extractor,
)

In [None]:
# Fine-tune the model; the run recorded below took about 42,780 s (roughly 12 hours) for 80 epochs.
trainer.train()

  cpuset_checked))


Step,Training Loss,Validation Loss,Wer
500,8.6047,2.917415,1.0
1000,1.7568,0.628265,0.8398
1500,0.6302,0.284027,0.599917
2000,0.3882,0.200102,0.516062
2500,0.2838,0.141981,0.44514
3000,0.2237,0.109221,0.415102
3500,0.1818,0.083903,0.394243
4000,0.1467,0.078651,0.394243
4500,0.1289,0.081152,0.390071
5000,0.1281,0.066647,0.37839


  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_checked))
  cpuset_che

TrainOutput(global_step=67680, training_loss=0.1125287169321984, metrics={'train_runtime': 42779.5062, 'train_samples_per_second': 1.582, 'total_flos': 1.306027282581504e+17, 'epoch': 80.0, 'init_mem_cpu_alloc_delta': 1495904256, 'init_mem_gpu_alloc_delta': 1261915136, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 607985664, 'train_mem_gpu_alloc_delta': 3782020608, 'train_mem_cpu_peaked_delta': 142192640, 'train_mem_gpu_peaked_delta': 1349468672})

## Extracting the relevant checkpoint to be used on the test dataset
We extract the checkpoint having least eval-WER from the checkpoints saved in the folder. The model and tokenizer are initialised with the model and tokenizer saved in the checkpoint and usual steps to preprocess are applied again as we did for train data to get the results needed.

In [None]:
import os

# Evaluation interval used in TrainingArguments; only needed as a fallback below.
eval_steps = 500

# Log entries that carry an evaluation WER, in the order they were recorded.
eval_entries = [entry for entry in trainer.state.log_history if 'eval_wer' in entry]
eval_wer = [entry['eval_wer'] for entry in eval_entries]
# Prefer the step stored with each log entry; fall back to "i-th evaluation
# happened at eval_steps * (i + 1)" when an entry carries no step.
steps_taken = [entry.get('step', eval_steps * (i + 1)) for i, entry in enumerate(eval_entries)]

# global step -> WER measured at that step
eval_dict = dict(zip(steps_taken, eval_wer))


checkpoint_root = '/content/model_asr'
files = os.listdir(checkpoint_root)
checkpoints_available = [name for name in files if 'checkpoint' in name]

# Pick the checkpoint with the lowest recorded WER.
least_loss = float('inf')
chkpt2consider = None
for chkpts in checkpoints_available:
  step_match = re.search(r'\d+', chkpts)
  if step_match is None:
    # A name containing 'checkpoint' but no step number cannot be matched to a WER.
    continue
  number = int(step_match.group())
  if number not in eval_dict:
    # Checkpoints are saved more often than the model is evaluated, so a
    # checkpoint may have no WER of its own; skip it instead of raising KeyError.
    continue
  if eval_dict[number] < least_loss:
    least_loss = eval_dict[number]
    chkpt2consider = checkpoint_root + '/' + chkpts

if chkpt2consider is None:
  raise RuntimeError(
      f'No checkpoint in {checkpoint_root} matches an evaluated step; '
      f'available: {checkpoints_available}'
  )

In [None]:
chkpt2consider

'/content/model_asr/checkpoint-54000'

In [None]:
# The weights come from the selected checkpoint and are moved to the GPU.
model = Wav2Vec2ForCTC.from_pretrained(chkpt2consider).to("cuda")
# The processor was saved to this folder with `processor.save_pretrained` before training.
processor = Wav2Vec2Processor.from_pretrained("/content/model_asr")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Only the clip path is needed for inference on the competition test set.
test_df = test_df[['audio_path']]
test_data  = Dataset.from_pandas(test_df)


def speech_file_to_array_fn(batch):
    """Load one test clip and resample it to 16 kHz.

    Args:
        batch: a single example with the key "audio_path".

    Returns:
        The same example with "speech" (first channel of the clip, resampled to
        16 kHz) and "sampling_rate" (always 16_000) added.
    """
    speech_array, sampling_rate = torchaudio.load(batch["audio_path"])
    # Resample from the rate the file was actually decoded at instead of assuming
    # 48 kHz. The rates are passed by keyword because newer librosa releases no
    # longer accept them positionally.
    batch["speech"] = librosa.resample(
        np.asarray(speech_array[0].numpy()), orig_sr=sampling_rate, target_sr=16_000
    )
    batch["sampling_rate"] = 16_000
    return batch

test_data = test_data.map(speech_file_to_array_fn, remove_columns=test_data.column_names)


HBox(children=(FloatProgress(value=0.0, max=1564.0), HTML(value='')))




In [None]:
def evaluate(batch):
    """Transcribe a batch of 16 kHz waveforms with the fine-tuned model.

    Args:
        batch: batched examples with the key "speech".

    Returns:
        The same batch with "pred_strings", the decoded greedy (argmax)
        prediction for every clip, added.
    """
    encoded = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    input_values = encoded.input_values.to("cuda")
    attention_mask = encoded.attention_mask.to("cuda")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(input_values, attention_mask=attention_mask)

    predicted_ids = torch.argmax(model_output.logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(predicted_ids)
    return batch


In [None]:
# Transcribe the test clips 8 at a time; `evaluate` adds the `pred_strings` column.
result = test_data.map(evaluate, batched=True, batch_size=8)


HBox(children=(FloatProgress(value=0.0, max=196.0), HTML(value='')))




## Saving the transcriptions and post processing (part 1)
In the first step of post processing, we remove special tokens from the transcriptions as well as any extra trailing space in the text attained.
<br>
Also, there was a datapoint which was empty (no transcription possible due to blank audio). In order to accommodate that, we make an adjustment based on sequence length of transcription attained (else the submissions were throwing error if '' is used instead of ' '). 

In [None]:
# Attach the predicted strings to the rows of Test.csv (the same order the clips were read in).
output = result["pred_strings"]
outputdf = pd.read_csv(
    '/content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF/Test.csv'
).assign(transcription=output)

In [None]:
# Take an explicit copy: assigning into a slice of `outputdf` is what triggers
# pandas' SettingWithCopyWarning.
subdf = outputdf[['ID','transcription']].copy()

# Remove the literal "[PAD]" token, then strip surrounding whitespace.
# regex=False matches the text "[PAD]" exactly and does not depend on the default
# of `regex`, which differs between pandas versions.
subdf['transcription'] = (
    subdf['transcription']
    .str.replace('[PAD]', '', regex=False)
    .str.strip()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# `assign` returns a new frame, so nothing is written into a slice
# (this avoids the SettingWithCopyWarning).
subdf = subdf.assign(length=subdf['transcription'].str.len())
# Empty predictions become a single space (an empty string is rejected by the
# submission system); all other rows are left unchanged.
subdf = subdf.assign(transcription=subdf['transcription'].where(subdf['length'] > 0, " "))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Post Processing (part 2) 
1. We define a function to match the transcriptions attained in previous step with the transcriptions available in train data. <br>
Reason :- If we consider the problem statement, we can see that the geographical locations are finite (approx 700 unique transcriptions in train data corresponding to 700 locations in Senegal). Hence, any minute error in translation due to accent or gender can be rectified by finding the closest match with the available location in train data .
<br>
For example:- "Africatel avnes" attained from model can be mapped to "Africatel AVS" and on doing so, the WER is minimized even further.
2. We set a threshold of 45% on the match score: any predicted transcription whose best match among the training transcriptions scores at least 45% is replaced by that match; otherwise the prediction is kept unchanged.

In [None]:
def sim_text(text):
  """Find the training transcription that is most similar to `text`.

  Every entry of `transcription_list_lower` is scored against `text` with
  `fuzz.ratio`.

  Args:
      text: predicted transcription.

  Returns:
      Tuple of (best matching lower-cased transcription, its score). On equal
      scores the entry that comes first in `transcription_list_lower` wins.
  """
  scores = {candidate: fuzz.ratio(text, candidate) for candidate in transcription_list_lower}
  best_candidate = max(scores, key=scores.get)
  return best_candidate, scores[best_candidate]

In [None]:
# Reload Train.csv to get the original transcriptions (no oversampling, original casing).
train_df = pd.read_csv('/content/datasets/AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF/Train.csv')
# Work on a copy so `subdf` keeps the raw model predictions.
outputdf = subdf.copy()

In [None]:
# Distinct training transcriptions in their original casing.
transcription_list = train_df['transcription'].unique().tolist()
# Lower-cased versions: these are what the (lower-case) predictions are matched against.
transcription_list_lower = [original.lower() for original in transcription_list]
# lower-cased text -> original text, used to restore the casing after matching.
corresponding_dict = {original.lower(): original for original in transcription_list}

In [None]:
# Best training transcription and its score for every prediction, as (text, score) tuples.
outputdf['results'] = outputdf['transcription'].apply(sim_text)
# Split the tuples into two separate columns.
outputdf['match_results'] = [best_text for best_text, _ in outputdf['results']]
outputdf['match_percent'] = [score for _, score in outputdf['results']]


In [None]:
# Restore the original casing of the matched text and drop the helper tuple column.
outputdf = (
    outputdf
    .assign(match_results=outputdf['match_results'].map(corresponding_dict))
    .drop(columns=['results'])
)

In [None]:
# Keep the matched training transcription when the score is at least 45;
# otherwise keep the model's own prediction.
accept_match = outputdf['match_percent'] >= 45
outputdf['final'] = outputdf['match_results'].where(accept_match, outputdf['transcription'])

# Submission file: one row per test clip with its final transcription.
correcteddf = pd.DataFrame({
    'ID': outputdf['ID'].values,
    'transcription': outputdf['final'].values,
})
correcteddf.to_csv('last_model_trained_fuzzyratio.csv',index=False)

In [None]:
outputdf

Unnamed: 0,ID,transcription,length,match_results,match_percent,final
0,00416cff4f818d3dfd99c9178ff0e268e7575500c8baa5...,africatel avs,13,Africatel AVS,100,Africatel AVS
1,00891ba561e80e135f9d12b9fa1347f0a2560998f7ea16...,nan laay def ngir dem tally bou bess,36,nan laay def ngir dem Tally bou Bess,100,nan laay def ngir dem Tally bou Bess
2,00a508027ed4edf0bd3db79f45f4ed6e1b89fba6482c10...,africatel avs,13,Africatel AVS,100,Africatel AVS
3,00ac13cd0d93e35c1ff672cc106ad94d1ea9b93fcf049a...,mosquée de cambérène,20,Mosquée de Cambérène,100,Mosquée de Cambérène
4,00c2d5baf4719bf01b990a8924e99bda043cd462147193...,cité safco tivaoune peulh,25,Cité Safco Tivaoune Peulh,100,Cité Safco Tivaoune Peulh
...,...,...,...,...,...,...
1559,ff0da457e7a3986035995912803e42261c5f5f448c126b...,bank of africa pikine,21,Bank of africa Pikine,100,Bank of africa Pikine
1560,ff1808218a15fa576c405314e4de4bda56c44f849ff1b5...,tigo almadies,13,Tigo Almadies,100,Tigo Almadies
1561,ff5b9a45d60600e875e0a031b1d7076c9cbdeb1c48c09c...,gouy gui grand mbao,19,Gouy Gui Grand Mbao,100,Gouy Gui Grand Mbao
1562,ff98e108ec61d3bd485734b83f21be77820549dab1cac1...,pharmacie rokhaya ouakam,24,Pharmacie Rokhaya Ouakam,100,Pharmacie Rokhaya Ouakam
