## Install Requirements

In [1]:
!pip install datasets librosa jiwer transformers colorednoise pyctcdecode torchaudio pip install git+https://github.com/openai/whisper.git 

[0mCollecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-75y3y40b
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-75y3y40b
  Resolved https://github.com/openai/whisper.git to commit 9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d
  Preparing metadata (setup.py) ... [?25ldone
[0m

## Imports

In [2]:
import colorednoise as cn
import jiwer
import pandas as pd
import whisper
import numpy as np
import os
import torch
import soundfile as sf
import pickle
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

## Download Files

In [3]:
# Folder, under the current working directory, that holds the downloaded datasets.
datasets_path = os.path.join(os.getcwd(), 'datasets')
# exist_ok=True makes this a no-op when the folder is already present and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(datasets_path, exist_ok=True)
# Inference is pinned to the CPU; swap in the commented line to use a GPU when one is available.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [4]:
# Copy the speech-accent-archive folder from the GCS bucket into ./datasets/.
# Flags: -m runs the copy in parallel, -r recurses into the folder, and -n skips
# files that already exist locally (hence "Skipping existing item" on re-runs).
!gsutil -m cp -n -r gs://capstone_datasets/speech-accent-archive ./datasets/

Skipping existing item: file://./datasets/speech-accent-archive/reading-passage.txt
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans1.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans2.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans4.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/albanian7.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/albanian3.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/amharic11.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/albanian8.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/amharic1.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/agni1.wav
Skipping existing item: file://./datasets/speech-accent-archive/recordings/afrikaans3.wav
Skipping existing item: fil

## wav2vec 2.0 + 4-gram on accent dataset

In [5]:
# One checkpoint id shared by the processor and the CTC model, so the two cannot drift apart.
checkpoint = "patrickvonplaten/wav2vec2-base-960h-4-gram"

processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
# Move the weights to the device chosen in the configuration cell.
model = model.to(device)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at patrickvonplaten/wav2vec2-base-960h-4-gram and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Transcribe every recording in small batches using greedy (argmax) decoding of the CTC logits.
# NOTE(review): argmax decoding never consults a language model, so the checkpoint's
# 4-gram LM appears unused here — confirm whether Wav2Vec2ProcessorWithLM was intended.
recordings_dir = 'datasets/speech-accent-archive/recordings/'
files = os.listdir(recordings_dir)

hypotheses = []  # one list of transcriptions per batch
batch = 2  # recordings per forward pass
expected_rate = 16_000  # sampling rate declared to the processor below

# Stepping with range() means an empty directory produces no iterations, instead of
# sending an empty batch to the processor and dividing by zero in the progress line.
for i in range(0, len(files), batch):
    end = min(len(files), i + batch)
    audio_batch = []
    for file in files[i:end]:
        audio_input, file_rate = sf.read(os.path.join(recordings_dir, file))
        # The processor is told the audio is 16 kHz; fail loudly rather than
        # silently transcribe audio stored at a different rate.
        if file_rate != expected_rate:
            raise ValueError(f"{file}: expected {expected_rate} Hz audio, got {file_rate} Hz")
        audio_batch.append(audio_input)

    input_values = processor(audio_batch, sampling_rate=expected_rate, return_tensors="pt", padding=True).input_values

    with torch.no_grad():
        logits = model(input_values.to(device)).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # Release the logits before the next batch; empty_cache() only matters on a CUDA device.
    del logits
    torch.cuda.empty_cache()
    transcription = processor.batch_decode(predicted_ids)

    # Keep the per-batch nesting: the cell that flattens the results expects a list of lists.
    hypotheses.append(transcription)
    print('Progress:', 100 * end / len(files))

Progress: 0.09354536950420954
Progress: 0.18709073900841908
Progress: 0.2806361085126286
Progress: 0.37418147801683815
Progress: 0.4677268475210477
Progress: 0.5612722170252572
Progress: 0.6548175865294668
Progress: 0.7483629560336763
Progress: 0.8419083255378859
Progress: 0.9354536950420954
Progress: 1.028999064546305
Progress: 1.1225444340505144
Progress: 1.216089803554724
Progress: 1.3096351730589335
Progress: 1.4031805425631432
Progress: 1.4967259120673526
Progress: 1.5902712815715623
Progress: 1.6838166510757717
Progress: 1.7773620205799814
Progress: 1.8709073900841908
Progress: 1.9644527595884003
Progress: 2.05799812909261
Progress: 2.1515434985968196
Progress: 2.245088868101029
Progress: 2.3386342376052385
Progress: 2.432179607109448
Progress: 2.525724976613658
Progress: 2.619270346117867
Progress: 2.7128157156220767
Progress: 2.8063610851262863
Progress: 2.899906454630496
Progress: 2.9934518241347052
Progress: 3.086997193638915
Progress: 3.1805425631431246
Progress: 3.274087932

In [7]:
# Path of the on-disk pickle that caches the transcriptions.
pick_path = 'datasets/hyp4g.pkl'

# Serialise the per-batch transcription list and write the bytes to the pickle file.
with open(pick_path, 'wb') as pick_file:
    pick_file.write(pickle.dumps(hypotheses))

In [8]:
pick_path = 'datasets/hyp4g.pkl'
# Start from an empty list; the unpickled object becomes its only element,
# so the saved transcriptions are reached through hypotheses[0].
hypotheses = []

# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote itself.
with open(pick_path, 'rb') as pick_file:
    restored = pickle.loads(pick_file.read())
hypotheses.append(restored)

In [9]:
# Flatten the per-batch lists stored in hypotheses[0] into one transcription per recording.
# A comprehension keeps the loop variable local, so the global `batch` (the batch size
# set in the inference cell) is no longer overwritten with a list of strings.
predictions = [text for chunk in hypotheses[0] for text in chunk]

# The same reference text is used for every prediction.
ref_text='Please call Stella.  Ask her to bring these things with her from the store:  Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob.  We also need a small plastic snake and a big toy frog for the kids.  She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.'
references = [ref_text] * len(predictions)

In [10]:
# One row per recording: the model's transcription next to the reference text.
data = pd.DataFrame({'hypothesis': predictions, 'reference': references})
data

Unnamed: 0,hypothesis,reference
0,PLEASE CAL STELLA ASK HER TO BRING THESE THING...,Please call Stella. Ask her to bring these th...
1,PLEASE CALISTELLA ASK HER TO BRING THIS THING ...,Please call Stella. Ask her to bring these th...
2,PLEASE CALL STELLA ASK HER TO BRING THESE THIN...,Please call Stella. Ask her to bring these th...
3,PLEASE COURSE TELA ASK HER TO PURING THESE THI...,Please call Stella. Ask her to bring these th...
4,PLEASE CALL STELLA ASK HER TO BRING THESE THIN...,Please call Stella. Ask her to bring these th...
...,...,...
2133,POLISE CALLED STELDA ASK HER TO BRING THESE TH...,Please call Stella. Ask her to bring these th...
2134,PLEASE CALL STALA ASK HER TO BRING THESE THING...,Please call Stella. Ask her to bring these th...
2135,PLEASE CALL STEILER ASK HER ABOUT TO BRING THE...,Please call Stella. Ask her to bring these th...
2136,PLEASE CALL STELLA ASK HER TO BRING THESE THIN...,Please call Stella. Ask her to bring these th...


In [11]:
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()
# Run both text columns through the same normalizer so hypothesis and reference
# are compared in the same normalised form; results go into new "<column>_clean" columns.
for column in ("hypothesis", "reference"):
    data[column + "_clean"] = [normalizer(text) for text in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,PLEASE CAL STELLA ASK HER TO BRING THESE THING...,Please call Stella. Ask her to bring these th...,please cal stella ask her to bring these thing...,please call stella ask her to bring these thin...
1,PLEASE CALISTELLA ASK HER TO BRING THIS THING ...,Please call Stella. Ask her to bring these th...,please calistella ask her to bring this thing ...,please call stella ask her to bring these thin...
2,PLEASE CALL STELLA ASK HER TO BRING THESE THIN...,Please call Stella. Ask her to bring these th...,please call stella ask her to bring these thin...,please call stella ask her to bring these thin...
3,PLEASE COURSE TELA ASK HER TO PURING THESE THI...,Please call Stella. Ask her to bring these th...,please course tela ask her to puring these thi...,please call stella ask her to bring these thin...
4,PLEASE CALL STELLA ASK HER TO BRING THESE THIN...,Please call Stella. Ask her to bring these th...,please call stella ask her to bring these thin...,please call stella ask her to bring these thin...
...,...,...,...,...
2133,POLISE CALLED STELDA ASK HER TO BRING THESE TH...,Please call Stella. Ask her to bring these th...,polise called stelda ask her to bring these th...,please call stella ask her to bring these thin...
2134,PLEASE CALL STALA ASK HER TO BRING THESE THING...,Please call Stella. Ask her to bring these th...,please call stala ask her to bring these thing...,please call stella ask her to bring these thin...
2135,PLEASE CALL STEILER ASK HER ABOUT TO BRING THE...,Please call Stella. Ask her to bring these th...,please call steiler ask her about to bring the...,please call stella ask her to bring these thin...
2136,PLEASE CALL STELLA ASK HER TO BRING THESE THIN...,Please call Stella. Ask her to bring these th...,please call stella ask her to bring these thin...,please call stella ask her to bring these thin...


In [12]:
# Corpus-level word error rate over all rows; jiwer takes the references first, then the hypotheses.
reference_texts = data["reference_clean"].tolist()
hypothesis_texts = data["hypothesis_clean"].tolist()
wer = jiwer.wer(reference_texts, hypothesis_texts)

print(f"WER: {wer * 100:.2f} %")

WER: 21.55 %


## WER by Native Language

In [13]:
# List the recording file names again so they can be turned into language labels.
# NOTE(review): the labels are later joined to the WER rows purely by position, which
# assumes this listing returns the same files in the same order as the listing used
# for inference — confirm, especially when the hypotheses were loaded from the pickle.
files = os.listdir('datasets/speech-accent-archive/recordings/')

In [14]:
import re

# Derive a language label from each file name: keep the part before the first "."
# and remove every run of digits (e.g. 'afrikaans1.wav' -> 'afrikaans').
languages = [re.sub(r'[0-9]+', '', file.split(".")[0]) for file in files]

In [15]:
def _row_wer(row):
    """Return the word error rate of one row's normalised hypothesis against its reference."""
    return jiwer.wer(row["reference_clean"], row["hypothesis_clean"])


# Per-recording WER, stored as a new column.
data['wer'] = data.apply(_row_wer, axis=1)

In [16]:
# Language labels and WER rows are paired by position, so a length mismatch would
# silently misalign them or pad with NaN; stop early instead.
assert len(languages) == len(data), "number of recordings differs from number of WER rows"

# Select the WER column by name rather than dropping the first four columns by
# position, which only worked for one exact column order.
wers = pd.concat([pd.DataFrame(languages, columns=['language']), data[['wer']]], axis=1)
wers

Unnamed: 0,language,wer
0,english,0.057971
1,portuguese,0.260870
2,english,0.144928
3,mandarin,0.420290
4,swedish,0.115942
...,...,...
2133,turkish,0.333333
2134,english,0.086957
2135,urdu,0.217391
2136,english,0.057971


In [17]:
# Mean WER and number of recordings per language label, indexed by language.
# Selecting the 'wer' column explicitly yields flat 'mean'/'count' columns directly,
# so the result no longer depends on how as_index=False interacts with a list of
# aggregations or on dropping a MultiIndex level afterwards.
wers_country = wers.groupby('language')['wer'].agg(['mean', 'count'])
# Display only: show the language as a regular column without changing wers_country.
wers_country.reset_index()

Unnamed: 0,language,mean,count
0,afrikaans,0.113043,5
1,agni,0.420290,1
2,akan,0.405797,1
3,albanian,0.186795,9
4,amazigh,0.362319,2
...,...,...,...
195,yapese,0.159420,1
196,yiddish,0.249275,5
197,yoruba,0.289855,5
198,yupik,0.159420,1


In [18]:
# Languages with the most recordings first.
wers_country.sort_values('count', ascending=False)

Unnamed: 0_level_0,mean,count
language,Unnamed: 1_level_1,Unnamed: 2_level_1
english,0.094040,579
spanish,0.269637,162
arabic,0.269963,102
mandarin,0.325975,65
french,0.238785,63
...,...,...
nandi,0.362319,1
nama,0.318841,1
mortlockese,0.159420,1
moore,0.115942,1


In [19]:
# Lowest mean WER first.
wers_country.sort_values('mean', ascending=True)

Unnamed: 0_level_0,mean,count
language,Unnamed: 1_level_1,Unnamed: 2_level_1
frisian,0.028986,1
papiamentu,0.043478,2
tatar,0.057971,1
irish,0.057971,1
shona,0.057971,2
...,...,...
newari,0.753623,1
chittagonian,0.753623,1
bai,0.782609,1
sylheti,0.869565,1
