# Installing the speech-recognition dependencies

The commands below install the Python packages and system tools needed to run the wav2vec2 phoneme-recognition model used in this notebook and to evaluate its transcription results.

In [None]:
import pandas as pd
import os
import re
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
!apt update
!pip install transformers datasets phonemizer
!apt install espeak
!pip install pydub
!pip install transformers --upgrade
!pip install torchaudio
!pip install tqdm --upgrade
!pip install torchaudio --upgrade

[33m0% [Working][0m            Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
[33m0% [Connecting to archive.ubuntu.com] [Connecting to cloud.r-project.org (108.1[0m[33m0% [1 InRelease gpgv 88.7 kB] [Connecting to archive.ubuntu.com] [Connecting to[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
[33m0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Wait[0m                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
[33m0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Wait[0m                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
[33m                                                         

In [None]:
# Register the jonathonf/ffmpeg-4 PPA (-y skips the confirmation prompt),
# refresh the package index so the PPA's packages are visible, then install ffmpeg.
# NOTE(review): a later cell installs ffmpeg=4.3 through conda as well — one of
# the two installs is probably redundant; confirm which one torchaudio needs.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

In [None]:
from transformers import __version__ as transformers_ver
from tqdm import __version__ as tqdm_ver
from torch import __version__ as torch_ver
from torchaudio import __version__ as torchaudio_ver
from pandas import __version__ as pd_ver

# Report the installed versions, one "label:<TAB>version" line per package,
# in the same order as the imports above.
installed_versions = {
    "transformers_ver": transformers_ver,
    "tqdm_ver": tqdm_ver,
    "torch_ver": torch_ver,
    "torchaudio_ver": torchaudio_ver,
    "pandas_ver": pd_ver,
}
for label, version in installed_versions.items():
    print(f"{label}:\t{version}")

In [None]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Processor for the phoneme-recognition checkpoint; the model for the same
# checkpoint is loaded in the next cell.
processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
)

In [None]:
# Load the CTC model from the same checkpoint name the processor was loaded from.
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")

In [None]:
import zipfile
from google.colab import drive

# Mount Google Drive so the dataset archive can be read from this runtime.
drive.mount('/content/drive/', force_remount=True)

# Unpack the dataset. The context manager closes the archive even when
# extraction raises, which the explicit close() call did not guarantee.
with zipfile.ZipFile("/content/drive/My Drive/tat.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/asr_tat")

In [None]:
import pandas as pd

# Metadata table that ships inside the extracted archive.
csv_path = '/content/asr_tat/tat/asr_tat.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Turn each clip's file name into its full path inside the extracted archive.
audio_root = '/content/asr_tat/tat/'
df['fpath'] = df['file_name'].map(lambda name: audio_root + name)
df['fpath']

In [None]:
# Inspect the frame, which now carries the `fpath` column added in the previous cell.
df

In [None]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Bootstrap a conda installation inside the Colab runtime via condacolab.
# NOTE(review): condacolab's README states that install() restarts the kernel.
# If so, `model`, `processor`, `df` and `device` from the cells above are lost
# and must be re-created before the cells below run — confirm, and consider
# moving this cell to the very top of the notebook.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
!conda install ffmpeg=4.3 -c conda-forge

In [None]:
import torch
import torchaudio
from tqdm.auto import tqdm

# Register `progress_apply` on pandas objects (used by the cells below).
tqdm.pandas()

# Run inference on the GPU when one is available, otherwise on the CPU,
# and move the model's weights to that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def recognizer(fpath):
    """Transcribe one audio file with the module-level CTC model (greedy decoding).

    The clip is reduced to its first channel, resampled to the rate the
    processor's feature extractor is configured for, and passed through the
    processor (which applies whatever input normalisation the checkpoint was
    configured with) before being fed to the model.

    Args:
        fpath: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded string for the clip, or the integer ``0`` when anything
        fails. ``0`` is kept as the failure sentinel for existing callers.
    """
    try:
        waveform, sample_rate = torchaudio.load(fpath)  # (channels, frames)
        # Only the first channel's transcription was ever returned, so select
        # it up front instead of running every channel through the model.
        waveform = waveform[0]

        # The model is only meaningful at the sampling rate it was trained on.
        target_rate = processor.feature_extractor.sampling_rate
        if sample_rate != target_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

        inputs = processor(waveform.numpy(), sampling_rate=target_rate, return_tensors="pt")

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            logits = model(inputs.input_values.to(device)).logits

        pred_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(pred_ids)[0]
    except Exception as exc:
        # Best-effort by design: report the failure instead of hiding it, then
        # return the sentinel so one bad file does not abort the whole run.
        print(f"recognizer failed for {fpath!r}: {exc}")
        return 0

In [None]:
# Transcribe every file listed in `fpath`, showing a progress bar.
# Rows for which `recognizer` failed receive its sentinel value 0.
df['recognized'] = df['fpath'].progress_apply(recognizer)

In [None]:
# Peek at the first five rows, including the new `recognized` column.
df.head(5)

In [None]:
# Install abydos, which supplies the phonetic edit distance used for scoring below.
!pip install abydos

import abydos

from abydos import distance

# Distance object used by the scoring cells that follow.
phonetic = distance.PhoneticEditDistance()

In [None]:
# Re-creates the same distance object as the previous cell, so this cell can be
# re-run on its own as long as `distance` has been imported.
phonetic = distance.PhoneticEditDistance()
def phonetic_metric(row):
    """Score one row: distance between its `transcription` and `recognized` values.

    Uses the module-level `phonetic` distance object. If the computation
    raises, the exception is printed and None is returned.
    """
    try:
        measure = phonetic.dist
        score = measure(row['transcription'], row['recognized'])
    except Exception as err:
        print(err)
        return None
    return score

In [None]:
# Score every row (axis=1 passes whole rows) with a progress bar.
# Rows whose scoring raised end up with a missing value.
df['phonetic_ev'] = df.progress_apply(phonetic_metric, axis=1)

In [None]:
# Spot-check ten random rows. No random_state is set, so the selection changes on every run.
df[['transcription', 'recognized', 'phonetic_ev']].sample(10)

In [None]:
# Histogram of the per-row `phonetic_ev` scores, using 20 bins.
df.phonetic_ev.plot.hist(bins=20)

In [None]:
# Side-by-side view of `transcription` (presumably the reference text — confirm)
# and the model output in `recognized`.
df[['transcription', 'recognized']]

In [None]:
# Reference (lexicon-based beam-search decoding):
# https://github.com/flashlight/flashlight/tree/main/flashlight/app/asr#beam-search-decoder

In [None]:
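# Note: `uselexicon=true` is the Flashlight decoder flag that enables lexicon-based decoding; it is not valid Python.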

In [None]:
# Persist the frame (without the index column) next to the extracted dataset.
df.to_csv('/content/asr_tat/tat/asr_tat_post_model.csv', index=False)

In [None]:
def recognizer_adv(fpath):
    """Transcribe one audio file; intended home of a lexicon-aware decoder.

    The previous version passed ``uselexicon=true`` to ``batch_decode``.
    ``true`` is not a Python name, so every call raised ``NameError``, the bare
    ``except`` swallowed it, and every row came back as ``0``. ``uselexicon``
    is a Flashlight decoder flag and does not appear to be an option of the
    processor's ``batch_decode``, so it is dropped here and the function falls
    back to plain greedy (argmax) decoding.

    TODO: plug in an actual lexicon/beam-search decoder if that comparison is
    still wanted; until then this produces greedy transcriptions.

    Args:
        fpath: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded string for the clip, or the integer ``0`` when anything
        fails. ``0`` is kept as the failure sentinel for existing callers.
    """
    try:
        waveform, sample_rate = torchaudio.load(fpath)  # (channels, frames)
        # Only the first channel's transcription was ever returned, so select
        # it up front instead of running every channel through the model.
        waveform = waveform[0]

        # The model is only meaningful at the sampling rate it was trained on.
        target_rate = processor.feature_extractor.sampling_rate
        if sample_rate != target_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

        inputs = processor(waveform.numpy(), sampling_rate=target_rate, return_tensors="pt")

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            logits = model(inputs.input_values.to(device)).logits

        pred_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(pred_ids)[0]
    except Exception as exc:
        # Best-effort by design: report the failure instead of hiding it, then
        # return the sentinel so one bad file does not abort the whole run.
        print(f"recognizer_adv failed for {fpath!r}: {exc}")
        return 0

In [None]:
# Transcribe every file again with `recognizer_adv`, showing a progress bar.
# Rows for which it failed receive its sentinel value 0.
df['recognized_adv'] = df['fpath'].progress_apply(recognizer_adv)

In [None]:
def phonetic_metric_adv(row):
    """Score one row: distance between its `transcription` and `recognized_adv` values.

    Uses the module-level `phonetic` distance object. If the computation
    raises, the exception is printed and None is returned.
    """
    try:
        measure = phonetic.dist
        score = measure(row['transcription'], row['recognized_adv'])
    except Exception as err:
        print(err)
        return None
    return score

In [None]:
# Score every row (axis=1 passes whole rows) against `recognized_adv`.
# Rows whose scoring raised end up with a missing value.
df['phonetic_ev_adv'] = df.progress_apply(phonetic_metric_adv, axis=1)

In [None]:
# Spot-check ten random rows. No random_state is set, so the selection changes on every run.
df[['transcription', 'recognized_adv', 'phonetic_ev_adv']].sample(10)

In [None]:
# Histogram of the per-row `phonetic_ev_adv` scores, using 20 bins.
df.phonetic_ev_adv.plot.hist(bins=20)

In [None]:
# Write to the same path as the earlier save, overwriting that file; the frame
# now also holds the `recognized_adv` and `phonetic_ev_adv` columns.
df.to_csv('/content/asr_tat/tat/asr_tat_post_model.csv', index=False)