In [None]:
# Refresh the apt package lists, then install ffmpeg (used below to cut the
# recordings into clips); -y answers the confirmation prompt automatically.
!apt update
!apt -y install ffmpeg

In [None]:
!wget -c https://files.deeppavlov.ai/field-matters/releases/demo/asr_data.csv
!wget -c https://files.deeppavlov.ai/field-matters/releases/demo/sound.zip
!unzip sound.zip

In [None]:
import pandas as pd
import os
import re
from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
dataset_csv = 'asr_data.csv'  # your dataset here
df = pd.read_csv(dataset_csv)

In [None]:
# Build each recording's path as ./audio_to_release/<lang>/<source> and show the table.
lang_dirs = df["lang"].astype(str)
source_names = df["source"].astype(str)
df["fpath"] = './audio_to_release/' + lang_dirs + "/" + source_names
df

In [None]:
# Add a column with the path of every source recording:
# ./audio_to_release/<lang>/<source>
language_part = df["lang"].astype(str)
file_part = df["source"].astype(str)
df["fpath"] = './audio_to_release/' + language_part + "/" + file_part
# Removing spaces in the dataset's paths.
# Maps every original path to its space-free version so that a path shared by
# many rows is only checked/renamed on disk once.
cache = {}
def fix_path(path):
    """Rename the file at ``path`` so that its path contains no spaces.

    Every space in ``path`` is replaced with an underscore. If nothing exists
    at the new location yet and the original file is present, the file is
    moved there (missing parent directories of the new location are created).

    Args:
        path: Path of a file, possibly containing spaces.

    Returns:
        The space-free path. It is returned even when the source file is
        missing or the rename fails, so the caller always gets a string
        (never ``None``); the problem is reported with ``print``.
    """
    if path in cache:
        return cache[path]

    new_path = path.replace(' ', '_')
    cache[path] = new_path

    # An existing target means there is nothing to rename, or it was already
    # renamed by an earlier run.
    if not os.path.exists(new_path):
        if os.path.exists(path):
            try:
                # A directory name may have contained spaces too, in which
                # case the renamed directory does not exist yet.
                parent = os.path.dirname(new_path)
                if parent:
                    os.makedirs(parent, exist_ok=True)
                os.rename(path, new_path)
            except OSError as e:
                print(e)
        else:
            print(f"file not found: {path}")
    return new_path
df['fpath'] = df['fpath'].apply(fix_path)

# Add the row number as an "index" column (used later as the clip id).
# Guarded so that re-running the cell does not call reset_index() again:
# a repeated call adds a stray "level_0" column and then raises ValueError.
if 'index' not in df.columns:
    df = df.reset_index()

In [None]:
def replacer(path):
    """Return ``path`` with every space swapped for an underscore."""
    pieces = path.split(' ')
    return '_'.join(pieces)
df['fpath'] = df['fpath'].apply(replacer)
# The "index" id column is normally already present at this point; calling
# reset_index() again would add a redundant "level_0" column and raise
# ValueError when the cell is re-run, so only add it when it is missing.
if 'index' not in df.columns:
    df = df.reset_index()
df

In [None]:
# Directory that will receive the cut audio clips.
new_dir = './new_audio'
if not os.path.exists(new_dir):
    os.mkdir(new_dir)
else:
    print('folder already exists')

In [None]:
# Folder for the per-clip ffmpeg stderr logs written by cutter();
# -p makes the command succeed when the folder already exists.
!mkdir -p ffmpeg_log

def cutter(row):
    """Cut one clip out of a recording according to its timecodes.

    Runs ffmpeg on ``row["fpath"]`` from ``row["start"]`` to ``row["end"]``,
    resamples to 16000 Hz and writes ``./new_audio/<index>.wav``. ffmpeg's
    stderr goes to ``ffmpeg_log/<index>.log`` (the folder must already exist).
    An existing output file is kept as is (ffmpeg ``-n``).

    Args:
        row: Mapping/Series with the keys ``fpath``, ``start``, ``end``
            and ``index``.

    Returns:
        None.
    """
    import subprocess

    fpath, start, end, index = row["fpath"], row["start"], row["end"], row["index"]
    # Pass the arguments as a list instead of interpolating them into a shell
    # line: file names containing parentheses, quotes, '&', ... would otherwise
    # be interpreted by the shell and break (or alter) the command.
    command = [
        "ffmpeg", "-n",
        "-i", str(fpath),
        "-ss", str(start),
        "-to", str(end),
        "-ar", "16000",
        './new_audio/' + str(index) + ".wav",
    ]
    with open(f"ffmpeg_log/{index}.log", "w") as log_file:
        try:
            # stdin is detached so ffmpeg never waits for keyboard input.
            subprocess.run(command, stdin=subprocess.DEVNULL, stderr=log_file, check=False)
        except OSError as error:
            # e.g. ffmpeg is not installed: keep the best-effort behaviour and
            # record the problem in the clip's log instead of aborting the run.
            log_file.write(f"{error}\n")

    
# Cut one clip per dataset row; progress_apply (registered by tqdm.pandas())
# behaves like DataFrame.apply but shows a progress bar.
df.progress_apply(cutter, axis=1)

In [None]:
# Column with the path of the cut clip that belongs to each row.
def _clip_path(row_id):
    return './new_audio/' + str(row_id) + '.wav'

df['new_path'] = df['index'].apply(_clip_path)

# ASR

In [None]:
!apt update
!pip install transformers datasets phonemizer
!apt install espeak
!pip install pydub
!pip install transformers --upgrade
!pip install torchaudio
!pip install tqdm --upgrade
!pip install torchaudio --upgrade

In [None]:
from tqdm import auto

In [None]:
from transformers import __version__ as transformers_ver
from tqdm import __version__ as tqdm_ver
from torch import __version__ as torch_ver
from torchaudio import __version__ as torchaudio_ver
from pandas import __version__ as pd_ver
# Report the library versions this run uses, one "<name>:<TAB><version>" line each.
versions = {
    "transformers_ver": transformers_ver,
    "tqdm_ver": tqdm_ver,
    "torch_ver": torch_ver,
    "torchaudio_ver": torchaudio_ver,
    "pandas_ver": pd_ver,
}
for label, version in versions.items():
    print(f"{label}:\t{version}")

In [None]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Model and processor are loaded from the same pretrained checkpoint.
checkpoint = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
model = AutoModelForCTC.from_pretrained(checkpoint)
processor = Wav2Vec2Processor.from_pretrained(checkpoint)

In [None]:
import os; cuda_num = os.getenv("CUDA_VISIBLE_DEVICES")

!nvidia-smi -i {cuda_num}

In [None]:
import torch
import torchaudio
from tqdm.auto import tqdm
tqdm.pandas()
# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

model = model.to(device)

def recognizer(fpath):
    """Run the ASR model on one audio file and return the decoded string.

    Args:
        fpath: Path of the audio file, loaded with ``torchaudio.load``.

    Returns:
        The first decoded prediction for the loaded waveform, or ``None``
        when loading or recognition fails (the error is printed).
    """
    try:
        waveform, _ = torchaudio.load(fpath)
        waveform = waveform.to(device)
        # Inference only: no autograd graph is needed, which saves memory.
        with torch.no_grad():
            logits = model(waveform).logits
        pred_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(pred_ids)[0]
    except Exception as e:
        # Report the failure instead of hiding it, and return a missing value
        # rather than the integer 0 so the column stays usable with `.str`.
        print(f"{fpath}: {e}")
        return None

In [None]:
# Recognize every cut clip (with a progress bar) and store the result per row.
recognized_texts = df['new_path'].progress_apply(recognizer)
df['recognized'] = recognized_texts

In [None]:
# Clearing punctuation marks and spaces from the reference transcriptions.
# Missing transcriptions become '' first so the string methods do not fail.
# NOTE(review): 'Ø' is turned into a space *after* all spaces were removed,
# so it re-introduces spaces - confirm this is intended.
df['transcription'] = (
    df['transcription']
    .fillna('')
    .apply(lambda x: x.strip('.«,').replace('=', '').replace(' ', '').replace('Ø', ' '))
)
# Drop parenthesised fragments. Raw string: '\(' is a regex escape, not a
# (deprecated, invalid) Python string escape.
df['transcription'] = df['transcription'].apply(lambda x: re.sub(r'\(.+?\)', '', x))
# Literal replacement of spaces in the model output.
df['recognized'] = df['recognized'].str.replace(' ', '', regex=False)

In [None]:
# Replace transcriptions that ended up empty with the placeholder '-'.
df['transcription'] = df['transcription'].apply(lambda text: text or '-')

In [None]:
# Save the table with the recognition results.
df.to_csv(path_or_buf='asr.csv')

In [None]:
# Show ten random reference / recognized pairs.
df.loc[:, ["transcription", "recognized"]].sample(n=10)

# Evaluation

In [None]:
# abydos provides the `distance` module imported in the next cell.
!pip install abydos

In [None]:
from abydos import distance

In [None]:
# Distance object whose .dist() is used below to compare transcriptions with the ASR output.
phonetic = distance.PhoneticEditDistance()

In [None]:
# (Re)create the distance object used by phonetic_metric() so this cell is self-contained.
phonetic = distance.PhoneticEditDistance()
def phonetic_metric(row):
    """Return ``phonetic.dist`` between a row's reference and recognized text.

    Args:
        row: Mapping/Series with the keys ``transcription`` and ``recognized``.

    Returns:
        The distance value, or ``None`` if computing it raised an exception
        (the exception is printed).
    """
    try:
        measure = phonetic.dist
        reference = row['transcription']
        hypothesis = row['recognized']
        return measure(reference, hypothesis)
    except Exception as error:
        print(error)
        return None

In [None]:
# Score every row (with a progress bar) and keep the distance next to the texts.
phonetic_scores = df.progress_apply(phonetic_metric, axis=1)
df['phonetic_ev'] = phonetic_scores

In [None]:
# Show ten random rows together with their phonetic distance.
df.loc[:, ['transcription', 'recognized', 'phonetic_ev']].sample(n=10)

In [None]:
# Distribution of the phonetic distances over the whole dataset.
df['phonetic_ev'].plot.hist(bins=50)