In [None]:
# !apt update
# !apt -y install ffmpeg

# Run the two commands above from a terminal with sudo (it will ask for the password)

In [None]:
import os
import re
import subprocess

import pandas as pd
# from tqdm.auto import tqdm
from tqdm.autonotebook import tqdm as notebook_tqdm

# Registers `progress_apply` on pandas objects (used when cutting the audio).
notebook_tqdm.pandas()

In [None]:
# Working directory of the kernel; the project root is derived from it.
cwd = os.getcwd()
cwd

In [None]:
# Project root: the working directory without the notebooks sub-folder.
root_path = cwd.removesuffix('notebooks/jupyter_notebooks')

# The rest of the notebook builds paths by plain concatenation
# (root_path + 'raw_data/...'), so the root has to end with a separator even
# when the kernel was not started from notebooks/jupyter_notebooks and the
# suffix above was therefore not stripped.
if not root_path.endswith('/'):
    root_path += '/'

root_path

In [None]:
# Location of the annotation table; point this at your own dataset if needed.
asr_csv = root_path + 'raw_data/training_data/asr.csv'
df = pd.read_csv(asr_csv)
df.head(n=20)

In [None]:
# Distinct language codes found in the `lang` column.
list_lang = [code for code in df['lang'].unique()]
list_lang

The 'xas' language has around 75,000 entries, so we leave it out of the processing.

In [None]:
# Language codes handled by this notebook.
list_lang = [
    'bak',
    'evn',
    'mhr',
    'tat',
    'sah',
]

In [None]:

# Directory that receives the cut files, with one sub-folder per language code.
new_dir = root_path + 'processed_data'

# `list_lang` is the hand-picked list defined in an earlier cell. It is NOT
# re-derived from df['lang'] here, because that would bring back the language
# that was deliberately excluded and create a folder for it.
output_dirs = [new_dir] + [os.path.join(new_dir, str(code)) for code in list_lang]

for folder in output_dirs:
    if os.path.exists(folder):
        print('folder already exists')
    else:
        # makedirs (unlike mkdir) also creates missing parent directories
        os.makedirs(folder)

In [None]:
# Full path of the source recording for every row.
training_dir = root_path + 'raw_data/training_data/'
df["fpath"] = training_dir + df["source"].astype(str)
df

In [None]:
def replacer(path):
    """Return `path` with every space replaced by an underscore."""
    return path.replace(' ', '_')


df['fpath'] = df['fpath'].apply(replacer)

# Expose the row number as an `index` column (used as the id of each cut file).
# Guarded so that re-running this cell does not pile up `level_0` columns or
# raise on the third run.
if 'index' not in df.columns:
    df = df.reset_index()

In [None]:
# Folder collecting one ffmpeg log per processed row. Created from Python
# rather than through the shell, so a root path containing spaces or shell
# metacharacters is handled correctly; parents are created as needed.
os.makedirs(os.path.join(root_path, 'processed_data', 'ffmpeg_log'), exist_ok=True)

def cutter(row): # cutting files according to timecodes
    """Convert one annotated segment to a 16 kHz mp3 with ffmpeg.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``fpath`` (source audio), ``start`` / ``end`` (timecodes,
        may be missing), ``index`` (used as the output file name), ``lang``
        (output sub-folder) and ``transcription``.

    Behaviour
    ---------
    * Rows without a transcription are skipped.
    * If ``start`` or ``end`` is missing, the whole file is converted;
      otherwise only the ``start``..``end`` span is kept.
    * The output is ``<root_path>/processed_data/<lang>/<index>.mp3`` and
      ffmpeg's stderr goes to ``<root_path>/processed_data/ffmpeg_log/<index>.log``.
    * ffmpeg is started with ``-n``, i.e. an existing output file is not
      overwritten.

    Returns ``None``. A failing ffmpeg run does not raise (its message ends up
    in the log file); a missing ``ffmpeg`` executable raises ``FileNotFoundError``.
    """
    if pd.isna(row["transcription"]):
        return None

    start, end, index = row["start"], row["end"], row["index"]

    processed_dir = os.path.join(root_path, 'processed_data')
    out_path = os.path.join(processed_dir, str(row["lang"]), f'{index}.mp3')
    log_path = os.path.join(processed_dir, 'ffmpeg_log', f'{index}.log')

    # The command is passed as an argument list (no shell), so file names with
    # spaces, quotes or other shell metacharacters are handed to ffmpeg verbatim.
    command = ['ffmpeg', '-n', '-i', str(row["fpath"])]
    if not (pd.isna(start) or pd.isna(end)):
        command += ['-ss', str(start), '-to', str(end)]
    command += ['-ar', '16000', out_path]

    with open(log_path, 'w') as log_file:
        # stdin is detached so ffmpeg never waits for interactive input.
        subprocess.run(command, stdin=subprocess.DEVNULL, stderr=log_file, check=False)

    return None

            
def clean_transcription(text):
    """Normalise one transcription string.

    Strips leading/trailing '.', '«' and ',' characters, removes '=', turns 'Ø'
    into a space, deletes parenthesised remarks and returns '-' if nothing is left.
    """
    text = text.strip('.«,').replace('=', '').replace('Ø', ' ')
    # raw string: '\(' in a normal string literal is an invalid escape sequence
    text = re.sub(r'\(.+?\)', '', text)
    return text if text else '-'


# Cut the audio and write the CSV files for each language listed here.
for lang in ['sah']:

    df_l = df[df['lang'] == lang]

    # Cut / convert the audio of every row (`cutter` skips rows without transcription).
    df_l.progress_apply(cutter, axis=1)

    # Explicit copy: the columns added below must not be written to a slice of `df`.
    df_l = df_l.dropna(subset=['transcription']).copy()

    # File name of the cut audio produced for each row
    df_l['file_name'] = df_l['index'].astype(str) + '.mp3'

    # Cleaned transcriptions (empty results are replaced by '-')
    df_l['transcription_clean'] = df_l['transcription'].apply(clean_transcription)

    # Full table for this language, including the cleaned transcriptions
    df_l.to_csv(f'{root_path}/processed_data/{lang}/asr_{lang}.csv', index=False)

    # The metadata file only keeps the file name and the transcription.
    # NOTE(review): this writes the raw `transcription`, not `transcription_clean`
    # — confirm that this is intended.
    df_l = df_l[['file_name', 'transcription']]
    df_l.to_csv(f'{root_path}/processed_data/{lang}/metadata.csv', index=False)

Analysis of the Evenki transcripts: there are 4 cases where the speaker uses only Russian, so the transcription is set to NaN.

In [None]:
# Evenki rows of the dataset
df_evn = df[df['lang'] == 'evn']

# True where the transcription cannot be parsed as a number (missing values included)
num_mask = pd.to_numeric(df_evn['transcription'], errors='coerce').isnull()

# A bare expression in the middle of a cell is evaluated and thrown away,
# so the selection is printed explicitly.
print(df_evn.loc[num_mask, 'transcription'])

# Rows whose transcription is missing
print(df_evn.transcription[df_evn.transcription.isna()])

Analysis of the Bashkir transcripts: in some cases the start and end of the recording are NaN. These are very short audio files that do not need to be cut, so during preprocessing they are not trimmed; the whole file is converted as it is.

In [None]:
# Bashkir rows, and those among them that have no start timecode
df_bak = df.loc[df['lang'] == 'bak']
bak_start_missing = df_bak['start'].isna()
print(df_bak.loc[bak_start_missing, 'start'])

In [None]:
# True where the transcription cannot be parsed as a number (missing values included)
num_mask = pd.to_numeric(df_bak['transcription'], errors='coerce').isnull()

# A bare expression in the middle of a cell is evaluated and thrown away,
# so the selection is printed explicitly.
print(df_bak.loc[num_mask, 'transcription'])

# Rows whose transcription is missing
print(df_bak.transcription[df_bak.transcription.isna()])

Some analysis for Tatar language

In [None]:
# Tatar rows, and those among them that have no start timecode
df_tat = df.loc[df['lang'] == 'tat']
tat_start_missing = df_tat['start'].isna()
print(df_tat.loc[tat_start_missing, 'start'])

In [None]:
# True where the transcription cannot be parsed as a number (missing values included)
num_mask = pd.to_numeric(df_tat['transcription'], errors='coerce').isnull()

# A bare expression in the middle of a cell is evaluated and thrown away,
# so the selection is printed explicitly.
print(df_tat.loc[num_mask, 'transcription'])

# Rows whose transcription is missing
print(df_tat.transcription[df_tat.transcription.isna()])

After this audio cutting procedure, all audio files have also been resampled to a sampling rate of 16 kHz.