In [1]:
# import os
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import json
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
import torchaudio
# import random
# import IPython.display as ipd
# import numpy as np


In [2]:
DATASET_PATH = r'/kaggle/input/imasc/'

## Dataset functions 

In [3]:
def get_full_dataset():
    dataset = pd.read_csv(DATASET_PATH+'metadata.csv', sep='|')
    dataset.loc[len(dataset.index)]=[dataset.columns[0],dataset.columns[1],dataset.columns[2]]
    dataset.rename(columns={dataset.columns[0]:'speaker',dataset.columns[1]:'file',dataset.columns[2]:'transcript'},inplace=True)
    return dataset

In [4]:
chars_to_ignore_regex = "[\,\?\.\!\-\;\:\"\'\“\%\‘\”\_]"
# unicode_ignore_regex = r'[\u200e]'
def remove_special_chars(batch):
    batch['transcript'] = batch['transcript'].str.strip()
    batch['transcript'] = batch['transcript'].replace(chars_to_ignore_regex,'', regex=True)
    return batch
    

In [5]:
get_wav_path = lambda speaker,file : "%s%s/wavs/%s.wav"%(DATASET_PATH, speaker, file)

In [6]:
dataset = get_full_dataset()

In [7]:
dataset_splits = train_test_split(dataset,test_size=0.2, train_size=0.8, random_state = 17, shuffle=True)

In [8]:
dataset=remove_special_chars(dataset)

In [9]:
def extract_all_chars(batch):
  all_text = " ".join(batch['transcript'])
  vocab = set(all_text)
  return vocab


### Uncomment the next 4 segments on the first run 

In [10]:
# char_map = {v:k for k,v in enumerate(extract_all_chars(dataset))}
# # print(char_map)

In [11]:
# char_map["|"] = char_map[" "]
# del char_map[" "]
# print(char_map)

In [12]:
# char_map["[UNK]"] = len(char_map)
# char_map["[PAD]"] = len(char_map)
# len(char_map)

In [13]:
# with open('chars.json','w') as chars_file:
#     json.dump(char_map, chars_file)

In [12]:
with open('/kaggle/input/icfoss-model/chars.json') as chars_file:
    chars_map= json.load(chars_file)
chars_map

{'ത': 0,
 'ഹ': 2,
 'ഊ': 3,
 'ച': 4,
 'ല': 5,
 'ഫ': 6,
 'ഗ': 7,
 'ബ': 8,
 'ക': 9,
 '്': 10,
 'ദ': 11,
 'സ': 12,
 'ഇ': 13,
 'ഓ': 14,
 'ഋ': 15,
 'ണ': 16,
 '\u200c': 17,
 'ം': 18,
 'ഏ': 19,
 'ഒ': 20,
 'ഉ': 21,
 'ൈ': 22,
 'ട': 23,
 'േ': 24,
 'ഞ': 25,
 'ൂ': 26,
 'െ': 27,
 'അ': 28,
 'ഃ': 29,
 'ഥ': 30,
 'വ': 31,
 '\u200d': 32,
 'ര': 33,
 'ോ': 34,
 'മ': 35,
 'ള': 36,
 'ൃ': 37,
 'ഖ': 38,
 'ന': 39,
 'ജ': 40,
 'പ': 41,
 'ഢ': 42,
 'ഡ': 43,
 'ഷ': 44,
 'ങ': 45,
 'ധ': 46,
 'റ': 47,
 'ൌ': 48,
 'എ': 49,
 'ാ': 50,
 'ൊ': 51,
 'ഐ': 52,
 'ൗ': 53,
 'ഈ': 54,
 'ഛ': 55,
 'ഭ': 56,
 'ഔ': 57,
 'ീ': 58,
 'ു': 59,
 'ഘ': 60,
 'ഠ': 61,
 'ഴ': 62,
 'ൻ': 63,
 'ൺ': 64,
 'ൽ': 65,
 'യ': 66,
 'ശ': 67,
 'ർ': 68,
 'ൾ': 69,
 'ആ': 70,
 'ഝ': 71,
 'ി': 72,
 '|': 1,
 '[UNK]': 73,
 '[PAD]': 74}

### Uncomment next 5 snippets on first run

In [14]:
# tokenizer = Wav2Vec2CTCTokenizer("./chars.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# tokenizer

In [15]:
# feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
# feature_extractor

In [10]:
processor = Wav2Vec2Processor.from_pretrained('/kaggle/input/icfoss-model')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [22]:
# processor.save_pretrained('model')

In [11]:
# train_data = dataset_splits[0]
test_data = dataset_splits[1]
# del dataset_splits

In [15]:
def speech_file_to_array_fn(batch):
    speech_array, _= torchaudio.load(get_wav_path(batch['speaker'],batch['file']))
    batch["speech"] = speech_array[0].numpy()
    return batch


In [13]:
def prep_frame(dataframe):
    dataframe=dataframe.apply(lambda row:speech_file_to_array_fn(row),axis=1)
    dataframe.drop(['speaker','file'],axis=1,inplace = True)
    dataframe.rename(columns={'transcript':'target_text'},inplace = True);
    dataframe.reset_index(drop=True,inplace = True)
    return dataframe
    

In [18]:
train_data=prep_frame(train_data)


In [20]:
import gc
gc.collect()

23

In [14]:
test_data=prep_frame(test_data)

In [28]:
rand_int = random.randint(0, len(test_data)-1)
ipd.Audio(data=np.asarray(test_data.loc[rand_int]["speech"]), autoplay=False, rate=16000)


In [21]:
sr=16000;
def prepare_dataset(frame):
    frame["input_values"] = processor(frame["speech"], sampling_rate=sr).input_values[0]
    with processor.as_target_processor():
        frame["labels"] = processor(frame["target_text"]).input_ids
    return frame


In [22]:
train_data = train_data.apply(lambda row: prepare_dataset(row),axis=1)


In [23]:
train_data

Unnamed: 0,target_text,speech,input_values,labels
0,സമസ്ത അധ്യക്ഷൻ ഫഖ്റുദ്ദീൻ തങ്ങൾ സമാപന പ്രസംഗം ...,"[0.0055236816, 0.005432129, 0.005432129, 0.005...","[0.06451574, 0.06344566, 0.06344566, 0.0634456...","[12, 35, 12, 10, 0, 1, 28, 46, 10, 66, 9, 10, ..."
1,പട്ടണത്തിൽ ഒരു ജനറൽ സ്റ്റോറും തപാലോഫീസും പ്രവർ...,"[0.0012512207, 0.0014953613, 0.0016784668, 0.0...","[0.013414312, 0.016056163, 0.018037552, 0.0177...","[41, 23, 10, 23, 16, 0, 10, 0, 72, 65, 1, 20, ..."
2,പാർട്ടിയിൽ ചേർന്നതിനു പിന്നാലെ ഉയർത്തിയ കലാപക്...,"[-3.0517578e-05, -3.0517578e-05, 0.0, 0.0, 0.0...","[-0.00031907205, -0.00031907205, 0.0002029179,...","[41, 50, 68, 23, 10, 23, 72, 66, 72, 65, 1, 4,..."
3,അതിനുമുൻപ് ഇതൊരു ദ്രാവിഡീയക്ഷേത്രവും പിന്നീട് ...,"[-0.00012207031, -0.00024414062, -0.0002746582...","[-0.00325805, -0.0063397484, -0.007110173, -0....","[28, 0, 72, 39, 59, 35, 59, 63, 41, 10, 1, 13,..."
4,ആദ്യത്തെ ലെഫ്റ്റനന്റ് ഗവർണറായി ജോണിനെ നിയമിക്ക...,"[-0.000579834, -0.0010070801, -0.0009460449, -...","[-0.008921764, -0.015551624, -0.0146045005, -0...","[70, 11, 10, 66, 0, 10, 0, 27, 1, 5, 27, 6, 10..."
5,ഔദ്യോഗിക വെബ്സൈറ്റിലും ഇതുസംബന്ധിച്ച് അറിയിപ്പ...,"[0.008392334, 0.008392334, 0.008300781, 0.0082...","[0.09528789, 0.09528789, 0.09424668, 0.0932054...","[57, 11, 10, 66, 34, 7, 72, 9, 1, 31, 27, 8, 1..."
6,"വരുംദിവസങ്ങളിൽ ജൈവകർഷകസദസ്സ്, ഫോക് മെഗാഷോ എന്ന...","[0.0004272461, 0.00076293945, 0.0006713867, 0....","[0.011046019, 0.019643787, 0.01729894, 0.01886...","[31, 33, 59, 18, 11, 72, 31, 12, 45, 10, 45, 3..."
7,കലാപം കാരണം അഭയാർത്ഥികൾ തെക്കോട്ട് നീങ്ങുകയുമു...,"[0.0002746582, 0.0008239746, 0.00021362305, -0...","[0.0030928494, 0.009364241, 0.0023960283, -0.0...","[9, 5, 50, 41, 18, 1, 9, 50, 33, 16, 18, 1, 28..."
8,ചരിത്രം അത് പാലിയം എന്ന കുടുംബവുമായി ബന്ധപ്പെട...,"[0.023590088, 0.023925781, 0.024169922, 0.0244...","[0.25825748, 0.26196063, 0.2646538, 0.26768366...","[4, 33, 72, 0, 10, 33, 18, 1, 28, 0, 10, 1, 41..."
9,അമേരിക്കയിലെ വിദഗ്ദ്ധന്മാർ ഇതുവരെ അർഗ്രാരിയൻ ഭ...,"[0.0, 0.0, -6.1035156e-05, 0.0, 0.0, 0.0, 0.0,...","[1.1731718e-05, 1.1731718e-05, -0.00075405074,...","[28, 35, 24, 33, 72, 9, 10, 9, 66, 72, 5, 27, ..."


In [23]:
test_data = test_data.apply(lambda row: prepare_dataset(row),axis=1)

In [22]:
test_data.head()

Unnamed: 0,target_text,speech
0,നിങ്ങളുടെ പിതാവിന്റെ ആഗ്രഹങ്ങൾ നിറവേറ്റാനാണ് ന...,"[0.00018310547, 0.00033569336, 0.00021362305, ..."
1,എന്റെ കയ്യിൽ അത്തരം പേരിലുള്ള ഒന്നും ഇല്ല,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,വ്യത്യസ്ത ഇനം ഓർക്കിഡുകളും ദേശീയോദ്യാനത്തിൽ കാ...,"[0.00021362305, 0.00015258789, 9.1552734e-05, ..."
3,മത്സ്യബന്ധനമാണ് പട്ടണത്തിലെ ആളുകളുടെ പ്രധാന തൊഴിൽ,"[-0.00036621094, 0.0, -0.000579834, -0.0015258..."
4,ബിയറിന്റെ സാമ്പിളിൽ നിന്ന് മെഥനൊൾ കണ്ടെത്താൻ ക...,"[-0.00048828125, -0.00088500977, -0.0006713867..."


In [24]:
test_data.head()

Unnamed: 0,target_text,speech,input_values,labels
0,നിങ്ങളുടെ പിതാവിന്റെ ആഗ്രഹങ്ങൾ നിറവേറ്റാനാണ് ന...,"[0.00018310547, 0.00033569336, 0.00021362305, ...","[0.0019598715, 0.0034763117, 0.0022631595, 0.0...","[39, 72, 45, 10, 45, 36, 59, 23, 27, 1, 41, 72..."
1,എന്റെ കയ്യിൽ അത്തരം പേരിലുള്ള ഒന്നും ഇല്ല,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-2.2516522e-05, -2.2516522e-05, -2.2516522e-0...","[49, 39, 10, 47, 27, 1, 9, 66, 10, 66, 72, 65,..."
2,വ്യത്യസ്ത ഇനം ഓർക്കിഡുകളും ദേശീയോദ്യാനത്തിൽ കാ...,"[0.00021362305, 0.00015258789, 9.1552734e-05, ...","[0.00188044, 0.0013312843, 0.0007821285, 0.000...","[31, 10, 66, 0, 10, 66, 12, 10, 0, 1, 13, 39, ..."
3,മത്സ്യബന്ധനമാണ് പട്ടണത്തിലെ ആളുകളുടെ പ്രധാന തൊഴിൽ,"[-0.00036621094, 0.0, -0.000579834, -0.0015258...","[-0.0036230274, -1.6695978e-05, -0.0057267207,...","[35, 0, 10, 12, 10, 66, 8, 39, 10, 46, 39, 35,..."
4,ബിയറിന്റെ സാമ്പിളിൽ നിന്ന് മെഥനൊൾ കണ്ടെത്താൻ ക...,"[-0.00048828125, -0.00088500977, -0.0006713867...","[-0.0061827693, -0.011047271, -0.008427924, -0...","[8, 72, 66, 47, 72, 39, 10, 47, 27, 1, 12, 50,..."


In [26]:
# test_data.loc[:50].info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   target_text   51 non-null     object
 1   speech        51 non-null     object
 2   input_values  51 non-null     object
 3   labels        51 non-null     object
dtypes: object(4)
memory usage: 15.4 MB


In [23]:
# import hashlib
# hashlib.sha1(pd.util.hash_pandas_object(test_data.loc[:50]['labels']).values).hexdigest() 


In [40]:
import sys
sys.getsizeof(test_data)/1000000

6.84403

In [20]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)


In [21]:
train_data.shape

(27578, 4)

In [22]:
train_data.shape

(13000, 4)

In [1]:
train_data.head()

NameError: name 'train_data' is not defined

In [24]:
train_data.to_pickle('train_data.pkl')

In [27]:
test_data.to_pickle('test_data_final.pkl')

In [34]:
# import hashlib
# hashlib.sha1(pd.util.hash_pandas_object(test_data.loc[:50]).values).hexdigest() 

In [None]:
train_data.to_pickle('train_data.pkl')

In [23]:
import os
os.chdir(r'/kaggle/working')

In [25]:
from IPython.display import FileLink 
FileLink(r'train_data.pkl')

In [34]:
# train_data.to_csv('train_data3.csv',index=False)

type(test_data['labels'][0])

len(test_data['input_values'].loc[3][0])

a

a.dtype