In [2]:
#-- required imports
import gc
import os
import pickle as pkl

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from librosa import display
from scipy.io import wavfile
from tensorflow.keras import regularizers as rg
from tqdm.notebook import tqdm, trange

# Root directory of the dataset and its "data" sub-directory.
path = '/kaggle/input/darpa-timit-acousticphonetic-continuous-speech'
data_path = f"{path}/data"

2021-12-12 23:14:59.160625: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-12-12 23:14:59.160744: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
class Callback(tf.keras.callbacks.Callback):
    """Keras callback that prints loss/accuracy progress during training.

    It prints the epoch index when an epoch starts, a loss/accuracy summary
    (plus the raw ``logs`` object) when it ends, and a summary every 100th
    training batch. ``gc.collect()`` is called after every epoch.

    ``logs`` defaults to ``None`` instead of a shared mutable ``{}``, and a
    missing ``loss``/``accuracy`` entry is rendered as ``nan`` rather than
    raising ``KeyError`` (e.g. when the model was compiled without the
    accuracy metric).
    """

    @staticmethod
    def _format_metrics(logs):
        """Return the 'loss: x, accuracy:y' summary; accuracy is shown in percent."""
        logs = logs or {}
        loss = logs.get("loss", float("nan"))
        accuracy = logs.get("accuracy", float("nan"))
        return 'loss: {:.2f}, accuracy:{:.2f}'.format(loss, accuracy*100)

    def on_epoch_begin(self,epoch,logs=None):
        print("Epoch ",epoch)

    def on_epoch_end(self,epoch,logs=None):
        print(self._format_metrics(logs))
        print(logs)
        gc.collect()

    def on_batch_end(self,batch,logs=None):
        # Only report every 100th batch to keep the cell output short.
        if batch%100 == 0:
            print(batch,self._format_metrics(logs))

    def on_test_batch_end(self, batch, logs=None):
        # Intentionally silent: nothing is reported during evaluation batches.
        return

In [4]:
import math

class DNN_MODULE_BUILDER():
    """Helper that turns the dataset's audio/phoneme files into training arrays.

    It reads the train/test description CSVs, loads audio with librosa,
    extracts per-frame MFCC-based features, assigns each frame a phoneme label
    folded through ``phon61_map39``, and one-hot encodes the labels.
    """
    # File names of the description CSVs; the read*DescriptionCSV methods read
    # files with these names from the main directory.
    __train_desc = 'train_data.csv'
    __test_desc = 'test_data.csv'
    # Class-level defaults; __init__ overwrites both from its `path` argument.
    __data_directory = './data'
    __main_directory = './'
    f_Path = 'path_from_data_dir' # column holding the file path (joined onto the data directory by readAudio/readPhon)
    f_IsAudio = 'is_converted_audio' # boolean column: True for rows describing the audio files we are interested in
    f_IsPhon = 'is_phonetic_file' # boolean column used by getListPhonemeFiles to select phoneme-transcription rows
    # f_filename = 'filename' #field that contains filename
    f_dr = 'dialect_region' # column holding the dialect region (values DR1..DR8 are kept by the CSV readers)
    # getWindow multiplies these by the sample rate to get the analysis window
    # length and the hop length in samples, i.e. both are expressed in seconds.
    _winlen = 0.025
    _winstep = 0.01
    
    
    def __init__(self,path=None):
        self.__main_directory = path
        if path[len(path)-1] == '/':
            self.__data_directory = path+"data/"
        else:
            self.__main_directory += "/"
            self.__data_directory = self.__main_directory+"data/"
      
        self.phon61_map39 = {
            'iy':'iy',  'ih':'ih',   'eh':'eh',  'ae':'ae',    'ix':'ih',  'ax':'ah',   'ah':'ah',  'uw':'uw',
            'ux':'uw',  'uh':'uh',   'ao':'aa',  'aa':'aa',    'ey':'ey',  'ay':'ay',   'oy':'oy',  'aw':'aw',
            'ow':'ow',  'l':'l',     'el':'l',  'r':'r',      'y':'y',    'w':'w',     'er':'er',  'axr':'er',
            'm':'m',    'em':'m',     'n':'n',    'nx':'n',     'en':'n',  'ng':'ng',   'eng':'ng', 'ch':'ch',
            'jh':'jh',  'dh':'dh',   'b':'b',    'd':'d',      'dx':'dx',  'g':'g',     'p':'p',    't':'t',
            'k':'k',    'z':'z',     'zh':'sh',  'v':'v',      'f':'f',    'th':'th',   's':'s',    'sh':'sh',
            'hh':'hh',  'hv':'hh',   'pcl':'h#', 'tcl':'h#', 'kcl':'h#', 'qcl':'h#','bcl':'h#','dcl':'h#',
            'gcl':'h#','h#':'h#',  '#h':'h#',  'pau':'h#', 'epi': 'h#','nx':'n',   'ax-h':'ah','q':'h#' 
        }
        
        self.phon61 = list(self.phon61_map39.keys())
        self.phon39 = list(set(self.phon61_map39.values()))

        self.label_p39 = {}
        self.p39_label = {}
        for i,p in enumerate(self.phon39):
            self.label_p39[p] = i+1
            self.p39_label[i+1] = p

        self.phon39_map61 = {}
        for p61,p39 in self.phon61_map39.items():
            if not p39 in self.phon39_map61:
                self.phon39_map61[p39] = []
            self.phon39_map61[p39].append(p61)
            
    
    def get39EquiOf61(self,p):
        return self.phon61_map39[self.removePhonStressMarker(p)]

    def removePhonStressMarker(self,phon):
        """Return ``phon`` with every '1' and '2' character removed."""
        for marker in ('1', '2'):
            phon = phon.replace(marker, '')
        return phon
    
    def getWindow(self,sr):
        nfft = 512
        winlen = self._winlen * sr
        winstep = self._winstep * sr
        return nfft,int(winlen),int(winstep)

    def singleTrainingFrameSize(self,sr):
        """Return a quarter of ``sr``, rounded down to an integer."""
        quarter = sr / 4
        return math.floor(quarter)
        
    def readTrainingDataDescriptionCSV(self):
        file_path = self.__main_directory + 'train_data.csv' #check if train_data.csv is in correct path
        self._Tdd = pd.read_csv(file_path)
        # removing NaN entries in the train_data.csv file
        dr = ['DR1','DR2','DR3','DR4','DR5','DR6','DR7','DR8']
        self._Tdd = self._Tdd[self._Tdd['dialect_region'].isin(dr)]
        return self._Tdd

    def readTestingDataDescriptionCSV(self):
        file_path = self.__main_directory + 'test_data.csv' #check if train_data.csv is in correct path
        self._tdd = pd.read_csv(file_path)
        # removing NaN entries in the train_data.csv file
        dr = ['DR1','DR2','DR3','DR4','DR5','DR6','DR7','DR8']
        self._tdd = self._tdd[self._tdd['dialect_region'].isin(dr)]
        return self._tdd
    
    def getListAudioFiles(self,of='Train'):
        if of == 'Train':
            self.readTrainingDataDescriptionCSV()
            return self._Tdd[self._Tdd[self.f_IsAudio] == True]
        if of == 'Test':
            self.readTestingDataDescriptionCSV()
            return self._tdd[self._tdd[self.f_IsAudio] == True]
        
    def getListPhonemeFiles(self,of='Train'):
        if of == 'Train':
            self.readTrainingDataDescriptionCSV()
            return self._Tdd[self._Tdd[self.f_IsPhon] == True]
        if of == 'Test':
            self.readTestingDataDescriptionCSV()
            return self._tdd[self._tdd[self.f_IsPhon] == True]
               
    def readAudio(self,fpath=None,pre_emp = False):
        if(fpath == None):
            return np.zeros(1),0
        
        fpath = self.__data_directory+fpath
        if os.path.exists(fpath):
            S,sr = librosa.load(fpath,sr=None)
            if pre_emp:
                S = librosa.effects.preemphasis(S)
            return S,sr   
        else:
            return np.zeros(1),0
    
    
    def readPhon(self,fpath=None):
        if(fpath == None):
            raise Exception('phon file path not provided')
        
        fpath = self.__data_directory+fpath
        ph_ = pd.read_csv(fpath,sep=" ")#,usecols=['start','end','phoneme'])
        #ph_.columns = ['start','end','phoneme']
        return ph_
            
        pfn = j['filename'].split('.WAV')[0]+'.PHN'
        p_bar.set_description(f'Working on {j["filename"]} ,index: {c}  ')
        try:
            pfp = file_path+pfd[(pfd['filename']==pfn) & (pfd['speaker_id'] == j['speaker_id'])][f_Path].values[0]
        except:
            pfp = afp.replace(j['filename'],pfn)
            
        ph_ = pd.read_csv(pfp,sep=" ")#,usecols=['start','end','phoneme'])
        #ph_.columns = ['start','end','phoneme']
    #---------------end readPhon()
        
    def getFeatureAndLabel(self,ftype='mfcc',audio_path=None,phon_path=None,n_mels=128,delta=False,delta_delta=False):
        if audio_path == None:
            raise Exception("Path to audio (Wav) file must be provided")
        wav,sr = self.readAudio(fpath=audio_path,pre_emp=True)
        nfft,winlen,winstep = self.getWindow(sr)
        if(ftype == 'mfcc'):
            melspec = librosa.feature.mfcc(wav,sr=sr,hop_length=winstep,win_length=winlen,n_fft=nfft,n_mfcc=n_mels)
            
        db_melspec = librosa.amplitude_to_db(melspec,ref=np.max)
        
        mD = None
        mDD = None
        if(delta):
            mD = librosa.feature.delta(db_melspec)
            if(delta_delta):
                mDD = librosa.feature.delta(mD)
        
        audio_phon_transcription = None
        if phon_path == None:
            tmp = audio_path.split('/')
            phon_path = "/".join(tmp[:(len(tmp)-1)])+"/"+ tmp[len(tmp)-1].split('.WAV')[0]+".PHN"
            
        audio_phon_transcription = self.readPhon(phon_path)            
        time = db_melspec.shape[1]
        
        feature_vectors = []
        db_melspec = db_melspec.T
        mD = mD.T
        mDD = mDD.T
        
        prev = None
        first = audio_phon_transcription.columns
        audio_phon_transcription.columns = ['start','end','phoneme']
        labels = []
        for i in range(time):
            #---collecting feature---
            feature = np.zeros(n_mels*3)
            feature[:n_mels] = db_melspec[i]
            feature[n_mels:n_mels*2] = mD[i]
            feature[n_mels*2:n_mels*3] = mDD[i]
            feature_vectors.append(feature)
            
            #---collecting phoneme label ---
            start = winstep * i
            end = start+winlen
            diff = start+400
            phoneme = list(
                        audio_phon_transcription[
                            ((audio_phon_transcription['start']<=start) & 
                            ((audio_phon_transcription['end']-start)>=int(winlen/1.5)))
                            |
                            ((audio_phon_transcription['start']<=end) & 
                                (audio_phon_transcription['end']>end))  
                        ].to_dict()['phoneme'].values()
            )
            if len(phoneme) == 0:
                if int(first[1]) > start:
                    phoneme = first[2]
                else:
                    phoneme = prev
            else:
                phoneme = phoneme[0]
            phoneme = self.get39EquiOf61(phoneme)
            prev = phoneme
            labels.append(phoneme)
             
        return feature_vectors,labels
    
    
    def prepareLabelsForTraining(self,labels):
        print('Preparing Labels')
        label_vector = []
        p_bar = tqdm(range(len(labels)))
        c = 0
        for l in labels:
            label = [0 for i in range(39)]
            label[self.label_p39[l]-1] = 1
            label_vector.append(label)
            c+=1
            if c == 500:
                p_bar.set_description(f'Working on phoneme {l}')
                p_bar.update(c)
                c = 0
           
        p_bar.set_description(f'Working on phoneme {l}')
        p_bar.update(c) 
        return label_vector
    
    def collectFeatures(self,ft='Train',ftype='mfcc',n_mels=128,delta=False,delta_delta=False):
        tddA = self.getListAudioFiles(ft)
        tddA.index = range(tddA.shape[0])
        feature_vectors = []
        labels = []
        
        p_bar = tqdm(range(tddA.shape[0]))
        silent_count = 0
        for i in range(tddA.shape[0]):
            fv,lv = self.getFeatureAndLabel(ftype=ftype,audio_path=tddA.loc[i][self.f_Path],n_mels=n_mels,delta=delta,delta_delta=delta_delta)
            p_bar.set_description(f'Working on {tddA.loc[i][self.f_Path]} ,index: {i}  ')
            p_bar.update()
            feature_vectors += fv
            labels += lv
                   
        print(f"length of feature_vectors is {len(feature_vectors)} and length of labels is {len(labels)}")
        labels = np.asarray(np.array(self.prepareLabelsForTraining(labels),dtype=object)).astype(np.int16)
        feature_vectors = np.asarray(np.array(feature_vectors,dtype=object)).astype(np.float32)
        return feature_vectors,labels        

In [5]:
####--------------Collecting Training Features----------------------###   
gc.collect()

# Feature configuration shared with the model-building and test-feature cells.
ftype = 'mfcc'
n_mels = 64
delta = True
delta_delta = True

cm = DNN_MODULE_BUILDER(path)

print('Collecting Features from Audio Files')
# collectFeatures defaults to the 'Train' split.
features,labels = cm.collectFeatures(
    ftype=ftype,
    n_mels=n_mels,
    delta=delta,
    delta_delta=delta_delta,
)

print('--- Completed')

gc.collect()

Collecting Features from Audio Files


  0%|          | 0/4620 [00:00<?, ?it/s]

length of feature_vectors is 1421707 and length of labels is 1421707
Preparing Labels


  0%|          | 0/1421707 [00:00<?, ?it/s]

--- Completed


0

In [6]:
#-------------
# Persist the extracted training arrays. The `with` blocks close each file
# even if pickling raises, which the manual open()/close() pairs did not.
with open("/kaggle/working/features.pkl",'wb') as ffp:
    pkl.dump(features,ffp)
with open("/kaggle/working/labels.pkl",'wb') as flp:
    pkl.dump(labels,flp)

In [11]:
# Show the shapes of the training feature and label arrays, one per line.
for array in (features, labels):
    print(array.shape)

(1421707, 192)
(1421707, 39)


In [12]:
####--------------Model Training----------------------###   
# Fully connected classifier: three 1024-unit ReLU layers followed by a
# 39-way softmax. The input width is n_mels*3, matching the feature vectors.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=1024, input_shape=[n_mels*3], activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(units=1024, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(units=1024, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(units=39, activation=tf.nn.softmax))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

gc.collect()

2021-12-13 00:12:16.326118: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-13 00:12:16.365801: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2021-12-13 00:12:16.365862: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-13 00:12:16.365947: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (9ccea6c355dc): /proc/driver/nvidia/version does not exist
2021-12-13 00:12:16.369291: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              197632    
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_3 (Dense)              (None, 39)                39975     
Total params: 2,336,807
Trainable params: 2,336,807
Non-trainable params: 0
_________________________________________________________________


0

In [14]:
# Rows before this index are used for training; the remaining rows are
# passed to fit() as validation data.
split_at = 1137000
history = model.fit(
    x=features[:split_at],
    y=labels[:split_at],
    batch_size=512,
    epochs=25,
    verbose=1,
    validation_data=(features[split_at:], labels[split_at:]),
    validation_batch_size=128,
)

2021-12-13 00:12:29.577035: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 873216000 exceeds 10% of free system memory.
2021-12-13 00:12:30.511860: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-12-13 00:12:30.525805: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [7]:
###------------collecting test features -------------------
gc.collect()
# Same feature configuration as for training, applied to the 'Test' split.
test_features,test_labels = cm.collectFeatures(
    ft='Test',
    ftype=ftype,
    n_mels=n_mels,
    delta=delta,
    delta_delta=delta_delta,
)
gc.collect()

  0%|          | 0/1680 [00:00<?, ?it/s]

length of feature_vectors is 519525 and length of labels is 519525
Preparing Labels


  0%|          | 0/519525 [00:00<?, ?it/s]

0

In [15]:
####--------------Model Evaluating----------------------###   
# Evaluate the trained model on the test-split features and labels.
evaluation = model.evaluate(
    x=test_features,
    y=test_labels,
    batch_size=128,
)

