Change paths as appropriate, all files are in the repo. The dataset is shared at below link,

https://drive.google.com/drive/folders/1cztOSY24ndvBhcmjuJaPuMSIdgy04Rjg?usp=sharing

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle as pkl
import gc
import librosa
from librosa import display
import math
from tqdm.notebook import tqdm, trange
from hmmlearn.hmm import GMMHMM

path = 'C:/Users/shukl/Documents/Georgia-Tech/MUSI-6201/Project on Music Segmentation/TIMIT-GMMHMM/Dataset'
data_path = path+"data/"

In [2]:
class FeatureCollection():
    __train_desc = 'train_data.csv'
    __test_desc = 'test_data.csv'
    __data_directory = './data'
    __main_directory = './'
    
    # path to csv files
    f_Path = 'path_from_data_dir' 
    
    isAudio = 'is_converted_audio'  #boolean field that tells that the record in train_data.csv contains the description of audio file we are interested in
    isPhon = 'is_phonetic_file'
    
    lengthOfWindow = 0.025
    stepWindow = 0.01
    
    
    def __init__(self,path=None):
        
        self.__main_directory = path
        
        if path[len(path)-1] == '/':
            self.__data_directory = path+"data/"
        else:
            self.__main_directory += "/"
            self.__data_directory = self.__main_directory+"data/"
      
        # Mapping 61 phoneme to 39 phonemes
        self.phon61_map39 = {
            'iy':'iy',  'ih':'ih',   'eh':'eh',  'ae':'ae',    'ix':'ih',  'ax':'ah',   'ah':'ah',  'uw':'uw',
            'ux':'uw',  'uh':'uh',   'ao':'aa',  'aa':'aa',    'ey':'ey',  'ay':'ay',   'oy':'oy',  'aw':'aw',
            'ow':'ow',  'l':'l',     'el':'l',  'r':'r',      'y':'y',    'w':'w',     'er':'er',  'axr':'er',
            'm':'m',    'em':'m',     'n':'n',    'nx':'n',     'en':'n',  'ng':'ng',   'eng':'ng', 'ch':'ch',
            'jh':'jh',  'dh':'dh',   'b':'b',    'd':'d',      'dx':'dx',  'g':'g',     'p':'p',    't':'t',
            'k':'k',    'z':'z',     'zh':'sh',  'v':'v',      'f':'f',    'th':'th',   's':'s',    'sh':'sh',
            'hh':'hh',  'hv':'hh',   'pcl':'h#', 'tcl':'h#', 'kcl':'h#', 'qcl':'h#','bcl':'h#','dcl':'h#',
            'gcl':'h#','h#':'h#',  '#h':'h#',  'pau':'h#', 'epi': 'h#','nx':'n',   'ax-h':'ah','q':'h#' 
        }
        
        self.phon_61 = list(self.phon61_map39.keys())
        self.phon_39 = list(set(self.phon61_map39.values()))
        self.phon_61.sort()
        self.phon_39.sort()

        self.label_p39 = {}
        self.p39_label = {}
        for idx,phon in enumerate(self.phon_39):
            self.label_p39[phon] = idx
            self.p39_label[idx] = phon

        self.phon39_map61 = {}
        for p61,p39 in self.phon61_map39.items():
            if not p39 in self.phon39_map61:
                self.phon39_map61[p39] = []
            self.phon39_map61[p39].append(p61)
        
        pkl.dump(self.label_p39,open('/timit-gmmhmm/phon_label_index.pkl','wb'))
        pkl.dump(self.phon61_map39,open('/timit-gmmhmm/phon_map_61To39.pkl','wb'))
        
        
    
    def get39EquiOf61(self,p):
        return self.phon61_map39[self.removePhonStressMarker(p)]
    
    
    def get39Index(self,phon):
        return self.label_p39[phon]
    

    def get39Phon(self,index):
        return self.p39_label[index] 

    
    def removePhonStressMarker(self,phon):
        phon = phon.replace('1','')
        phon = phon.replace('2','')
        return phon
    
    
    def getWindow(self,sr):
        nfft = 512
        winlen = self.lengthOfWindow * sr
        winstep = self.stepWindow * sr
        return nfft,int(winlen),int(winstep)
    
        
    def readTrainingDataDescriptionCSV(self):
        file_path = self.__main_directory + 'train_data.csv' #check if train_data.csv is in correct path
        # Tdd -> training data descriptions
        self._Tdd = pd.read_csv(file_path)
        # removing NaN entries in the train_data.csv file
        dr = ['DR1','DR2','DR3','DR4','DR5','DR6','DR7','DR8']
        self._Tdd = self._Tdd[self._Tdd['dialect_region'].isin(dr)]
        return self._Tdd

    
    def readTestingDataDescriptionCSV(self):
        file_path = self.__main_directory + 'test_data.csv' #check if train_data.csv is in correct path
        # tdd -> testing data descriptions
        self._tdd = pd.read_csv(file_path)
        # removing NaN entries in the train_data.csv file
        dr = ['DR1','DR2','DR3','DR4','DR5','DR6','DR7','DR8']
        self._tdd = self._tdd[self._tdd['dialect_region'].isin(dr)]
        return self._tdd
    
    
    def getListAudioFiles(self,of='Train'):
        if of == 'Train':
            self.readTrainingDataDescriptionCSV()
            return self._Tdd[self._Tdd[self.isAudio] == True]
        if of == 'Test':
            self.readTestingDataDescriptionCSV()
            return self._tdd[self._tdd[self.isAudio] == True]
        
        
    def getListPhonemeFiles(self,of='Train'):
        if of == 'Train':
            self.readTrainingDataDescriptionCSV()
            return self._Tdd[self._Tdd[self.isPhon] == True]
        if of == 'Test':
            self.readTestingDataDescriptionCSV()
            return self._tdd[self._tdd[self.isPhon] == True]
        
               
    def readAudio(self,fpath=None,pre_emp = False):
        if(fpath == None):
            return np.zeros(1),0
        
        fpath = self.__data_directory+fpath
        if os.path.exists(fpath):
            S,sr = librosa.load(fpath,sr=None)
            if pre_emp:
                S = librosa.effects.preemphasis(S)
            return S,sr   
        else:
            return np.zeros(1),0
        
    
    def getPhonPathFromAudioPath(self,audio_path):
        return audio_path.split(".WAV")[0]+".PHN"
    
    
    def readPhon(self,fpath=None):    
        if(fpath == None):
            raise Exception('phon file path not provided')
            

        fpath = data_path+fpath
        ph_ = pd.read_csv(fpath,sep=" ")
        first = ph_.columns
        ph_.columns = ['start','end','phoneme']
        ph_.loc[-1] = [int(first[0]),int(first[1]),first[2]]
        ph_ = ph_.sort_index()
        ph_.index = range(ph_.index.shape[0])
        return ph_
        
        
    def getFeatureAndLabel(self,ftype='mfcc',audio_path=None,phon_path=None,n_mels=128,delta=False,delta_delta=False):
        if audio_path == None:
            raise Exception("Path to audio (Wav) file must be provided")
        wav,sr = self.readAudio(fpath=audio_path,pre_emp=True)
        nfft,winlen,winstep = self.getWindow(sr)
        if(ftype == 'mfcc'):
            db_melspec = librosa.feature.mfcc(wav,sr=sr,hop_length=winstep,win_length=winlen,n_fft=nfft,n_mfcc=n_mels)
            
        mD = None
        mDD = None
        if(delta):
            mD = librosa.feature.delta(db_melspec)
            db_melspec = np.concatenate([db_melspec,mD])
            if(delta_delta):
                mDD = librosa.feature.delta(mD)
                db_melspec = np.concatenate([db_melspec,mDD])
        
        audio_phon_transcription = None
        if phon_path == None:
            phon_path = self.getPhonPathFromAudioPath(audio_path)
            
        audio_phon_transcription = self.readPhon(phon_path)     
        
        feature_vectors = []
        db_melspec = db_melspec.T       
        time = db_melspec.shape[0]
        
        prev = None
        labels = []
        for i in range(time):
            #---collecting feature---
            feature_vectors.append(db_melspec[i])
            
            #---collecting phoneme label ---
            start = winstep * i
            end = start+winlen
            diff = start+400
            phoneme = list(
                        audio_phon_transcription[
                            ((audio_phon_transcription['start']<=start) & 
                            ((audio_phon_transcription['end']-start)>=int(winlen/2)))
                            |
                            ((audio_phon_transcription['start']<=end) & 
                                (audio_phon_transcription['end']>end))  
                        ].to_dict()['phoneme'].values()
            )
            tmp = phoneme
            try:
                phoneme = self.get39EquiOf61(phoneme[0])
                prev = phoneme
                labels.append(phoneme)
            except:
                # if the phoneme file doesn't start from Zero, then assign 'h#' silent to the frames in those unlabeled segments
                # or if the phoneme can't be determined as per logic above for a segment then assign silent 'h#'
                labels.append('h#')
             
        return feature_vectors,labels
                
        
    def prepareLabelsForTraining(self,labels):
        print('Preparing Labels')
        label_vector = []
        p_bar = tqdm(range(len(labels)))
        c = 0
        for l in labels:
            label = [0 for i in range(39)]
            label[self.label_p39[l]-1] = 1
            label_vector.append(label)
            c+=1
            if c == 500:
                p_bar.set_description(f'Working on phoneme {l}')
                p_bar.update(c)
                c = 0
           
        p_bar.set_description(f'Working on phoneme {l}')
        p_bar.update(c) 
        return label_vector
    
        
    def collectHMMFeatures(self,ft='Train',ftype='mfcc',n_mels=128,delta=False,delta_delta=False):
        tddA = self.getListAudioFiles(ft)
        tddA.index = range(tddA.shape[0])
        feature_vectors = [[] for i in range(39)]
        lengths = [[] for i in range(39)]
        
        p_bar = tqdm(range(tddA.shape[0]))
        for i in range(tddA.shape[0]):
            p_bar.set_description(f'Working on {tddA.loc[i][self.f_Path]}')
            fv,lv = self.getFeatureAndLabel(ftype=ftype,audio_path=tddA.loc[i][self.f_Path],n_mels=n_mels,delta=delta,delta_delta=delta_delta)
            count = 0
            prev = ''
            for j in range(len(lv)):
                if prev == '' or lv[j]==prev:
                    prev = lv[j]
                    count+=1
                    feature_vectors[self.get39Index(prev)].append(fv[j])
                elif lv[j]!=prev:
                    lengths[self.get39Index(prev)].append(count)
                    prev = lv[j]
                    feature_vectors[self.get39Index(prev)].append(fv[j])
                    count = 1
                    
            #inserting last length
            lengths[self.get39Index(prev)].append(count)
            #-----------------
            p_bar.update()
            if(i%100==0):
                gc.collect()
           
        feature_vectors = [np.asarray(np.array(feature_vectors[i],dtype=object)).astype(np.float32) for i in range(39)]
        print(f"length of feature_vectors is {feature_vectors[0].shape}")
        return feature_vectors,lengths  

In [3]:
fc = FeatureCollection(path)
n_mels = 64
delta = False
delta_delta = False
ftype = 'mfcc'

print('Collecting Features from Audio Files')
features,lengths = fc.collectHMMFeatures(ftype=ftype,n_mels=n_mels,delta=delta,delta_delta=delta_delta)
# -------------
ffp = open("/timit-gmmhmm/features.pkl",'wb')
pkl.dump(features,ffp)
flp = open("/timit-gmmhmm/lengths.pkl",'wb')
pkl.dump(lengths,flp) 
ffp.close()
flp.close()
print('--- Completed')

gc.collect()

Attempting to read features file /kaggle/input/timit-gmmhmm-asr/
--- Failed
Collecting Features from Audio Files


  0%|          | 0/4620 [00:00<?, ?it/s]

length of feature_vectors is (74738, 64)
--- Completed


19

In [4]:
models = [GMMHMM(n_components=2,n_mix=1,verbose=False,n_iter=15) for i in range(39)]

p_bar = tqdm(range(39))

# train models
for i in range(39):
    p_bar.set_description('{}. Training "{}" Phoneme Model'.format(i,fc.get39Phon(i)))
    models[i].fit(features[i],lengths[i])
    p_bar.update()
gc.collect()

# save models
for i in range(39):
    filename = "Model{}-{}.pkl".format(i,fc.get39Phon(i))
    file_handle = open('/timit-gmmhmm/{}'.format(filename),'wb')
    pkl.dump(models[i],file_handle)
    file_handle.close()
gc.collect()

  0%|          | 0/39 [00:00<?, ?it/s]

0

In [5]:
test_features,test_lengths = fc.collectHMMFeatures(ft='Test',ftype=ftype,n_mels=n_mels,delta=delta,delta_delta=delta_delta)
pkl.dump(test_features,open('/timit-gmmhmm/test_features.pkl','wb'))
pkl.dump(test_lengths,open('/timit-gmmhmm/test_lengths.pkl','wb'))
gc.collect()   

  0%|          | 0/1680 [00:00<?, ?it/s]

length of feature_vectors is (28617, 64)


19

In [6]:
for i in range(39):
    # --- adding missing length at end
    tfeat_len = test_features[i].shape[0]
    tlen_len = np.sum(test_lengths[i])
    if tfeat_len != tlen_len:
        test_lengths[i].append(tfeat_len-tlen_len)

predictions = []
for i in range(39):
    #for each phon data
    count = 0
    s = 0
    p_bar = tqdm(range(len(test_lengths[i])))
    p_bar.set_description('{}. Testing Data of phoneme "{}" against all models'.format(i,fc.get39Phon(i)))
    
    for j in test_lengths[i]:
        # test in each phon model
        max_prediction = -999999999999
        max_index = 0
        t_feat = test_features[i][s:j+s]
        for k in range(39):
            try:
                score = math.floor(models[k].score(t_feat)*1000)
                if(score > max_prediction):
                    max_prediction = score
                    max_index = k
                if max_index > i:
                    break
            except:
                continue
                
        
        p_bar.update() 
        count+= 1 if max_index == i else 0      
        s=j
        
    predictions.append((count,len(test_lengths[i])))
    
    p_bar.set_description('{}. Testing Data of phoneme "{}" against all models \nResult: {}/{} correct prediction;\n accuracy: {:.2f}%'.format(
        i+1,fc.get39Phon(i),count,len(test_lengths[i]),(count/len(test_lengths[i]))*100)
    )
    print('{}. Testing Data of phoneme "{}" against all models \nResult: {}/{} correct prediction;\n accuracy: {:.2f}%'.format(
        i+1,fc.get39Phon(i),count,len(test_lengths[i]),(count/len(test_lengths[i]))*100)
    )

  0%|          | 0/2347 [00:00<?, ?it/s]

1. Testing Data of phoneme "aa" against all models 
Result: 469/2347 correct prediction;
 accuracy: 19.98%


  0%|          | 0/1433 [00:00<?, ?it/s]

2. Testing Data of phoneme "ae" against all models 
Result: 30/1433 correct prediction;
 accuracy: 2.09%


  0%|          | 0/2426 [00:00<?, ?it/s]

3. Testing Data of phoneme "ah" against all models 
Result: 384/2426 correct prediction;
 accuracy: 15.83%


  0%|          | 0/218 [00:00<?, ?it/s]

4. Testing Data of phoneme "aw" against all models 
Result: 184/218 correct prediction;
 accuracy: 84.40%


  0%|          | 0/870 [00:00<?, ?it/s]

5. Testing Data of phoneme "ay" against all models 
Result: 0/870 correct prediction;
 accuracy: 0.00%


  0%|          | 0/497 [00:00<?, ?it/s]

6. Testing Data of phoneme "b" against all models 
Result: 123/497 correct prediction;
 accuracy: 24.75%


  0%|          | 0/268 [00:00<?, ?it/s]

7. Testing Data of phoneme "ch" against all models 
Result: 211/268 correct prediction;
 accuracy: 78.73%


  0%|          | 0/998 [00:00<?, ?it/s]

8. Testing Data of phoneme "d" against all models 
Result: 6/998 correct prediction;
 accuracy: 0.60%


  0%|          | 0/1008 [00:00<?, ?it/s]

9. Testing Data of phoneme "dh" against all models 
Result: 835/1008 correct prediction;
 accuracy: 82.84%


  0%|          | 0/920 [00:00<?, ?it/s]

10. Testing Data of phoneme "dx" against all models 
Result: 623/920 correct prediction;
 accuracy: 67.72%


  0%|          | 0/1471 [00:00<?, ?it/s]

11. Testing Data of phoneme "eh" against all models 
Result: 297/1471 correct prediction;
 accuracy: 20.19%


  0%|          | 0/2229 [00:00<?, ?it/s]

12. Testing Data of phoneme "er" against all models 
Result: 1865/2229 correct prediction;
 accuracy: 83.67%


  0%|          | 0/824 [00:00<?, ?it/s]

13. Testing Data of phoneme "ey" against all models 
Result: 154/824 correct prediction;
 accuracy: 18.69%


  0%|          | 0/924 [00:00<?, ?it/s]

14. Testing Data of phoneme "f" against all models 
Result: 465/924 correct prediction;
 accuracy: 50.32%


  0%|          | 0/706 [00:00<?, ?it/s]

15. Testing Data of phoneme "g" against all models 
Result: 178/706 correct prediction;
 accuracy: 25.21%


  0%|          | 0/13422 [00:00<?, ?it/s]

16. Testing Data of phoneme "h#" against all models 
Result: 10129/13422 correct prediction;
 accuracy: 75.47%


  0%|          | 0/735 [00:00<?, ?it/s]

17. Testing Data of phoneme "hh" against all models 
Result: 620/735 correct prediction;
 accuracy: 84.35%


  0%|          | 0/4812 [00:00<?, ?it/s]

18. Testing Data of phoneme "ih" against all models 
Result: 4314/4812 correct prediction;
 accuracy: 89.65%


  0%|          | 0/2785 [00:00<?, ?it/s]

19. Testing Data of phoneme "iy" against all models 
Result: 586/2785 correct prediction;
 accuracy: 21.04%


  0%|          | 0/381 [00:00<?, ?it/s]

20. Testing Data of phoneme "jh" against all models 
Result: 229/381 correct prediction;
 accuracy: 60.10%


  0%|          | 0/1537 [00:00<?, ?it/s]

21. Testing Data of phoneme "k" against all models 
Result: 797/1537 correct prediction;
 accuracy: 51.85%


  0%|          | 0/2749 [00:00<?, ?it/s]

22. Testing Data of phoneme "l" against all models 
Result: 74/2749 correct prediction;
 accuracy: 2.69%


  0%|          | 0/1571 [00:00<?, ?it/s]

23. Testing Data of phoneme "m" against all models 
Result: 1295/1571 correct prediction;
 accuracy: 82.43%


  0%|          | 0/3121 [00:00<?, ?it/s]

24. Testing Data of phoneme "n" against all models 
Result: 5/3121 correct prediction;
 accuracy: 0.16%


  0%|          | 0/420 [00:00<?, ?it/s]

25. Testing Data of phoneme "ng" against all models 
Result: 173/420 correct prediction;
 accuracy: 41.19%


  0%|          | 0/814 [00:00<?, ?it/s]

26. Testing Data of phoneme "ow" against all models 
Result: 27/814 correct prediction;
 accuracy: 3.32%


  0%|          | 0/266 [00:00<?, ?it/s]

27. Testing Data of phoneme "oy" against all models 
Result: 3/266 correct prediction;
 accuracy: 1.13%


  0%|          | 0/888 [00:00<?, ?it/s]

28. Testing Data of phoneme "p" against all models 
Result: 703/888 correct prediction;
 accuracy: 79.17%


  0%|          | 0/2548 [00:00<?, ?it/s]

29. Testing Data of phoneme "r" against all models 
Result: 372/2548 correct prediction;
 accuracy: 14.60%


  0%|          | 0/2699 [00:00<?, ?it/s]

30. Testing Data of phoneme "s" against all models 
Result: 1760/2699 correct prediction;
 accuracy: 65.21%


  0%|          | 0/875 [00:00<?, ?it/s]

31. Testing Data of phoneme "sh" against all models 
Result: 870/875 correct prediction;
 accuracy: 99.43%


  0%|          | 0/1527 [00:00<?, ?it/s]

32. Testing Data of phoneme "t" against all models 
Result: 1428/1527 correct prediction;
 accuracy: 93.52%


  0%|          | 0/268 [00:00<?, ?it/s]

33. Testing Data of phoneme "th" against all models 
Result: 57/268 correct prediction;
 accuracy: 21.27%


  0%|          | 0/225 [00:00<?, ?it/s]

34. Testing Data of phoneme "uh" against all models 
Result: 1/225 correct prediction;
 accuracy: 0.44%


  0%|          | 0/756 [00:00<?, ?it/s]

35. Testing Data of phoneme "uw" against all models 
Result: 462/756 correct prediction;
 accuracy: 61.11%


  0%|          | 0/712 [00:00<?, ?it/s]

36. Testing Data of phoneme "v" against all models 
Result: 575/712 correct prediction;
 accuracy: 80.76%


  0%|          | 0/1244 [00:00<?, ?it/s]

37. Testing Data of phoneme "w" against all models 
Result: 1197/1244 correct prediction;
 accuracy: 96.22%


  0%|          | 0/653 [00:00<?, ?it/s]

38. Testing Data of phoneme "y" against all models 
Result: 62/653 correct prediction;
 accuracy: 9.49%


  0%|          | 0/1278 [00:00<?, ?it/s]

39. Testing Data of phoneme "z" against all models 
Result: 1010/1278 correct prediction;
 accuracy: 79.03%


In [7]:
t_correct = 0
t_length = 0
for x,y in predictions:
    t_correct+=x
    t_length+= y

print("Mean Accuracy is {:2f}%".format((np.sum([x/y for x,y in predictions])/39)*100))
print("Total Accuracy is {:2f}%".format((t_correct/t_length)*100))
print('Individual Accuracy:')
for i in range(39):
    x = predictions[i][0]
    y = predictions[i][1]
    print(f"Accuracy of \"{fc.get39Phon(i)}\" Model: ",(x/y)*100)

Mean Accuracy is 45.863348%
Total Accuracy is 51.356721%
Individual Accuracy:
Accuracy of "aa" Model:  19.98295696634001
Accuracy of "ae" Model:  2.09351011863224
Accuracy of "ah" Model:  15.828524319868094
Accuracy of "aw" Model:  84.40366972477065
Accuracy of "ay" Model:  0.0
Accuracy of "b" Model:  24.748490945674046
Accuracy of "ch" Model:  78.73134328358209
Accuracy of "d" Model:  0.6012024048096193
Accuracy of "dh" Model:  82.8373015873016
Accuracy of "dx" Model:  67.71739130434783
Accuracy of "eh" Model:  20.190346702923183
Accuracy of "er" Model:  83.66980708838044
Accuracy of "ey" Model:  18.689320388349515
Accuracy of "f" Model:  50.324675324675326
Accuracy of "g" Model:  25.21246458923513
Accuracy of "h#" Model:  75.46565340485769
Accuracy of "hh" Model:  84.35374149659864
Accuracy of "ih" Model:  89.6508728179551
Accuracy of "iy" Model:  21.04129263913824
Accuracy of "jh" Model:  60.10498687664042
Accuracy of "k" Model:  51.85426154847105
Accuracy of "l" Model:  2.691887959