In [1]:
import numpy as np 
import librosa 
import matplotlib.pyplot as plt 
import IPython
import spectrum
import os

In [2]:
def listen(file, sr = 22050):
    """Build an IPython audio widget for `file`.

    Parameters
    ----------
    file : passed straight through as the first argument of
        IPython.display.Audio (the notebook calls this with arrays and
        lists of samples).
    sr : value forwarded as the `rate` argument; defaults to 22050.

    Returns
    -------
    The IPython.display.Audio object, so that ending a cell with a call to
    this function displays the player.
    """
    player = IPython.display.Audio(file, rate = sr)
    return player


In [3]:
# Load one training utterance. No sampling rate is passed, so `sr` is whatever
# librosa.load reports for its default setting; later cells reuse both names.
audio, sr = librosa.load('./Dataset/Database for Speaker Recognition-20 spkr-TIMIT/train/F(1)/F(1010).wav')

In [4]:
# Play back the raw clip at the sampling rate it was loaded with.
listen(audio, sr = sr)

### Voice Activity Detection (VAD) ###

In [5]:
# NOTE(review): none of these three names is referenced in the visible cells —
# confirm they are still needed before keeping them.
segLen = 3
frameRate = 50
numMix = 128
def energy(audio):
    """Flag which short-time frames of `audio` carry enough energy to keep.

    The RMS energy is computed with librosa.feature.rms using a hop of 1024
    samples, and the result is transposed. The signal is passed as the
    keyword argument `y` because recent librosa releases no longer accept it
    positionally.

    The threshold is 0.1 * (p97.5 + 9 * p2.5), which equals
    p2.5 + 0.1 * (p97.5 - p2.5): the 2.5 % tails are trimmed and the cut is
    placed one tenth of the way up the remaining energy range.

    Parameters
    ----------
    audio : the signal handed to librosa.feature.rms.

    Returns
    -------
    Boolean array with the same shape as the transposed RMS matrix; True
    where the frame energy is strictly above the threshold.

    NOTE(review): rms is called with librosa's default frame length and
    centring, while framming() cuts frames with
    librosa.util.frame(frame_length=2048, hop_length=1024). The two may not
    produce the same number of frames or the same alignment — confirm.
    """
    ste = librosa.feature.rms(y=audio, hop_length=1024).T
    # Trim 5% off and set threshold as 0.1x of the ste range
    thresh = 0.1*(np.percentile(ste,97.5) + 9*np.percentile(ste,2.5))
    # The comparison already yields a boolean array, so no cast is needed.
    return ste > thresh

def framming(audio):
    """Cut `audio` into frames of 2048 samples taken every 1024 samples.

    The array produced by librosa.util.frame is transposed before it is
    returned — presumably so that each row is one frame, since
    remove_silence() indexes the result by frame number.
    """
    frames = librosa.util.frame(audio, frame_length = 2048, hop_length = 1024)
    return frames.T

def remove_silence(x,number):
    """Concatenate the samples of every frame whose mask entry is true.

    Parameters
    ----------
    x : per-frame boolean mask; a flat sequence or a column array such as
        the one returned by energy().
    number : sequence of frames (rows of samples), indexed like `x`.

    Returns
    -------
    A flat list holding the samples of the kept frames, in order.

    Mask and frames are paired with zip, so entries of `x` beyond the last
    available frame are ignored instead of raising IndexError. That case
    arises when the mask and the frames come from different framing calls
    and the mask ends up longer.
    """
    # Flatten so a (n_frames, 1) column mask and a plain 1-D mask behave alike.
    mask = np.asarray(x).ravel()
    voice = []
    for is_voiced, frame in zip(mask, number):
        if is_voiced:
            voice.extend(frame)
    return voice

def vad(audio):
    """Voice-activity detection: return only the samples of energetic frames.

    energy() supplies the per-frame mask, framming() supplies the frames,
    and remove_silence() concatenates the frames the mask keeps.

    NOTE(review): framming() uses 2048-sample frames with a 1024-sample hop,
    so consecutive kept frames overlap and the shared samples appear twice
    in the returned list — confirm this duplication is intended.
    """
    voiced_mask = energy(audio)
    return remove_silence(voiced_mask, framming(audio))

In [6]:
# Run the voice-activity detector on the loaded clip; y is the flat list of
# retained samples returned by vad().
y = vad(audio)

In [7]:
# Play back the clip after silence removal.
listen(y, sr = sr)

### LPC and MFCC feature extraction ###

In [8]:
def feature_exctraction(audio, sample_rate=None):
    """Compute stacked MFCC features and LPC output for one signal.

    Parameters
    ----------
    audio : the signal passed to spectrum.lpc and librosa.feature.mfcc.
    sample_rate : sampling rate handed to librosa.feature.mfcc. When omitted
        the notebook-level `sr` is used, which is what this function relied
        on implicitly before the parameter existed.

    Returns
    -------
    (final_mfcc, lpc)
        final_mfcc : 13 MFCCs, their first-order deltas and their
            second-order deltas concatenated along axis 0 (39 rows).
        lpc : whatever spectrum.lpc(audio, 10) returns.

    The signal is passed to librosa.feature.mfcc as the keyword argument `y`
    because recent librosa releases no longer accept it positionally.
    """
    # Fall back to the global sampling rate so existing one-argument calls behave as before.
    rate = sr if sample_rate is None else sample_rate
    lpc = spectrum.lpc(audio, 10)
    mfcc = librosa.feature.mfcc(y=audio, sr=rate, n_mfcc=13)
    delta_1 = librosa.feature.delta(mfcc, order=1)
    delta_2 = librosa.feature.delta(mfcc, order=2)
    final_mfcc = np.concatenate((mfcc, delta_1, delta_2), axis=0)
    return (final_mfcc, lpc)

In [9]:
# Extract features from the raw clip, then report the shape and type of the
# stacked MFCC matrix (first element of the returned tuple).
x = feature_exctraction(audio)
print(x[0].shape, type(x[0]), sep='\n')

(39, 171)
<class 'numpy.ndarray'>


### Get file paths ###

In [14]:
def get_filepaths(directory):
    """Return the path of every file found anywhere below `directory`.

    The tree is traversed with os.walk; each file name is joined to the
    directory it was found in. Paths are returned in traversal order, and
    sub-directory names themselves are not included.
    """
    return [
        os.path.join(root, filename)
        for root, _subdirs, files in os.walk(directory)
        for filename in files
    ]
# Gather every file path under the training split of the dataset.
path = "./Dataset/Database for Speaker Recognition-20 spkr-TIMIT/train"
training_files = get_filepaths(path)

In [11]:
# Number of training files found (use the len() builtin rather than calling the dunder directly).
len(training_files)

140

In [12]:
# Collect the test-split file paths the same way and show how many were found.
testing_files = get_filepaths("./Dataset/Database for Speaker Recognition-20 spkr-TIMIT/test")
len(testing_files)

60

### Training a GMM ### 

In [13]:
from sklearn.mixture import GaussianMixture

In [401]:
# Fit a 32-component, diagonal-covariance Gaussian mixture on the stacked MFCC
# matrix. random_state pins the initialisation so re-running the cell yields
# the same mixture.
# NOTE(review): x[0] is (39, 171) according to the shape printed earlier, so
# the fit treats each of the 39 coefficient rows as a sample with 171 features
# (the means come out as (32, 171) below). Fitting on x[0].T would model frames
# instead — confirm the intent before changing, because later cells rely on the
# 171-wide model.
# NOTE(review): reg_covar equals the magnitude of a value that appears in
# x[0] (-7.3390961e-01) — confirm it was chosen deliberately.
model = GaussianMixture(n_components = 32, covariance_type = 'diag' , verbose = 1, reg_covar = 7.3390961e-01, random_state = 0)
# fit the model
model.fit(x[0])

Initialization 0
Initialization converged: True


GaussianMixture(covariance_type='diag', n_components=32, reg_covar=0.73390961,
                verbose=1)

In [395]:
# Display the stacked MFCC/delta matrix (first element of the tuple returned by feature_exctraction).
x[0]

array([[-6.6177435e+02, -6.5714014e+02, -6.3939355e+02, ...,
        -6.0668573e+02, -6.2780914e+02, -6.3468384e+02],
       [ 4.0866790e+00,  9.1505661e+00,  2.4921623e+01, ...,
         4.4533257e+01,  3.2892204e+01,  3.1198538e+01],
       [ 3.9115591e+00,  5.5146565e+00,  1.2664882e+00, ...,
        -1.1406961e+01,  2.3442328e+00,  1.0434496e+01],
       ...,
       [ 3.5855359e-01,  3.5855359e-01,  3.5855359e-01, ...,
        -7.3390961e-01, -7.3390961e-01, -7.3390961e-01],
       [ 3.3223963e-01,  3.3223963e-01,  3.3223963e-01, ...,
        -1.2716857e+00, -1.2716857e+00, -1.2716857e+00],
       [-1.4997593e-01, -1.4997593e-01, -1.4997593e-01, ...,
        -1.2831343e+00, -1.2831343e+00, -1.2831343e+00]], dtype=float32)

In [403]:
# Mean vectors of the fitted mixture, one row per component.
matrix = model.means_

In [404]:
# Dimensions of the means matrix: (number of components, number of features the model was fitted on).
matrix.shape

(32, 171)

In [433]:
# Load one test utterance, keep only its voiced samples (as an array) and
# extract the stacked MFCC matrix. Note that this rebinds the global `sr`.
test, sr = librosa.load('./Dataset/Database for Speaker Recognition-20 spkr-TIMIT/test/F(5)/F(1057).wav')
test = np.asarray(vad(test))
mfcc = feature_exctraction(test)[0]

In [432]:
# Play back the silence-stripped test utterance.
listen(test, sr)

In [452]:
# NOTE(review): `z` is not defined in any visible cell, and this cell's
# execution count (452) is higher than that of the cells that follow it, so it
# appears to depend on state created later in the notebook. Unless `z` is
# defined in a part not shown here, this will raise NameError under
# Restart & Run All.
model.predict(z)

array([ 1, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29], dtype=int64)

In [435]:
# Shape of the test-utterance feature matrix computed above.
mfcc.shape

(39, 133)

In [446]:
# Zero block whose width is the gap between the feature count the fitted model
# expects and the number of columns in the test matrix — presumably appended to
# `mfcc` so it can be passed to model.predict (confirm against the cell that
# uses it). The sizes are derived from the objects themselves rather than
# hard-coded as 39 and 171-133, so the cell keeps working for other clips;
# np.zeros raises ValueError if the test matrix is wider than the model.
arr = np.zeros(shape = (mfcc.shape[0], model.means_.shape[1] - mfcc.shape[1]))