In [1]:
import numpy as np
from sklearn import preprocessing
import python_speech_features as mfcc

In [6]:
#feature extraction mfcc & delta mfcc
def calculate_delta(array):
    """Calculate and return the delta (first-order difference) of a feature matrix.

    For every row ``t`` the delta is the regression-style difference over a
    window of ``N = 2`` rows on each side::

        delta[t] = sum_{j=1..N} j * (array[t + j] - array[t - j])
                   / (2 * sum_{j=1..N} j**2)

    Row indices that would fall outside the matrix are clamped to the first /
    last row, so edge rows reuse the boundary row.

    Parameters
    ----------
    array : numpy.ndarray
        2-D matrix of shape ``(rows, cols)``; one feature vector per row.

    Returns
    -------
    numpy.ndarray
        Float matrix with the same shape as ``array`` holding one delta vector
        per input row. An input with zero rows yields an empty ``(0, cols)``
        matrix.
    """
    rows, cols = array.shape
    # Size the output from the input instead of assuming 20 columns.
    deltas = np.zeros((rows, cols))
    N = 2
    # Normalisation term of the delta formula; equals 10 for N = 2.
    denominator = 2 * sum(j * j for j in range(1, N + 1))

    row_idx = np.arange(rows)
    for j in range(1, N + 1):
        # Clamp the look-behind / look-ahead indices to the valid row range.
        behind = np.maximum(row_idx - j, 0)
        ahead = np.minimum(row_idx + j, rows - 1)
        deltas += j * (array[ahead] - array[behind])

    # Return only after every row has been processed.
    return deltas / denominator

In [7]:
def extract_features(audio,rate):
    """Build the combined MFCC + delta-MFCC feature matrix for one audio signal.

    Steps:
      1. compute 20 cepstral coefficients per frame with
         ``python_speech_features.mfcc`` (positional arguments 0.025, 0.01, 20;
         ``nfft=1200``, ``appendEnergy=True``);
      2. normalise the coefficients with ``sklearn.preprocessing.scale``;
      3. compute the deltas of the normalised coefficients with
         ``calculate_delta``;
      4. place the deltas to the right of the coefficients, giving the
         40-column matrix that is returned.

    Parameters
    ----------
    audio : array-like
        Audio samples, passed unchanged to ``mfcc.mfcc``.
    rate : int
        Sampling rate of ``audio``, passed unchanged to ``mfcc.mfcc``.
    """
    cepstra = preprocessing.scale(
        mfcc.mfcc(audio, rate, 0.025, 0.01, 20, nfft=1200, appendEnergy=True)
    )
    cepstra_delta = calculate_delta(cepstra)
    return np.hstack((cepstra, cepstra_delta))

In [8]:
#training the model

In [9]:
import os.path

In [10]:
import _pickle as cPickle

In [11]:
from scipy.io.wavfile import read
from sklearn.mixture import GaussianMixture

In [14]:
import warnings
# Globally ignore all Python warnings raised from this point on.
# NOTE(review): this also hides any warnings emitted by the libraries used in
# the training and testing cells.
warnings.filterwarnings("ignore")
# Training-data directory. NOTE(review): the next cell reassigns `source`
# (to "trainingData\\") before using it, so this value is effectively unused.
source   = "trainingData/"   

In [15]:
 
# Path to the training data. The relative paths listed in `train_file` are
# appended to this prefix by plain string concatenation.
source   = "trainingData\\"

# Directory prefix under which the trained speaker models are saved.
dest = "Speakers_models\\"
train_file = "trainingDataPath.txt"

# Number of consecutive entries in `train_file` that belong to one speaker.
# A model is trained each time this many files have been accumulated.
files_per_speaker = 3

count = 0
features = np.asarray(())

# `with` guarantees the path list is closed even if reading or training fails.
with open(train_file, 'r') as file_paths:
    for path in file_paths:
        path = path.strip()
        if not path:
            # Skip blank lines (e.g. a trailing newline at the end of the list).
            continue
        print (path)

        # read the audio
        sr, audio = read(source + path)

        # extract the combined MFCC & delta MFCC features
        vector = extract_features(audio, sr)

        # Stack the frames of all files of the current speaker row-wise.
        if features.size == 0:
            features = vector
        else:
            features = np.vstack((features, vector))
        count += 1

        # Once all files of the speaker are concatenated, train the model.
        if count == files_per_speaker:
            gmm = GaussianMixture(n_components = 16, max_iter = 200, covariance_type='diag', n_init = 3)
            gmm.fit(features)

            # The model file is named after the text before the first "-" of
            # the last path of the group (e.g. "darren-004\..." -> "darren.gmm").
            picklefile = path.split("-")[0] + ".gmm"
            # Close the output file deterministically so the pickle is flushed.
            with open(dest + picklefile, 'wb') as model_file:
                cPickle.dump(gmm, model_file)
            # Fill the placeholders instead of printing the raw template.
            print (' modeling completed for speaker={0} with data point ={1} '.format(picklefile, features.shape))

            # Start accumulating the next speaker from scratch.
            features = np.asarray(())
            count = 0

darren-004\en-0920.wav
darren-004\en-0921.wav
darren-004\en-0922.wav
 modeling completed for speaker={0} with data point ={1}  darren.gmm (1696, 40)
david-003\en-0387.wav
david-003\en-0388.wav
david-003\en-0389.wav
 modeling completed for speaker={0} with data point ={1}  david.gmm (1809, 40)
frank-005\en-0632.wav
frank-005\en-0633.wav
frank-005\en-0634.wav
 modeling completed for speaker={0} with data point ={1}  frank.gmm (1911, 40)
John-001\en-1167.wav
John-001\en-1168.wav
John-001\en-1169.wav
 modeling completed for speaker={0} with data point ={1}  John.gmm (1496, 40)
rocky-002\en-0717.wav
rocky-002\en-0718.wav
rocky-002\en-0719.wav
 modeling completed for speaker={0} with data point ={1}  rocky.gmm (1396, 40)


In [18]:
#testing the model

In [16]:
import os
import time
# Path prefix of the test audio; entries of `test_file` are appended to it.
source   = "SampleData\\"
# Directory holding the trained "*.gmm" speaker models.
modelpath = "Speakers_models\\"
test_file = "testSamplePath.txt"

# Sorted so the model order (and therefore argmax tie-breaking) does not depend
# on the arbitrary ordering returned by os.listdir.
gmm_names = sorted(fname for fname in os.listdir(modelpath) if fname.endswith('.gmm'))
gmm_files = [os.path.join(modelpath, fname) for fname in gmm_names]

# Load the trained speaker GMMs, closing each file as soon as it is read.
# NOTE(security): unpickling can execute arbitrary code; only load model files
# that were produced by the training cell of this notebook.
models = []
for fname in gmm_files:
    with open(fname, 'rb') as model_file:
        models.append(cPickle.load(model_file))

# Speaker label = model file name without its ".gmm" extension. Taken from the
# bare file name so no platform-specific path separator has to be parsed.
speakers = [os.path.splitext(fname)[0] for fname in gmm_names]

# Read the list of test audio files and classify each one.
with open(test_file, 'r') as file_paths:
    for path in file_paths:
        path = path.strip()
        if not path:
            # Skip blank lines (e.g. a trailing newline at the end of the list).
            continue
        print (path)
        sr, audio = read(source + path)
        vector = extract_features(audio, sr)

        # Score the utterance against every speaker model; GaussianMixture.score
        # already returns a single scalar per call.
        log_likelihood = np.array([gmm.score(vector) for gmm in models])

        # The speaker whose model gives the highest score wins.
        winner = np.argmax(log_likelihood)
        print ("\tdetected as - ", speakers[winner])
        # Pause between files, as in the original cell (presumably to pace the output).
        time.sleep(1.0)

darren_01.wav
	detected as -  darren
david_02.wav
	detected as -  david
frank_03.wav
	detected as -  frank
John_04.wav
	detected as -  frank
rocky_05.wav
	detected as -  rocky
