In [1]:
import numpy as np
import librosa
from pydub import AudioSegment
from pydub.utils import mediainfo
from sklearn import preprocessing
def mfcc_extraction(audio_filename, #.wav filename
 hop_duration, #hop_length in seconds, e.g., 0.015s (i.e., 15ms)
 num_mfcc #number of mfcc features
 ):
    speech = AudioSegment.from_wav(audio_filename) #Read audio data from file
    samples = speech.get_array_of_samples() #samples x(t)

    sampling_rate = speech.frame_rate #sampling rate f

    mfcc = librosa.feature.mfcc(
    np.float32(samples),
    sr = sampling_rate,
    hop_length = int(sampling_rate * hop_duration),
    n_mfcc = num_mfcc)

    return mfcc.T

In [2]:
from sklearn.mixture import GaussianMixture
def learningGMM(features, #list of feature vectors, each feature vector is an array
 n_components, #the number of components
 max_iter #maximum number of iterations
 ):
    gmm = GaussianMixture(n_components = n_components, max_iter = max_iter)
    gmm.fit(features)
    return gmm

In [3]:
import os
path = 'SpeakerData/'
speakers = os.listdir(path + 'Train/')
print(speakers)

['Anthony', 'Azmisov', 'Bachroxx', 'Arthur', 'Bahoke', 'Artem', 'BelmontGuy', 'AppleEater', 'Bareford', 'Asladic', 'Bassel', 'Argail', 'Ariyan', 'Asalkeld', 'Arvala', 'Artk', 'Beady', 'Arjuan', 'Bart', 'Asp', 'Beez', 'Ara', 'Bae', 'Arun', 'B']


In [63]:
from sklearn import preprocessing
#this list is used to store the MFCC features of all training data of all speakers
mfcc_all_speakers = []
hop_duration = 0.015 #15ms
num_mfcc = 14
for s in speakers:
    sub_path = path + 'Train/' + s + '/'
    sub_file_names = [os.path.join(sub_path, f) for f in os.listdir(sub_path)]
    mfcc_one_speaker = np.asarray(())
    for fn in sub_file_names:
        mfcc_one_file = mfcc_extraction(fn, hop_duration, num_mfcc)
        if mfcc_one_speaker.size == 0:
            mfcc_one_speaker = mfcc_one_file
        else:
            mfcc_one_speaker = np.vstack((mfcc_one_speaker, mfcc_one_file))
            mfcc_all_speakers.append(mfcc_one_speaker)


In [64]:
n_components = 5
max_iter = 50
gmms = [] #list of GMMs, each is for a speaker
for i in range(0, len(speakers)):
    gmm = learningGMM(mfcc_all_speakers[i],
    n_components,
    max_iter)
    gmms.append(gmm)

In [65]:
def speaker_recognition(audio_file_name, gmms):
    #print("File: ", audio_file_name)
    speaker_id = 0
    f = mfcc_extraction(audio_file_name, 0.015, 14)
    max = gmms[0].score(f)

    for i in range(0, len(gmms)):
        s = gmms[i].score(f)
        #print(s)
        if(s >= max):
            max = s
            speaker_id = i
            #print(max, speaker_id)
    return speaker_id

In [66]:
speaker_id = speaker_recognition('SpeakerData/Test/Argail/rb-17.wav', gmms)
print("Speaker: ", speakers[speaker_id])

Speaker:  Arun


In [67]:
path = 'SpeakerData/'
speakers = os.listdir(path + 'Test/')

test_file_names = []
test_speaker_labels = []
for i in range(0, len(speakers)):
    sub_path = path + 'Test/' + speakers[i] + '/'
    sub_file_names = [os.path.join(sub_path, f) for f in os.listdir(sub_path)]
    sub_speaker_labels = [i] * len(sub_file_names) #create a list of N elements, all are i
    test_file_names += sub_file_names
    test_speaker_labels += sub_speaker_labels

print(test_file_names)
print(test_speaker_labels)




['SpeakerData/Test/Anthony/a0495.wav', 'SpeakerData/Test/Anthony/a0500.wav', 'SpeakerData/Test/Anthony/a0496.wav', 'SpeakerData/Test/Anthony/a0499.wav', 'SpeakerData/Test/Anthony/a0494.wav', 'SpeakerData/Test/Anthony/a0498.wav', 'SpeakerData/Test/Anthony/a0497.wav', 'SpeakerData/Test/Azmisov/a0008.wav', 'SpeakerData/Test/Azmisov/a0006.wav', 'SpeakerData/Test/Azmisov/a0009.wav', 'SpeakerData/Test/Azmisov/a0012.wav', 'SpeakerData/Test/Azmisov/a0007.wav', 'SpeakerData/Test/Azmisov/a0010.wav', 'SpeakerData/Test/Azmisov/a0011.wav', 'SpeakerData/Test/Bachroxx/b0113.wav', 'SpeakerData/Test/Bachroxx/b0110.wav', 'SpeakerData/Test/Bachroxx/b0114.wav', 'SpeakerData/Test/Bachroxx/b0111.wav', 'SpeakerData/Test/Bachroxx/b0116.wav', 'SpeakerData/Test/Bachroxx/b0112.wav', 'SpeakerData/Test/Bachroxx/b0115.wav', 'SpeakerData/Test/Arthur/rp-29.wav', 'SpeakerData/Test/Arthur/ar-04.wav', 'SpeakerData/Test/Arthur/rp-30.wav', 'SpeakerData/Test/Arthur/ar-06.wav', 'SpeakerData/Test/Arthur/rp-28.wav', 'SpeakerD

In [68]:
pred = []
for i in test_file_names:
    pred.append(speaker_recognition(i, gmms))


In [69]:
sum(1 for x,y in zip(test_speaker_labels, pred) if x == y) / float(len(test_speaker_labels))


0.0