In [1]:
# Import the AudioSegment class for processing audio and the 
# split_on_silence function for separating out silent chunks.
from pydub import AudioSegment 
from pydub.silence import split_on_silence
import numpy as np, matplotlib.pyplot as plot, librosa, librosa.display, sklearn, sys
from sklearn.mixture import GaussianMixture as GMM
import os, _pickle as cPickle, warnings
warnings.filterwarnings("ignore")

In [2]:
#normalize a chunk to a target amplitude.
def match_target_amplitude(aChunk, target_dBFS):
    ''' Normalize given audio chunk to a target loudness.

    aChunk      -- audio chunk exposing a `dBFS` attribute and an
                   `apply_gain(gain_in_dB)` method (e.g. a pydub AudioSegment)
    target_dBFS -- loudness, in dBFS, the returned chunk should have

    Returns the gain-adjusted chunk. A completely silent chunk is returned
    unchanged, because no finite gain can bring it to the target level.
    '''
    # Pure digital silence reports -inf dBFS; the difference below would then
    # be +inf and an infinite gain would be handed to apply_gain.
    if aChunk.dBFS == float("-inf"):
        return aChunk

    change_in_dBFS = target_dBFS - aChunk.dBFS
    return aChunk.apply_gain(change_in_dBFS)

#load an audio file as a numpy array of samples (silence removal, normalization and trimming are currently disabled)
def remove_silence(path):
    '''Load the audio file at `path` and return its samples as a numpy array.

    Despite the name, no silence removal, normalization or trimming is
    performed here: an earlier pipeline (split_on_silence, per-chunk
    normalization to -20 dBFS, trim to the first 60 s) was disabled, so the
    decoded samples are returned as-is.

    NOTE(review): for multi-channel files pydub presumably returns the
    channels interleaved in one flat sequence -- confirm the inputs are mono.
    '''
    # Decode the file with pydub.
    audio = AudioSegment.from_file(path)
    # Raw sample sequence -> numpy array.
    return np.array(audio.get_array_of_samples())

In [3]:
#extracting mfccs and scaling them
def scaled_mfccs(song_array):
    '''Compute 20 MFCCs for a sample array and standardize each coefficient.

    song_array -- numpy array of audio samples (any numeric dtype; it is
                  converted to float before feature extraction)

    Returns the MFCC matrix with one row per coefficient, where every row has
    been scaled to zero mean and unit variance along axis 1.

    NOTE(review): no sampling rate is passed, so librosa's default `sr` is
    used regardless of the rate of the source file -- confirm they match.
    '''
    # Imported explicitly: a bare `import sklearn` is not guaranteed to load
    # the `preprocessing` submodule.
    from sklearn.preprocessing import scale

    song_array = song_array.astype(float)
    # The signal must be passed as `y=`: recent librosa releases make the
    # arguments of feature.mfcc keyword-only and reject a positional signal.
    mfccs = librosa.feature.mfcc(y=song_array, n_mfcc=20)

    #scaling the MFCCs such that each coefficient dimension has zero mean and unit variance
    mfccs = scale(mfccs, axis=1)
    return mfccs

In [24]:
#training dataset location text file
location = "audio_files/training/"
dest = "trained_models/"

#layout of the training set: train.txt lists the clips grouped by singer,
#24 clips of 10 sec per singer, and each clip is expected to give a 20 x 431
#MFCC matrix
songs_per_speaker = 24
n_mfcc = 20
n_frames = 431

#the MFCC matrices of one singer are stacked vertically -> 480 x 431
features = np.empty([songs_per_speaker * n_mfcc, n_frames])

#make sure the output folder exists before the first model is written
os.makedirs(dest, exist_ok=True)

count = 0   #clips collected so far for the current singer
i = 0       #row offset of the next MFCC matrix inside `features`
with open("train.txt", "r") as training_file:
    for path in training_file:
        #remove leading and trailing spaces
        path = path.strip()
        #skip blank lines instead of trying to load the bare directory
        if not path:
            continue

        song_array = remove_silence(location+path)
        mfccs = scaled_mfccs(song_array)

        #raises ValueError if the clip does not yield a 20 x 431 matrix
        features[i:i+n_mfcc, :] = mfccs
        i = i+n_mfcc
        count = count+1

        if count == songs_per_speaker:
            # NOTE(review): scikit-learn treats rows as samples, so this fits
            # the model on 480 samples of 431 dimensions (one sample per MFCC
            # coefficient per clip). The scoring cell passes its 20 x 431
            # matrix in the same orientation, so switching to frame-wise
            # features requires changing both cells together.
            gmm = GMM(n_components=1).fit(features)

            #dump the results in pickle file, named after the id that
            #precedes the first "_" in the clip's file name
            picklefile = path.split("_")[0]+".gmm"
            with open(dest + picklefile, 'wb') as model_file:
                cPickle.dump(gmm, model_file)
            print('+ modeling completed for speaker:',picklefile," with data point = ",features.shape)

            #start collecting the next singer from the top of the buffer
            count = 0
            i = 0

#a trailing group with fewer than 24 clips never reaches the fit above
if count != 0:
    print('! skipped incomplete group of', count, 'clip(s) at the end of train.txt')

+ modeling completed for speaker: 32373500.gmm  with data point =  (480, 431)
+ modeling completed for speaker: 32449093.gmm  with data point =  (480, 431)
+ modeling completed for speaker: 36323632.gmm  with data point =  (480, 431)
+ modeling completed for speaker: 497880111.gmm  with data point =  (480, 431)
+ modeling completed for speaker: 498270772.gmm  with data point =  (480, 431)


In [26]:
#path to test data
# NOTE(review): this is the training folder and the loop below reads
# train.txt, so the printed detections are measured on the training clips,
# not on unseen data.
test_location = "audio_files/training/"

#path to trained models
modelpath = "trained_models/"

#get a list of path of all the GMM model files (sorted for a stable order)
gmm_files = sorted(os.path.join(modelpath,file) for file in
                   os.listdir(modelpath) if file.endswith('.gmm'))

if not gmm_files:
    raise FileNotFoundError("no .gmm model files found in " + modelpath)

#load the models from GMM files once, closing every file after reading it
# NOTE: unpickling executes code stored in the file -- only load model files
# produced by the training cell of this notebook.
models = []
for file in gmm_files:
    with open(file,'rb') as model_file:
        models.append(cPickle.load(model_file))

#extract the id of the speaker corresponding to each GMM model
speakers  = [os.path.splitext(os.path.basename(file))[0] for file
              in gmm_files]


with open("train.txt","r") as test_paths:
    for path in test_paths:
        path = path.strip()
        #skip blank lines instead of trying to load the bare directory
        if not path:
            continue

        song_array = remove_silence(test_location+path)
        mfccs = scaled_mfccs(song_array)

        #create an empty array to store the log-likelihood corresponding to each model
        log_likelihood = np.zeros(len(models))

        #score() already returns a single average log-likelihood per model;
        #the MFCC matrix is passed in the same orientation used for training
        for i, gmm in enumerate(models):
            log_likelihood[i] = gmm.score(mfccs)

        #getting the index of the model giving the maximum likelihood value
        winner = np.argmax(log_likelihood)
        print ("\tdetected as - ", speakers[winner])

	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32373500
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093
	detected as -  32449093


In [None]:
# NOTE(review): stray, never-executed cell; the bare literal has no effect and looks like a leftover.
8