In [79]:
# Import the AudioSegment class for processing audio and the 
# split_on_silence function for separating out silent chunks.
from pydub import AudioSegment 
from pydub.silence import split_on_silence
import numpy as np, matplotlib.pyplot as plot, librosa, librosa.display, sklearn, sys
from sklearn.mixture import GaussianMixture as GMM

In [2]:
#normalize a chunk to a target amplitude.
def match_target_amplitude(aChunk, target_dBFS):
    '''Apply the gain that brings an audio chunk to ``target_dBFS``.

    Parameters
    ----------
    aChunk : object exposing ``dBFS`` and ``apply_gain(gain_in_dB)``
        In this notebook, a pydub ``AudioSegment``.
    target_dBFS : float
        Desired loudness of the returned chunk, in dBFS.

    Returns
    -------
    Whatever ``aChunk.apply_gain`` returns, or ``aChunk`` itself (unchanged)
    when its loudness is ``-inf`` dBFS.
    '''
    current_dBFS = aChunk.dBFS

    # A chunk made only of zero samples reports -inf dBFS. The gain needed to
    # reach the target would then be +inf, which is not a usable gain, so a
    # silent chunk is passed through untouched.
    if current_dBFS == float("-inf"):
        return aChunk

    return aChunk.apply_gain(target_dBFS - current_dBFS)

#silence removal, normalization and trimming
def remove_silence(path, min_silence_len=3000, silence_thresh=-30,
                   target_dBFS=-20.0, clip_ms=3000):
    '''Load an audio file, drop long silences, normalize and clip it.

    Parameters
    ----------
    path : str
        Audio file to load with ``AudioSegment.from_file``.
    min_silence_len : int, optional
        Minimum length, in milliseconds, a quiet stretch must have to be
        treated as silence and used as a split point (default 3000 = 3 s).
    silence_thresh : float, optional
        Level in dBFS below which audio counts as silent (default -30).
    target_dBFS : float, optional
        Loudness every non-silent chunk is normalized to (default -20.0).
    clip_ms : int, optional
        Number of milliseconds kept from the start of the joined audio
        (default 3000 = 3 s).

    Returns
    -------
    numpy.ndarray
        The raw samples of the clipped audio as returned by
        ``get_array_of_samples``. If no non-silent chunk is found the
        array is empty.

    NOTE(review): for multi-channel files pydub is understood to return the
    channels interleaved in one flat array; the MFCC step presumably expects
    mono input -- confirm the training audio is mono.
    '''
    # Load the audio.
    song = AudioSegment.from_file(path)

    # Split the track wherever it stays quieter than `silence_thresh` for at
    # least `min_silence_len` ms; only the non-silent chunks are returned.
    chunks = split_on_silence(
        song,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    # Normalize every non-silent chunk and join them back to back.
    final_chunk = AudioSegment.empty()
    for chunk in chunks:
        final_chunk += match_target_amplitude(chunk, target_dBFS)

    # Trim the song: keep only the first `clip_ms` milliseconds
    # (segment slicing is expressed in milliseconds).
    final_song = final_chunk[:clip_ms]

    # Convert the song into a numpy array of samples.
    return np.array(final_song.get_array_of_samples())

In [43]:
#extracting mfccs and scaling them
def scaled_mfccs(song_array, sr=22050, n_mfcc=20):
    '''Compute MFCCs for a sample array and standardize each coefficient.

    Parameters
    ----------
    song_array : numpy.ndarray
        Audio samples; converted to ``float`` before feature extraction.
    sr : int, optional
        Sampling rate handed to librosa (default 22050, librosa's own
        default). NOTE(review): the rate of the decoded audio is not
        visible here -- confirm it matches.
    n_mfcc : int, optional
        Number of coefficients to compute (default 20, librosa's default).

    Returns
    -------
    numpy.ndarray
        MFCC matrix with one row per coefficient, scaled along axis 1.
    '''
    song_array = song_array.astype(float)

    # The signal must be passed as the keyword `y`: recent librosa releases
    # make it keyword-only and reject a positional argument.
    mfccs = librosa.feature.mfcc(y=song_array, sr=sr, n_mfcc=n_mfcc)

    # Scale the MFCCs such that each coefficient dimension (row) has zero
    # mean and unit variance across frames.
    mfccs = sklearn.preprocessing.scale(mfccs, axis=1)
    return mfccs

In [86]:
# Directory that the relative paths listed in train.txt are appended to.
location = "/media/abhiroopd/New Volume/audio/"

# Layout of the training data: 5 songs per singer, each song giving an MFCC
# matrix of size 20x130; the five matrices are stacked vertically.
SONGS_PER_SINGER = 5
N_MFCC = 20
N_FRAMES = 130
N_GMM_COMPONENTS = 3

# Scratch matrix reused for every singer; all rows are overwritten before
# each fit, so it does not need to be zero-initialized.
features = np.empty([SONGS_PER_SINGER * N_MFCC, N_FRAMES])

# One fitted model per singer, in the order the singers appear in train.txt.
# (`gmm` alone only ever holds the most recently fitted model.)
gmms = []

count = 1   # 1-based position of the current song within its singer's group
i = 0       # first row of `features` that the current song's MFCCs fill

# Print arrays in full. Set once here instead of on every loop iteration.
np.set_printoptions(threshold=sys.maxsize)

with open("train.txt", "r") as training_file:
    for path in training_file:
        # remove leading and trailing whitespace (including the newline)
        path = path.strip()
        if not path:
            # A blank line is not an audio file; skip it.
            continue

        song_array = remove_silence(location+path)
        mfccs = scaled_mfccs(song_array)

        # Raises ValueError if a song does not yield exactly N_MFCC x N_FRAMES.
        features[i:i+N_MFCC, :] = mfccs
        i = i+N_MFCC

        if count == SONGS_PER_SINGER:
            # All songs of this singer are stacked: fit and keep the model.
            # NOTE(review): fit() treats each row as one sample, i.e. 100
            # samples of 130 dimensions here; confirm that frames-as-samples
            # (the transpose) is not what was intended.
            gmm = GMM(n_components=N_GMM_COMPONENTS).fit(features)
            gmms.append(gmm)
            count = 0
            i = 0
        count = count+1



