IMPORT LIBRARIES

In [3]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from IPython.display import Audio
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from scipy.stats import skew,kurtosis
import pandas as pd
import numpy as np
import librosa
import os


#auto correlation function at lag k

#it extracts the relationship between past and future data in a time series : about 10 autocorrelations are usually valuable
def calculate_auto_corr(time_series_signal,k):
    series_mean=np.mean(time_series_signal)

    l=len(time_series_signal)

    autocorrelation_num=np.sum((time_series_signal[:l-k]-series_mean)*(time_series_signal[k:]-series_mean))
    autocorrelation_den=np.sum((time_series_signal-series_mean)**2)

    return autocorrelation_num/(autocorrelation_den+1e6)

TO DO 

In [None]:
# change n_components to 30 each                           (decreases 8%)-------- optimal at n=2 (26%)
# upsampling,downsampling                                  (decreases 1%)   
# try cross-validation                                     (+1-4% extra in cross validations)
# add features (spectral centroid etc)                     (29% accuracy now)
# apply PCA or other stats (variance, median,iqr)               47.6%  
# normalization                                                 bane (decreases) 2-3%
# also check :- autocorrelation* ,fft  (kurtosis,skew)      ~60%

# add PLP, LPCC features                            
# weighted model (ensemble)                        
# remove background noise in some way  (band pass filter) 

OBSERVATIONS

In [None]:
# 13 mel mean, (chroma, zcr, spec_bw, spectral cent, rolloff) means -      26%  [18 features] - 16min
# 13 mel mean,var                             (" "  "  " ~ 5) means -      39%  [31 features] - 16min
# 13 mel mean,var      (zcr, spec_bw, spectral cent, rolloff) means -      40%  [30 features] - 20min
# 13 mel mean,var,pca  (zcr, spec_bw, spectral cent, rolloff) means -      42%  [30 features] - 23min
# 13 mel mean,median,var,pca  (" " " " ~ $")                  means -      44%  [56 features] - 16min
# -------------------------------------------------------------------------------------------------- 
# 13 mel mean,var,median        (" "  "  " ~  5) var, median, means -      47.8%  [55 features] - 42min
# 13 mel mean,var,median,pca    (" "  "  " ~  4) var, median, means -          %  [64 features] -   min (do at the end to boost by 2-3%)
# 13 mel mean,var,med,autocorr,pca,iq(" "  "  " ~  4) var, median, means-    54%  [90 features] - 19min
# 13 mel mean,var,med,pca,iq + (15X13autocorr) + (" "  "  " ~  4) ""  ~3-    65%  [272 features]- 40+9 min
# 13 mel mean,var,med,pca,iq, + (15X13autocorr)+ (13ffts(44mean)) + (" "  "  " ~  4) ""-  64%  [285 features]- 30+9 min


# 13 mel mean,var,med,pca,iq,skew,kurtosis + (15X13autocorr) + (" "  "  " ~  4) ""    -  66%  [311 features]-  min
# use min-max scaler on the best accuracy - 64% :(         

PROPERTIES

In [None]:
# MEL features - They represent the short-term power spectrum of a sound signal of the audio signal in a way that is more aligned with human auditory perception.          If a cepstral coefficient has a positive value, the majority of the spectral energy is concentrated in the low-frequency regions. On the other hand, if a cepstral coefficient has a negative value, it represents that most of the spectral energy is concentrated at high frequencies.
# spectral centroid - center of mass of spectrum (brightness of sound)
# spectral rolloff - frequency below which a specified percentage of the total spectral energy lies  (also provides info about center)
# spectral bandwidth -information about how spread out the frequencies are
# zcr - the number of times the signal crosses zero per unit of time. (noisiness of signal)
# Autocorrelation - It extracts the relationship between past and future data in a time series : about 10 autocorrelations are usually valuable
# FFT -  convert time domain to frequency domain

FEATURE EXTRACTION

In [9]:
def extract_MFCC_features(file_path):
    audio,sr= librosa.load(file_path)        
    mfcc_features=librosa.feature.mfcc(y=audio,sr=sr,n_mfcc=13)
    auto_list=[]    # List to store the autocorrelation features
    four_list=[]
    
    
    # spectral features
    zcr = librosa.feature.zero_crossing_rate(audio)
    spec_bw = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
    spec_cent = librosa.feature.spectral_centroid(y=audio, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
    
    
    pca=PCA(n_components=1)


    # statistical features
    mean_per_feature=np.mean(mfcc_features,axis=1).tolist()
    var_per_feature=np.var(mfcc_features,axis=1).tolist()
    median_per_feature=np.median(mfcc_features,axis=1).tolist()
    mfcc_pca=pca.fit_transform(mfcc_features).flatten().tolist()
    iqr_per_feature=(np.percentile(mfcc_features,75,axis=1)-np.percentile(mfcc_features,25,axis=1)).tolist() #inter-quartile percentile
    skew_per_feature=skew(mfcc_features,axis=1).tolist()
    kurtosis_per_feature=kurtosis(mfcc_features,axis=1).tolist()
    
    # Extracting 15 autocorrelation coefficients for each mel feature 
    for a in mfcc_features:
        for i in range (1,16):
            curr_lag=calculate_auto_corr(a,i)
            auto_list.append(curr_lag)
    
    # Extracting 13 Fourier features (a mean of 44 for each mel feature)
    for a in mfcc_features:
        curr_fft=np.abs(np.fft.fft(a)).tolist()
        four_list.append(np.mean(curr_fft))
    
    
    features=mean_per_feature+var_per_feature+median_per_feature+mfcc_pca+iqr_per_feature+auto_list+four_list+skew_per_feature+kurtosis_per_feature+[np.mean(spec_cent),np.mean(rolloff),np.mean(spec_bw),np.mean(zcr),np.var(spec_cent),np.var(rolloff),np.var(spec_bw),np.var(zcr),np.median(spec_cent),np.median(rolloff),np.median(spec_bw),np.median(zcr)]   # 8*13+ 15*13 +3*4= 311 features
    return features




labels=['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']
class_to_label={'right': 0, 'eight': 1, 'cat': 2, 'tree': 3, 'bed': 4, 'happy': 5, 'go': 6, 'dog': 7, 'no': 8, 'wow':9, 'nine': 10, 'left': 11, 'stop': 12, 'three': 13, 'sheila': 14, 'one': 15, 'bird': 16, 'zero': 17, 'seven': 18, 'up': 19,'marvin': 20, 'two': 21, 'house': 22, 'down': 23, 'six': 24, 'yes': 25, 'on': 26, 'five': 27, 'off': 28, 'four': 29}
label_to_class={0: 'right', 1: 'eight', 2: 'cat', 3: 'tree', 4: 'bed', 5: 'happy', 6: 'go', 7: 'dog', 8: 'no', 9: 'wow', 10: 'nine', 11: 'left', 12: 'stop', 13: 'three', 14: 'sheila', 15: 'one', 16: 'bird', 17: 'zero', 18: 'seven', 19: 'up', 20: 'marvin', 21: 'two', 22: 'house', 23: 'down', 24: 'six', 25: 'yes', 26: 'on', 27: 'five', 28: 'off', 29: 'four'}
alpha_order_to_label_order={}

# Mapping
for a in range(0,len(labels)): 
    alpha_order_to_label_order[a]=class_to_label[labels[a]]


In [2]:
# Function to extract features from audio files (submission20.csv)
#  20-25min
source_folder="SpeechCommand_Copy/"

X_train_per_class=[]
y_train_per_class=[]
desired_examples_per_class=2178
scaler=MinMaxScaler(feature_range=(0,1))
cnt=0
i=0
for class_folder in os.listdir(source_folder):
    if class_folder=="_background_noise_": continue
    
    curr_class_path=os.path.join(source_folder,class_folder)
    curr_class_audio_paths=os.listdir(curr_class_path)

    
    X_train_per_class.append([])
    labels.append(class_folder)

    for audio_path in curr_class_audio_paths:
        f=extract_MFCC_features(os.path.join(curr_class_path,audio_path))
        X_train_per_class[i].append(f)
        cnt+=1
        
        if (cnt%10000==0): print(cnt)
    
    
    # X_train_per_class[i]=scaler.fit_transform(X_train_per_class[i])
    i+=1

10000
20000
30000
40000
50000


TRAIN GMMs and do mapping

In [4]:
# Train a Gaussian Mixture Model
gmms=[]
for i in range(0,30): # class 0 is bed in X_TRAIN
    gmm=GaussianMixture(n_components=2)
    gmm.fit(X_train_per_class[i])
    gmms.append(gmm)

MAKE PREDICTIONS

In [12]:
# 2min
test_source_folder="SpeechCommandTest/"
test_csv_input="test.csv"
output_file_name="submissions.csv"
X_test=[]



# for each id in test.csv_input see the path and use it to extract audio from kaggle drive link
# Then open that file and extract features and put in X_test
# make predictions and update in test.csv_output

df2=pd.read_csv(test_csv_input)
for id, row in df2.iterrows():
    audio_path=os.path.join(test_source_folder,row["AUDIO_FILE"])
    X_test.append(extract_MFCC_features(audio_path))
X_test=np.array(X_test)
# X_test=scaler.fit_transform(X_test)



# Predict labels on the test set
y_pred=[]
for a in X_test:
    likelihoods_per_class=[gmm.score(a.reshape(1,-1)) for gmm in gmms]
    predicted_class=np.argmax(likelihoods_per_class)
    y_pred.append(alpha_order_to_label_order[predicted_class])
y_pred=np.array(y_pred)
     


output_list=[]
for i, row in df2.iterrows():
    curr_id=row["ID"]
    prediction=y_pred[i]
    output_list.append([curr_id,prediction])
output_df=pd.DataFrame(output_list, columns=['ID', 'TARGET'])




# Write the DataFrame to a new CSV file
counter=1  
while True:
    output_file_name=f"submissions/submissions_{counter}.csv"
    try:
        with open(output_file_name, "r"):
            counter += 1
            
    except FileNotFoundError:
        output_df.to_csv(output_file_name, index=False)
        break
output_df.to_csv(output_file_name, index=False)

<br><br><br>
CROSS VALIDATION

In [3]:
# 1min
y_train_per_class=[]

# TRAIN test split
X_train_per_class_validated=[]
X_test_validated=[]
y_test_validated=[]

for i in range(0,30): # class 0 is bed in X_TRAIN
    y_train_per_class.append([])
    y_train_per_class[i]=[alpha_order_to_label_order[i]]*len(X_train_per_class[i])
    X_train_split, X_test_split,Y_train_split,Y_test_split =train_test_split(X_train_per_class[i],y_train_per_class[i], test_size=0.2, random_state=i)

    X_train_per_class_validated.append(X_train_split)
    X_test_validated+=X_test_split
    y_test_validated+=[alpha_order_to_label_order[i]]*len(X_test_split)
    
X_test_validated=np.array(X_test_validated)

    
    
    
# Train 30 Gaussian Mixture Models
gmms=[]
for i in range(0,30): # class 0 is bed in X_TRAIN
    gmm=GaussianMixture(n_components=2)
    gmm.fit(np.array(X_train_per_class_validated[i]))
    gmms.append(gmm)
    
    
    

# Predict labels on the test set
y_pred=[]
for a in X_test_validated:
    likelihoods_per_class=[gmm.score(a.reshape(1,-1)) for gmm in gmms]
    predicted_class=np.argmax(likelihoods_per_class)
    y_pred.append(alpha_order_to_label_order[predicted_class])
y_pred=np.array(y_pred)
     
     
accuracy_score=0
correct=0
total=len(X_test_validated)
for a in range(0,len(y_pred)):
    if (y_pred[a]==y_test_validated[a]):
        correct+=1
print(f"Accuracy= {(correct/total)*100}")

Accuracy= 71.62301860785665


<br><br><br> PICKLING AND TESTING SCRIPT

In [1]:
import pickle                               # works only for generating csv file 

# Save GMM models to a pickle file
# with open('gmm_models.pkl', 'wb') as file:
#     pickle.dump(gmms, file)
    
# save X_train data
# with open('train.pkl', 'wb') as file:
#     pickle.dump(X_train_per_class, file)

In [11]:
with open('2021452/gmm_models.pkl', 'rb') as file:
    gmms = pickle.load(file)
    l=["SpeechCommandTest/A_3534412c675c_nohash_0.wav"]
    preds=[]
    for audio_path in l:
        audio,sr=librosa.load(audio_path)
        # Audio(data=audio, rate=sr) 
        feature=np.array(extract_MFCC_features(audio_path))
        
        
        likelihoods_per_class=[gmm.score(feature.reshape(1,-1)) for gmm in gmms]
        predicted_class=np.argmax(likelihoods_per_class)
        predicted_class=alpha_order_to_label_order[predicted_class]
        # print(predicted_class)
        # print(label_to_class[predicted_class])
        preds.append(predicted_class)
    print(preds)
   

[15]


TESTING PREDICTIONS BY AUDIO

In [22]:
file="SpeechCommandTest/A_3534412c675c_nohash_0.wav"
audio,sr= librosa.load(file)
Audio(data=audio, rate=sr) 