### In this notebook we connstruct a knn model using a test-train-validatin split which achieves:
### 62% accuracy classifying 'fam_or_subfam' (15 possible values) on unseen test data, and
### 87% accuracy classifying 'critter_name' (3 possible values).

In [2]:
import librosa, librosa.display
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix

In [3]:
#Import the data file containing all features and classifications
df_all = pd.read_csv('MLNS_05302024.csv')

In [7]:
#Initial test-train split
df, df_test = train_test_split(df_all, shuffle=True, random_state=17, test_size=.2, stratify=df_all['fam_or_subfam'])

In [8]:
#Returns dataframe containing mfcc avg and var, and hs_mfcc avg and var, truncated to indicated depth, as well 
#as main_freq, range, max_mean, and peak_freq if True.
def truncate_mfcc(df, avg_depth=20, var_depth=10, hs_avg_depth=20, hs_var_depth=10, period_depth= 40, other_Features=True):
    df_temp=df['mfcc_'+str(0)+'_avg']
    df_temp=df_temp.rename('blah')
    for n in range(0,avg_depth):
        df_temp=pd.concat([df_temp, df['mfcc_'+str(n)+'_avg']], axis=1)
    for n in range(0,var_depth):
        df_temp=pd.concat([df_temp, df['mfcc_'+str(n)+'_var']], axis=1)
    for n in range(0,hs_avg_depth):
        df_temp=pd.concat([df_temp, df['hs_mfcc_'+str(n)+'_avg']], axis=1)
    for n in range(0,hs_var_depth):
        df_temp=pd.concat([df_temp, df['hs_mfcc_'+str(n)+'_var']], axis=1)
    for n in range(0,hs_var_depth):
        df_temp=pd.concat([df_temp, df['period_'+str(n)]], axis=1)
    if other_Features:
        df_temp=pd.concat([df_temp, df['main_freq']], axis=1)
        df_temp=pd.concat([df_temp, df['range']], axis=1)
        df_temp=pd.concat([df_temp, df['max_mean']], axis=1)
        df_temp=pd.concat([df_temp, df['peak_freq']], axis=1)
    return df_temp.drop(columns=['blah'])

In [9]:
#Computes the accuracy of knn at the given value of k, depths of mfcc features, other_features, and critter vs. fam_or_subfam
def knn_acc(df, k=5, avg_depth=40, var_depth=40, hs_avg_depth=40, hs_var_depth=40, other_Features=True, crit=False):
    X=truncate_mfcc(df=df, avg_depth=avg_depth,var_depth=var_depth,hs_avg_depth=hs_avg_depth,hs_var_depth=hs_var_depth,other_Features=other_Features)
    if crit:
        Y = df['critter_name']
    else:
        Y = df['fam_or_subfam']
    X_train, X_val, y_train, y_val = train_test_split(X.copy(), Y, shuffle=True, random_state=17, test_size=.2, stratify=Y)
    knn_pipe = Pipeline([('scale', StandardScaler()), ('knn', KNeighborsClassifier(k))])
    knn_pipe.fit(X_train, y_train)
    pred_train = knn_pipe.predict(X_train)
    pred_val = knn_pipe.predict(X_val)
    score_train = accuracy_score(y_train,pred_train)
    score_val = accuracy_score(y_val,pred_val)
    return score_train, score_val

### The data below shows that using all of our features narrows the gap between training accuracy and validation accuracy.

In [10]:
#Runs knn over the indicated depth of all 4 mfcc features, including the 4 other features, and the values of k for knn.
for depth in range(1,40,5):
    for k in range(1,30,5):
        print(f"knn (Train, Validation) accuracy when mfcc depth = {depth} and k = {k}: {knn_acc(df,k,depth,depth,depth,depth,True,False)}") 

knn (Train, Validation) accuracy when mfcc depth = 1 and k = 1: (1.0, 0.510593220338983)
knn (Train, Validation) accuracy when mfcc depth = 1 and k = 6: (0.6614569536423841, 0.5434322033898306)
knn (Train, Validation) accuracy when mfcc depth = 1 and k = 11: (0.6193377483443708, 0.5603813559322034)
knn (Train, Validation) accuracy when mfcc depth = 1 and k = 16: (0.5933774834437087, 0.5635593220338984)
knn (Train, Validation) accuracy when mfcc depth = 1 and k = 21: (0.5803973509933775, 0.5699152542372882)
knn (Train, Validation) accuracy when mfcc depth = 1 and k = 26: (0.5695364238410596, 0.5603813559322034)
knn (Train, Validation) accuracy when mfcc depth = 6 and k = 1: (1.0, 0.6038135593220338)
knn (Train, Validation) accuracy when mfcc depth = 6 and k = 6: (0.7215894039735099, 0.614406779661017)
knn (Train, Validation) accuracy when mfcc depth = 6 and k = 11: (0.688476821192053, 0.6027542372881356)
knn (Train, Validation) accuracy when mfcc depth = 6 and k = 16: (0.661986754966887

### We now use all of our features and try to find value of k which maximizes the validation accuracy.

In [12]:
for k in range(1,50):
    print(f"knn (Train, Validation) accuracy when k = {k}: {knn_acc(df,k,40,40,40,40,True,False)}") 

knn (Train, Validation) accuracy when k = 1: (1.0, 0.6197033898305084)
knn (Train, Validation) accuracy when k = 2: (0.808476821192053, 0.5985169491525424)
knn (Train, Validation) accuracy when k = 3: (0.783046357615894, 0.6101694915254238)
knn (Train, Validation) accuracy when k = 4: (0.7560264900662251, 0.614406779661017)
knn (Train, Validation) accuracy when k = 5: (0.7398675496688741, 0.6101694915254238)
knn (Train, Validation) accuracy when k = 6: (0.7218543046357616, 0.611228813559322)
knn (Train, Validation) accuracy when k = 7: (0.710728476821192, 0.611228813559322)
knn (Train, Validation) accuracy when k = 8: (0.696953642384106, 0.611228813559322)
knn (Train, Validation) accuracy when k = 9: (0.6916556291390729, 0.6228813559322034)
knn (Train, Validation) accuracy when k = 10: (0.6845033112582781, 0.6228813559322034)
knn (Train, Validation) accuracy when k = 11: (0.6770860927152318, 0.6165254237288136)
knn (Train, Validation) accuracy when k = 12: (0.671523178807947, 0.6144067

In [6]:
#Runs knn over the indicated depth of all 4 mfcc features, including the 4 other features, and the values of k for knn.
for depth in range(1,40,5):
    for k in range(1,30,5):
        print(f"knn (Train, Validation) accuracy when mfcc depth = {depth} and k = {k}: {knn_acc(df,k,depth,depth,depth,depth, depth, True,False)}") 

NameError: name 'df' is not defined

### It appears that k=5 gives the best validation accuracy. We now test our final knn model using k=5 and all features.

In [13]:
#First train the model exactly using the same training data as before, and k=5.
knn_pipe = Pipeline([('scale', StandardScaler()), ('knn', KNeighborsClassifier(10))])
X=truncate_mfcc(df=df, avg_depth=40,var_depth=40,hs_avg_depth=40,hs_var_depth=40,other_Features=True)
Y = df['fam_or_subfam']
X_train, X_val, y_train, y_val = train_test_split(X.copy(), Y, shuffle=True, random_state=17, test_size=.2, stratify=Y)
knn_pipe.fit(X_train, y_train)

#Test the model on unseen test data.
X_test=truncate_mfcc(df=df_test, avg_depth=40,var_depth=40,hs_avg_depth=40,hs_var_depth=40,other_Features=True)
y_test=df_test['fam_or_subfam']
pred = knn_pipe.predict(X_test)
accuracy_score(pred,y_test)

0.6101694915254238

### Our knn model with k=5 achieves 62% accuracy identifying 'fam_or_subfam' on unseen test data. Finally, we test our model on identifying critter_name: 'cricket', 'kaydid', or 'cicada'.

In [14]:
#Dictionary from 'fam_or_subfam' to the coarser classification 'critter_name'
fam_dict = {'Gryllinae':'cricket', 'Conocephalinae':'kaydid', 'Oecanthinae':'cricket',
            'Phaneropterinae': 'kaydid', 'Trigonidiinae':'cricket', 'Nemobiinae':'cricket', 'Hapithinae':'cricket', 
            'Mogoplistinae':'cricket', 'Tettigoniinae':'kaydid', 'Pseudophyllinae':'kaydid', 'Cicadidae':'cicada',
            'Gryllotalpidae':'cricket', 'Eneopterinae':'cricket', 'Phalangopsidae':'cricket', 'Listroscelidinae':'cricket'}

#Converts a pd Series with fam_or_subfam entries into critter names
def fam_to_crit(series):
    X=series.copy()
    for i in range(0, X.shape[0]):
        X.at[i]=fam_to_crit_string(X[i])
    return X

def fam_to_crit_string(fam_name):
    return fam_dict[fam_name]

In [15]:
accuracy_score(fam_to_crit(pd.Series(list(y_test))),fam_to_crit(pd.Series(pred)))

0.8567796610169491

### Our knn model with k=5 achieves 87% accuracy identifying (cricket, kaydid, or cicada) on unseen test data.

In [16]:
conf_mat = confusion_matrix(fam_to_crit(pd.Series(list(y_test))), fam_to_crit(pd.Series(pred)))

In [17]:
pd.DataFrame(conf_mat,
                 columns = ['Predicted cicada', 'Predicted cricket', 'Predicted kaydid'],
                 index = ['Actual cicada', 'Actual cricket', 'Actual kaydid'])

Unnamed: 0,Predicted cicada,Predicted cricket,Predicted kaydid
Actual cicada,0,1,10
Actual cricket,3,667,119
Actual kaydid,1,35,344
