### 1. Import packages and training data 

In [1]:
import os
import pandas as pd 
import numpy as np
import openpyxl
import matplotlib.pyplot as plt
import sklearn
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from imblearn.over_sampling import RandomOverSampler
import fsspec

In [13]:
#run this only if you're getting a compatability error with the classifier (joblib); restart kernel afterwards
#!pip install --upgrade joblib

In [14]:
#import training data with syllable characteristics
df = pd.read_excel("/users/yourusername/folder/file_name.xlsx")
df.head()
#you can remove columns unnecessary for training in excel file prior to import or remove using df.drop in next cell
#we used call legnth, low freq, delta freq, peak freq, and mean power for classification, but you can use customize
#which features you want to include

Unnamed: 0,Label,Call Length (s),Low Freq (kHz),Delta Freq (kHz),Frequency Standard Deviation (kHz),Mean Power (dB/Hz),Peak Freq (kHz)
0,Complex 2,0.064128,77.470616,23.338153,7.532348,-65.251415,83.36766
1,Complex 2,0.0475,68.754416,36.293299,8.779526,-79.095868,84.803584
2,Complex 2,0.070808,78.541667,22.728202,5.9275,-56.629946,83.676158
3,Complex 2,0.072896,60.697877,36.788754,9.39641,-69.619878,61.616995
4,Complex 2,0.07722,68.291912,21.64724,5.729276,-60.584904,81.965248


### 2. Oversample minority classes
##### aka weighing data set to account for unbalanced classes. If you are struggling with low accuracy, precision and/or recall, this is a MUST. 

In [16]:
X = df.drop('Label', axis=1)  #drop the 'Label column' so we are left with only the feature columns for training
y = df['Label']  #use the dropped 'Label' column as the training label (aka 'target variable')
#data is structured as (X, y) = (features, label)

In [2]:
#import an oversampler to boost minority classes (ie. call types that aren't as frequently emitted
oversampler = RandomOverSampler()

#oversample our data (X, y) and make new resampled variables 
X_resampled, y_resampled = oversampler.fit_resample(X, y)

NameError: name 'X' is not defined

### 3. Split data into training and testing sets

In [18]:
#if you resampled your data, be sure to use the resampled data as your training set "(_resampled)"
#this uses 70% of the data as the training dataset, and 30% for testing 
#you can try different test sizes and random states and select the combo with the best performance 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

### 3. Fit and train model

In [19]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train) #use training data to learn syllable features
y_pred = rf.predict(X_test) #test the model's perfomance on unseen feature data  

#model performance
accuracy = accuracy_score(y_test, y_pred, normalize = True)
precision = precision_score(y_test, y_pred, average=None, zero_division=1)
recall = recall_score(y_test, y_pred, average=None, zero_division=1)
scores_rf = pd.DataFrame({'Accuracy': accuracy, 'Precision': precision, 'Recall': recall},index=y_test.unique())

#print performance scores (precision and recall) for each syllable
#accuracy is overall accuracy of the model 
print(scores_rf)

                 Accuracy  Precision    Recall
Complex          0.977002   0.951456  0.915888
Chevron          0.977002   0.985939  1.000000
Complex 3        0.977002   0.933759  0.998294
Downward         0.977002   0.995370  0.907173
Short            0.977002   0.990008  1.000000
Flat             0.977002   1.000000  1.000000
Complex 2        0.977002   0.985320  1.000000
Complex 5        0.977002   0.995161  1.000000
Upward           0.977002   0.969497  0.998302
Reverse Chevron  0.977002   0.997575  1.000000
Complex 4        0.977002   0.944351  0.927152


In [22]:
#save model
joblib.dump(rf, "yourclassifier.joblib")

['NA_classifier.joblib']

### Optional: Visualize model's performance 

In [None]:
#count the occurrences of each label in y_test and y_pred
label_counts_test = pd.Series(y_test).value_counts().reset_index()
label_counts_test.columns = ['True Labels', 'True Count']

label_counts_pred = pd.Series(y_pred).value_counts().reset_index()
label_counts_pred.columns = ['Predicted', 'Predicted Count']

#create a dataframe with predicted labels, true labels, and label counts
df = pd.DataFrame({'Predicted': y_pred, 'True Labels': y_test})

#merge the label counts for true and predicted labels with the dataframe
df = pd.merge(df, label_counts_test, on='True Labels', how='left')
df = pd.merge(df, label_counts_pred, on='Predicted', how='left')

print(df)
#save the dataframe to a CSV file
#df.to_csv('labels_comparison_counts.csv', index=False)

### Optional: Feature importance
##### I used this to determine which features  are the most important for classification to improve the model's performance. Some features aren't important (i.e. non-spectrotemporal properties), and we don't want to confuse the model 

In [None]:
importances = rf.feature_importances_
feature_names = X.columns.tolist()
#Sort the feature importances in descending order
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X.shape[1]):
    print(f"{f + 1}. Feature {feature_names[indices[f]]}: {importances[indices[f]]}")

In [None]:
#if you want to visualize it 
plt.figure()
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), [feature_names[i] for i in indices], rotation='vertical')
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()