# Cogs 118b Final Project 
## Comparing and Contrasting the Performance of Different Machine Learning Models on clustering and classifying animal sounds 

Group Members:
- Anand Mysorekar
- Alex Franz
- Jack Determan
- Austin Blanco

# Abstract

discuss what we tried and what we found high level

# Introduction

discuss clustering and classification as a task and the data we used 

# Methods

discuss the dataset (samples, instances, etc) and the models we used and the metrics we used to evaluate them

# Data Preprocessing, Feature Extraction, PCA, and Clustering



## Imports

In [None]:
import os
import warnings
import subprocess
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from scipy.io import wavfile
from skimage.transform import resize
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, 
    completeness_score, v_measure_score, classification_report, confusion_matrix
)
from minisom import MiniSom
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, TimeDistributed, GlobalAveragePooling2D, 
    Attention, Bidirectional
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import RMSprop, AdamW
from tensorflow.keras.applications import MobileNet



## Extracting Features using Librosa

Some commonly used audio features for classification include:

* **MFCCs (Mel-Frequency Cepstral Coefficients)**: Captures the timbral aspects of the sound, which is crucial for distinguishing sounds with similar pitches but different characteristics
* **Spectral Features**: Describes the shape of the spectrum and provide insights into the distribution of energy across frequencies
    * **Spectral Centroid**: Associated with brightness. Sounds with high spectral centroids (e.g., cymbals) are perceived as brighter, while sounds with low centroids (e.g., bass) are darker.
    * **Spectral Bandwidth**: A high bandwidth means the sound contains a wide range of frequencies (e.g., noise), while a narrow bandwidth suggests a pure tone
    * **Spectral Rolloff**: Indicates the "tail" of the spectrum. Useful for distinguishing tonal sounds (low rolloff) from noise-like sounds (high rolloff)
    * **Zero-Crossing Rate**: The rate of sign changes in the waveform. Higher rates are associated with noisier or percussive sounds, while lower rates occur in harmonic or tonal sounds
* **RMS Energy**: Loudness
*  **Chroma Features**: Distribution of spectral energy across the 12 pitch classes, which captures harmonic content (not extracted in the code below)

In [None]:
def extract_features(file_path):
    """Summarise one audio file as a fixed-length feature vector.

    The clip is loaded at its native sample rate (``sr=None``) and every
    frame-level descriptor is averaged over time.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        1-D array holding the 13 MFCC means followed by the mean spectral
        centroid, spectral bandwidth, spectral rolloff, zero-crossing rate
        and RMS energy, in that order.
    """
    signal, rate = librosa.load(file_path, sr=None)

    # Timbre: one mean per MFCC row (13 coefficients).
    mfcc_means = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13).mean(axis=1)

    # Frame-level descriptors, each collapsed to a single mean value below.
    frame_tracks = [
        librosa.feature.spectral_centroid(y=signal, sr=rate),
        librosa.feature.spectral_bandwidth(y=signal, sr=rate),
        librosa.feature.spectral_rolloff(y=signal, sr=rate),
        librosa.feature.zero_crossing_rate(y=signal),
        librosa.feature.rms(y=signal),
    ]
    track_means = [np.mean(track) for track in frame_tracks]

    # MFCC means first, then the five scalar summaries.
    return np.hstack([mfcc_means, *track_means])

def process_audio_directory(directory_path):
    """Build a feature table for the ``.wav`` files directly inside a folder.

    Sub-folders are not visited. Files whose feature extraction raises are
    reported on stdout and left out of the table.

    Args:
        directory_path: Folder to scan.

    Returns:
        DataFrame with a ``file_name`` column followed by ``mfcc_1``..``mfcc_13``
        and the five spectral/energy summary columns.
    """
    columns = (
        ["file_name"]
        + [f"mfcc_{idx}" for idx in range(1, 14)]
        + ["spectral_centroid", "spectral_bandwidth", "spectral_rolloff", "zero_crossing_rate", "rms"]
    )

    rows = []
    for entry in os.listdir(directory_path):
        if not entry.endswith('.wav'):
            continue
        try:
            rows.append([entry, *extract_features(os.path.join(directory_path, entry))])
        except Exception as e:
            # Best effort: one unreadable clip should not abort the whole scan.
            print(f"Error processing {entry}: {e}")

    return pd.DataFrame(rows, columns=columns)

def process_audio_directory_recursive(directory_path):
    """Build a feature table for every ``.wav`` file under a folder tree.

    The tree is walked with ``os.walk``. Files whose feature extraction
    raises are reported on stdout and left out of the table.

    Args:
        directory_path: Root folder to walk.

    Returns:
        DataFrame with ``file_name`` and ``file_path`` columns followed by
        ``mfcc_1``..``mfcc_13`` and the five spectral/energy summary columns.
    """
    columns = (
        ["file_name", "file_path"]
        + [f"mfcc_{idx}" for idx in range(1, 14)]
        + ["spectral_centroid", "spectral_bandwidth", "spectral_rolloff", "zero_crossing_rate", "rms"]
    )

    rows = []
    for folder, _, entries in os.walk(directory_path):
        for entry in entries:
            if not entry.endswith('.wav'):
                continue
            wav_path = os.path.join(folder, entry)
            try:
                rows.append([entry, wav_path, *extract_features(wav_path)])
            except Exception as e:
                # Best effort: one unreadable clip should not abort the whole walk.
                print(f"Error processing {entry}: {e}")

    return pd.DataFrame(rows, columns=columns)

def plot_and_compute_metrics(df, model, expected_clusters, expected_labels):
    """Draw a label-by-cluster heatmap and print clustering agreement scores.

    Args:
        df: DataFrame with ``animal_label`` (ground truth) and ``cluster``
            (assigned cluster) columns.
        model: Table of integer counts passed to the heatmap (the callers in
            this notebook pass a label-by-cluster contingency matrix).
        expected_clusters: Tick labels for the heatmap's x axis.
        expected_labels: Tick labels for the heatmap's y axis.
    """
    plt.figure(figsize=(10, 6))
    sns.heatmap(
        model,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=expected_clusters,
        yticklabels=expected_labels,
    )
    plt.title('Counts of Animal Labels per Cluster')
    plt.xlabel('Cluster')
    plt.ylabel('Animal Label')
    plt.show()

    # Every score compares the true labels with the assigned cluster ids.
    truth = df['animal_label']
    assigned = df['cluster']

    print(f"Adjusted Rand Index (ARI): {adjusted_rand_score(truth, assigned):.2f}")
    print(f"Normalized Mutual Information (NMI): {normalized_mutual_info_score(truth, assigned):.2f}")
    print(f"Homogeneity Score: {homogeneity_score(truth, assigned):.2f}")
    print(f"Completeness Score: {completeness_score(truth, assigned):.2f}")
    print(f"V-Measure Score: {v_measure_score(truth, assigned):.2f}")

In [None]:
# Dataset root; the paths below follow the <root>/<Animal>/<Animal>_<n>.wav layout.
animal_dir = "Animal Sounds"
bear_dir = f"{animal_dir}/Bear"
bear_file = f"{bear_dir}/Bear_1.wav"
# First clip of four species, used for the spectrogram grid.
animals = [
    f"{animal_dir}/{species}/{species}_1.wav"
    for species in ("Bear", "Cat", "Chicken", "Lion")
]

In [None]:
# Sanity check: print the feature vector of a single clip.
bear_sample = extract_features(bear_file)
print(bear_sample)

# Feature table for every .wav file found under the dataset root.
all_data = process_audio_directory_recursive(animal_dir)
# Bare expression as the last statement so the notebook displays the table.
all_data

### Run PCA on the librosa-extracted features

In [None]:
# Keep only the numeric feature columns; the two identifier columns are not features.
animal_features = all_data.drop(columns=["file_name", "file_path"])

# Bring every feature to zero mean / unit variance before PCA.
scaler = StandardScaler()
animal_scaled = scaler.fit_transform(animal_features)

# Keep as many components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
principal_components_librosa = pca.fit_transform(animal_scaled)

variance_ratios = pca.explained_variance_ratio_
print(f"Number of components selected: {pca.n_components_}")
print("Explained Variance Ratio:", variance_ratios)
print(f"Total variance explained: {sum(variance_ratios):.2f}")

# Component table plus the ground-truth label, taken from the part of the
# file name that precedes the first underscore.
pca_df = pd.DataFrame(data=principal_components_librosa)
pca_df['animal_label'] = pd.Series(all_data['file_name'].values).str.split('_').str[0]
print(pca_df.head())

# Cumulative explained variance with the 95% threshold marked.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(np.cumsum(variance_ratios), marker='o')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Cumulative Explained Variance')
ax.axhline(y=0.95, color='r', linestyle='--', label="95% Explained Variance")
ax.legend(loc='best')
ax.grid()
plt.show()

### Perform KMeans clustering on the principal components

In [None]:
# Partition the PCA-projected samples into 10 K-means clusters.
kmeans = KMeans(n_clusters=10, random_state=42)
pca_df['cluster'] = kmeans.fit_predict(principal_components_librosa)

# Axes of the count table: every label (sorted) and every cluster id.
expected_labels = sorted(pca_df['animal_label'].unique())
expected_clusters = range(kmeans.n_clusters)

# Label-by-cluster counts; reindexing adds an all-zero row/column for any
# label or cluster id that never occurs.
contingency_matrix_k_means = pd.crosstab(
    pca_df['animal_label'], pca_df['cluster']
).reindex(index=expected_labels, columns=expected_clusters, fill_value=0)
print(contingency_matrix_k_means)

plot_and_compute_metrics(pca_df, contingency_matrix_k_means, expected_clusters, expected_labels)

### Perform Spectral Clustering on the principal components

In [None]:
# Run Spectral Clustering on the PCA-projected librosa features
spectral = SpectralClustering(n_clusters=10, affinity='nearest_neighbors', random_state=42)
pca_df['cluster'] = spectral.fit_predict(principal_components_librosa)

# Create a contingency matrix (rows: animal labels, columns: cluster ids)
contingency_matrix_spectral = pd.crosstab(pca_df['animal_label'], pca_df['cluster'])

# Ensure all clusters and labels are represented. The range is derived from
# the model: a hard-coded range(9) would drop the tenth cluster (id 9) from
# the table and the heatmap.
expected_clusters = range(spectral.n_clusters)
expected_labels = sorted(pca_df['animal_label'].unique())

# Reindex the contingency matrix to include all expected clusters and labels
contingency_matrix_spectral = contingency_matrix_spectral.reindex(index=expected_labels, columns=expected_clusters, fill_value=0)
print(contingency_matrix_spectral)

plot_and_compute_metrics(pca_df, contingency_matrix_spectral, expected_clusters, expected_labels)

### Perform GMM clustering on the principal components

In [None]:
# Run Gaussian Mixture Model (GMM) on the PCA-projected librosa features
gmm = GaussianMixture(n_components=10, random_state=42)
pca_df['cluster'] = gmm.fit_predict(principal_components_librosa)

# Create a contingency matrix (rows: animal labels, columns: component ids)
contingency_matrix_gmm = pd.crosstab(pca_df['animal_label'], pca_df['cluster'])

# Ensure all clusters and labels are represented. The range is derived from
# the model: a hard-coded range(9) would drop the tenth component (id 9) from
# the table and the heatmap.
expected_clusters = range(gmm.n_components)
expected_labels = sorted(pca_df['animal_label'].unique())

# Reindex the contingency matrix to include all expected clusters and labels
contingency_matrix_gmm = contingency_matrix_gmm.reindex(index=expected_labels, columns=expected_clusters, fill_value=0)
print(contingency_matrix_gmm)

plot_and_compute_metrics(pca_df, contingency_matrix_gmm, expected_clusters, expected_labels)

### Perform Self-Organizing Maps clustering on the principal components

In [None]:
# Re-standardise the principal components before feeding them to the SOM.
scaler = StandardScaler()
animal_scaled = scaler.fit_transform(principal_components_librosa)

# 10x10 map whose weight vectors match the number of PCA components.
som = MiniSom(
    x=10,
    y=10,
    input_len=animal_scaled.shape[1],
    sigma=1.0,
    learning_rate=0.5,
    random_seed=42,
)
som.train(animal_scaled, 1000, verbose=True)

# U-matrix: distance between neighbouring neurons; larger means more dissimilar.
u_matrix = som.distance_map().T

plt.figure(figsize=(8, 6))
plt.title("U-Matrix of SOM")
sns.heatmap(u_matrix, cmap='coolwarm', cbar=False)
plt.show()

# Same U-matrix again, this time annotated with the sample indices.
plt.figure(figsize=(8, 6))
sns.heatmap(u_matrix, cmap='coolwarm', cbar=False)

# Samples grouped by their winning neuron.
win_map = som.win_map(animal_scaled)

# Winning-neuron coordinates for every sample (one row per sample).
labels_som = np.array([som.winner(sample) for sample in animal_scaled])

for sample_idx, (node_x, node_y) in enumerate(labels_som):
    plt.text(node_x, node_y, str(sample_idx), color='black', fontsize=10)

plt.title('Data Points Mapped to SOM Grid')
plt.show()

# Use the winning neuron's coordinates as the cluster label of each sample.
cluster_labels = np.array([f'Cluster_{node_x}_{node_y}' for node_x, node_y in labels_som])
pca_df['SOM_cluster'] = cluster_labels

# Scatter of the first two standardised components, coloured by SOM cluster.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=animal_scaled[:, 0],
    y=animal_scaled[:, 1],
    hue=pca_df['SOM_cluster'],
    palette='Set1',
    s=100,
)
plt.title("Clustering on SOM Grid")
plt.show()

# Agreement between the true animal labels and the SOM clusters.
som_truth = pca_df['animal_label']
som_assigned = pca_df['SOM_cluster']

ari = adjusted_rand_score(som_truth, som_assigned)
print(f"Adjusted Rand Index (ARI): {ari:.2f}")

nmi = normalized_mutual_info_score(som_truth, som_assigned)
print(f"Normalized Mutual Information (NMI): {nmi:.2f}")

homogeneity = homogeneity_score(som_truth, som_assigned)
print(f"Homogeneity: {homogeneity:.2f}")

completeness = completeness_score(som_truth, som_assigned)
print(f"Completeness: {completeness:.2f}")

v_measure = v_measure_score(som_truth, som_assigned)
print(f"V-Measure: {v_measure:.2f}")

### Clustering Results for Librosa-Extracted Features


put evaluation metrics for each model here in a table just for easy viewing

## Processing the File using a Spectrogram 

talk about the spectrogram and how we used it to extract features

In [None]:
def plot_spectrograms_grid(audio_files, rows, cols, figsize=(15, 10), cmap='viridis'):
    """Plot log-frequency dB spectrograms of several audio files in a grid.

    Args:
        audio_files: Sequence of audio file paths, one per grid cell.
        rows: Number of grid rows.
        cols: Number of grid columns.
        figsize: Figure size passed to ``plt.subplots``.
        cmap: Colormap used for every spectrogram and the shared colorbar.

    Raises:
        ValueError: If the grid has fewer cells than there are audio files.
    """
    if rows * cols < len(audio_files):
        raise ValueError("Grid size (rows * cols) is smaller than the number of audio files.")

    # Dynamic range of each spectrogram: values are relative to the clip's
    # peak (ref=np.max) and floored at -top_db, so they lie in [-top_db, 0].
    top_db = 80.0

    # squeeze=False always returns a 2-D array of Axes, so a 1x1 grid can be
    # flattened like any other.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    mesh = None
    for ax, audio_file in zip(axes, audio_files):
        y, sr = librosa.load(audio_file)
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max, top_db=top_db)

        # A fixed colour range makes the panels comparable and lets a single
        # colorbar describe all of them.
        mesh = librosa.display.specshow(
            D, sr=sr, x_axis='time', y_axis='log', cmap=cmap, ax=ax,
            vmin=-top_db, vmax=0.0,
        )
        ax.set_title(f'Spectrogram: {audio_file}')
        ax.set_xlabel('Time (s)')
        ax.set_ylabel('Frequency (Hz)')

    # Hide the grid cells that have no audio file.
    for ax in axes[len(audio_files):]:
        ax.axis('off')

    # Build the colorbar from a plotted spectrogram so its ticks show real dB
    # values; skip it when nothing was plotted.
    if mesh is not None:
        fig.colorbar(mesh, ax=axes, format='%+2.0f dB', orientation='vertical', fraction=0.02, pad=0.04)
    plt.show()

def process_wav_directory_to_spectrogram_df(root_dir):
    """Build a table of flattened mel-spectrograms for every ``.wav`` under a tree.

    Each clip is loaded at its native sample rate and converted to a
    128-band mel-spectrogram in dB relative to the clip's own peak. All
    spectrograms are padded on the time axis to the longest clip before
    flattening, so a given column refers to the same (mel band, frame) cell
    in every row and the table contains no missing values.

    Args:
        root_dir: Root folder to walk.

    Returns:
        DataFrame with one row per successfully processed file: the flattened
        spectrogram values in integer-named columns plus a ``file_path``
        column. Files that raise during processing are reported on stdout and
        skipped.
    """
    spectrograms = []
    file_paths = []

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith('.wav'):
                continue
            file_path = os.path.join(subdir, file)
            try:
                y, sr = librosa.load(file_path, sr=None)  # Use original sample rate
                S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
                S_dB = librosa.power_to_db(S, ref=np.max)  # Convert to dB scale
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue
            spectrograms.append(S_dB)
            file_paths.append(file_path)

    # Flattening clips of different lengths directly would shift the
    # (band, frame) -> column mapping from row to row. Pad every clip to the
    # longest one first, using the clip's own quietest value as "silence".
    max_frames = max((spec.shape[1] for spec in spectrograms), default=0)
    data = []
    for spec in spectrograms:
        missing = max_frames - spec.shape[1]
        if missing:
            spec = np.pad(
                spec,
                ((0, 0), (0, missing)),
                mode='constant',
                constant_values=spec.min() if spec.size else 0.0,
            )
        data.append(spec.flatten())

    df = pd.DataFrame(data)
    df['file_path'] = file_paths

    return df

def extract_label(file_path):
    """Return the name of the folder that directly contains ``file_path``.

    The dataset stores each clip as ``<root>/<Animal>/<clip>.wav``, so the
    parent folder name is the animal label. Reading the parent folder (rather
    than a fixed path position) keeps this correct for absolute paths and for
    dataset roots nested at any depth.

    Args:
        file_path: Path to an audio file inside a label folder.

    Returns:
        The parent folder's name.

    Raises:
        ValueError: If the path has no parent folder to take a label from.
    """
    parent_dir = os.path.dirname(os.path.normpath(file_path))
    label = os.path.basename(parent_dir)
    if not label:
        raise ValueError(f"Cannot derive a label from path without a parent folder: {file_path}")
    return label


In [None]:
# One spectrogram per sample clip, laid out on a 2x2 grid.
plot_spectrograms_grid(animals, rows=2, cols=2)

In [None]:
# Flattened mel-spectrogram table; any missing cell is replaced with 0 so the
# table is fully populated.
spectrogram_data = process_wav_directory_to_spectrogram_df(animal_dir).fillna(0)
spectrogram_data

# Quick check of the label parser on a known path.
file_path = 'Animal Sounds/Cat/Cat_17.wav'
label = extract_label(file_path)
print(label)

### Run PCA on the Spectrogram-Extracted Features

In [None]:
# Numeric spectrogram columns only (the path column is an identifier)
spectrogram_data_num = spectrogram_data.drop(columns="file_path")

# Standardize the data
scaler = StandardScaler()
spectrogram_scaled = scaler.fit_transform(spectrogram_data_num)

# Apply PCA to the standardized table, keeping enough components for 95% of
# the variance. (The standardized data used to be computed and then ignored
# in favour of the raw table, unlike the librosa-feature PCA.)
pca = PCA(n_components=0.95)
principal_components_spectrogram = pca.fit_transform(spectrogram_scaled)

pca_df = pd.DataFrame(data=principal_components_spectrogram)

print(f"Number of components selected: {pca.n_components_}")
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2f}")

# Attach the ground-truth label derived from each file's path
pca_df['file_path'] = spectrogram_data['file_path'].values
pca_df['animal_label'] = pca_df['file_path'].apply(extract_label)
pca_df = pca_df.drop(columns=["file_path"])
print(pca_df.head())

# Plot the cumulative explained variance
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Cumulative Explained Variance')
plt.axhline(y=0.95, color='r', linestyle='--', label="95% Explained Variance")
plt.legend(loc='best')
plt.grid()
plt.show()

### Perform KMeans clustering on the principal components

In [None]:
# Partition the spectrogram principal components into 10 K-means clusters.
kmeans = KMeans(n_clusters=10, random_state=42)
pca_df['cluster'] = kmeans.fit_predict(principal_components_spectrogram)

# Axes of the count table: every label (sorted) and every cluster id.
expected_labels = sorted(pca_df['animal_label'].unique())
expected_clusters = range(kmeans.n_clusters)

# Label-by-cluster counts, with all-zero rows/columns added where needed.
contingency_matrix_k_means = pd.crosstab(
    pca_df['animal_label'], pca_df['cluster']
).reindex(index=expected_labels, columns=expected_clusters, fill_value=0)
print(contingency_matrix_k_means)

plot_and_compute_metrics(pca_df, contingency_matrix_k_means, expected_clusters, expected_labels)

### Perform Spectral clustering on the principal components

In [None]:
# Spectral clustering on a nearest-neighbours affinity graph, 10 clusters.
spectral_clustering = SpectralClustering(n_clusters=10, affinity='nearest_neighbors', random_state=42, assign_labels='kmeans')
pca_df['cluster'] = spectral_clustering.fit_predict(principal_components_spectrogram)

# Axes of the count table: every label (sorted) and every cluster id.
expected_labels = sorted(pca_df['animal_label'].unique())
expected_clusters = range(spectral_clustering.n_clusters)

# Label-by-cluster counts, with all-zero rows/columns added where needed.
contingency_matrix_spectral = pd.crosstab(
    pca_df['animal_label'], pca_df['cluster']
).reindex(index=expected_labels, columns=expected_clusters, fill_value=0)
print(contingency_matrix_spectral)

plot_and_compute_metrics(pca_df, contingency_matrix_spectral, expected_clusters, expected_labels)

### Perform GMM clustering on the principal components

In [None]:
# Fit a 10-component Gaussian mixture and assign each sample to a component.
gmm = GaussianMixture(n_components=10, random_state=42)
pca_df['cluster'] = gmm.fit_predict(principal_components_spectrogram)

# Axes of the count table: every label (sorted) and every component id.
expected_labels = sorted(pca_df['animal_label'].unique())
expected_clusters = range(gmm.n_components)

# Label-by-cluster counts, with all-zero rows/columns added where needed.
contingency_matrix_gmm = pd.crosstab(
    pca_df['animal_label'], pca_df['cluster']
).reindex(index=expected_labels, columns=expected_clusters, fill_value=0)
print(contingency_matrix_gmm)

plot_and_compute_metrics(pca_df, contingency_matrix_gmm, expected_clusters, expected_labels)

### Perform Self-Organizing Maps clustering on the principal components

In [None]:
# Re-standardise the principal components before feeding them to the SOM.
scaler = StandardScaler()
animal_scaled = scaler.fit_transform(principal_components_spectrogram)

# 10x10 map whose weight vectors match the number of PCA components.
som = MiniSom(
    x=10,
    y=10,
    input_len=animal_scaled.shape[1],
    sigma=1.0,
    learning_rate=0.5,
    random_seed=42,
)
som.train(animal_scaled, 1000, verbose=True)

# U-matrix: distance between neighbouring neurons; larger means more dissimilar.
u_matrix = som.distance_map().T

plt.figure(figsize=(8, 6))
plt.title("U-Matrix of SOM")
sns.heatmap(u_matrix, cmap='coolwarm', cbar=False)
plt.show()

# Same U-matrix again, this time annotated with the sample indices.
plt.figure(figsize=(8, 6))
sns.heatmap(u_matrix, cmap='coolwarm', cbar=False)

# Samples grouped by their winning neuron.
win_map = som.win_map(animal_scaled)

# Winning-neuron coordinates for every sample (one row per sample).
labels_som = np.array([som.winner(sample) for sample in animal_scaled])

for sample_idx, (node_x, node_y) in enumerate(labels_som):
    plt.text(node_x, node_y, str(sample_idx), color='black', fontsize=10)

plt.title('Data Points Mapped to SOM Grid')
plt.show()

# Use the winning neuron's coordinates as the cluster label of each sample.
cluster_labels = np.array([f'Cluster_{node_x}_{node_y}' for node_x, node_y in labels_som])
pca_df['SOM_cluster'] = cluster_labels

# Scatter of the first two standardised components, coloured by SOM cluster.
plt.figure(figsize=(10, 7))
sns.scatterplot(
    x=animal_scaled[:, 0],
    y=animal_scaled[:, 1],
    hue=pca_df['SOM_cluster'],
    palette='Set1',
    s=100,
)
plt.title("Clustering on SOM Grid")
plt.show()

# Agreement between the true animal labels and the SOM clusters.
som_truth = pca_df['animal_label']
som_assigned = pca_df['SOM_cluster']

ari = adjusted_rand_score(som_truth, som_assigned)
print(f"Adjusted Rand Index (ARI): {ari:.2f}")

nmi = normalized_mutual_info_score(som_truth, som_assigned)
print(f"Normalized Mutual Information (NMI): {nmi:.2f}")

homogeneity = homogeneity_score(som_truth, som_assigned)
print(f"Homogeneity: {homogeneity:.2f}")

completeness = completeness_score(som_truth, som_assigned)
print(f"Completeness: {completeness:.2f}")

v_measure = v_measure_score(som_truth, som_assigned)
print(f"V-Measure: {v_measure:.2f}")

### Clustering Results for Spectrogram-Extracted Features

put evaluation metrics for each model here in a table just for easy viewing

# Clustering Results

for each method of feature extraction talk about the results of the clustering algorithms (compare them to each other for the given feature extraction method). also talk about for a given clustering method why it works better or worse for a given feature extraction method. talk about how clustering didn't seem to work too well for either feature extraction method and it didn't seem likely that we'd get good results classifying from the clusters. we pivoted to a cnn feature extractor and various classifiers

# Using a CNN as a Feature Extractor with Various Classifiers
we used mobile net pretrained (talk about what mobile net is and why we thought this would be good)

For the feature extraction model, we're using the MobileNet convolutional neural network (CNN). This model is an efficient CNN pre-trained on the ImageNet dataset. It's optimized for small datasets like the one we are using in this study. We add in a GlobalAveragePooling2D layer as well as a Dense layer for the feature extraction layers.

#### Loading and preprocessing functions

In [None]:
def preprocess_audio_file(file_path, target_shape=(224, 224)):
    """Turn one audio file into a 3-channel mel-spectrogram image in [0, 1].

    The clip is loaded at its native sample rate, converted to a 128-band
    mel-spectrogram in dB relative to its own peak, resized to
    ``target_shape`` and replicated across three channels.

    Args:
        file_path: Path of the audio file to load.
        target_shape: (height, width) of the output image.

    Returns:
        Array of shape ``target_shape + (3,)`` with values in [0, 1].

    Raises:
        ValueError: If the loaded audio is empty.
        Exception: Any error raised while loading or transforming the file is
            printed and re-raised unchanged.
    """
    # dB values are relative to the clip's peak and floored at -top_db, so
    # they lie in [-top_db, 0]; this is made explicit to allow exact rescaling.
    top_db = 80.0

    try:
        y, sr = librosa.load(file_path, sr=None)

        if y is None or len(y) == 0:
            raise ValueError(f"Audio data is empty for file: {file_path}")

        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max, top_db=top_db)

        resized_spec = resize(mel_spec_db, target_shape, mode='constant')

        # Map [-top_db, 0] dB onto [0, 1]. Dividing the dB values by 255, as
        # one would for 8-bit pixels, leaves them in roughly [-0.31, 0].
        scaled_spec = np.clip((resized_spec + top_db) / top_db, 0.0, 1.0)

        rgb_spec = np.stack([scaled_spec] * 3, axis=-1)

        return rgb_spec

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        raise

def process_directory_to_dataframe(parent_directory, target_shape=(224, 224)):
    """Load every ``.wav`` under a folder tree as a spectrogram image.

    Despite the name, the result is a pair of NumPy arrays rather than a
    DataFrame. Each file is labelled with the name of the folder it sits in.
    Files that fail to preprocess are reported on stdout and skipped.

    Args:
        parent_directory: Root folder to walk.
        target_shape: Image size forwarded to ``preprocess_audio_file``.

    Returns:
        Tuple ``(images, labels)`` of arrays with one entry per loaded file.
    """
    images = []
    class_names = []

    for folder, _, entries in os.walk(parent_directory):
        wav_names = [entry for entry in entries if entry.endswith(".wav")]
        for wav_name in wav_names:
            wav_path = os.path.join(folder, wav_name)
            try:
                image = preprocess_audio_file(wav_path, target_shape)
            except Exception as e:
                print(f"Skipping file {wav_path} due to error: {e}")
                continue
            if image is not None:
                images.append(image)
                class_names.append(os.path.basename(folder))

    print(f"Processed {len(images)} files successfully.")
    return np.array(images), np.array(class_names)

# Dataset root. Every other path in this notebook uses "Animal Sounds" (with a
# space); with a different spelling os.walk finds nothing and the loader
# silently returns zero files.
parent_directory = "Animal Sounds"

print("Processing audio files...")
X, y = process_directory_to_dataframe(parent_directory)

# Fail early with a clear message instead of a confusing error further down.
if len(X) == 0:
    raise FileNotFoundError(f"No .wav files could be loaded from '{parent_directory}'.")

print("Encoding labels...")
# String labels -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

print("Splitting data into train and test sets...")
# 80/20 split with a fixed seed so every classifier sees the same partition
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

## logistic regression classifier

### Implement and Train the logistic regression classifier
discuss what this cell does as a process

In [None]:
# Load pre-trained MobileNet model
print("Loading pre-trained MobileNet model...")
base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Create a feature extractor from the pre-trained model
feature_extractor_model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu')
])

# Data augmentation generator.
# NOTE: datagen is configured and fit here but never applied below; features
# are extracted from the un-augmented images.
datagen = ImageDataGenerator(rotation_range=10, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True)
datagen.fit(X_train)

# Extract features for training and testing data
print("Extracting features using the pre-trained MobileNet model...")
X_train_features = feature_extractor_model.predict(X_train)
X_test_features = feature_extractor_model.predict(X_test)

# Train Logistic Regression Classifier (labels go from one-hot back to integer ids)
print("Training the Logistic Regression classifier...")
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    multi_class='multinomial',
    solver='lbfgs'
)
lr_model.fit(X_train_features, np.argmax(y_train, axis=1))

# Evaluate the Model
print("Evaluating the Logistic Regression classifier...")
y_pred = lr_model.predict(X_test_features)
y_test_labels = np.argmax(y_test, axis=1)

# Pin the full class list so the report rows and the confusion-matrix axes
# always line up with label_encoder.classes_, even if a class happens to be
# absent from the test split.
class_ids = np.arange(len(label_encoder.classes_))

# Classification Report
print("Classification Report:")
print(classification_report(y_test_labels, y_pred, labels=class_ids, target_names=label_encoder.classes_))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test_labels, y_pred, labels=class_ids)

# Plot Confusion Matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

### Logistic Regression Analysis

talk about how it did and why we think it did well or poorly given our dataset and the strengths and weaknesses of logistic regression

## RNN classifier

### Feature Extraction

talk about what this cell does as a process

In [None]:
# Load pre-trained MobileNet model
print("Loading pre-trained MobileNet model...")
base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Create a feature extractor from the pre-trained model
# NOTE(review): the Dense(256) layer is created fresh in this cell and
# feature_extractor_model is never fit, so it presumably acts as an untrained
# projection of the pooled MobileNet features - confirm this is intended.
feature_extractor_model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu')  # Feature embedding layer
])

# Data augmentation and training
# NOTE: datagen is configured and fit here but not applied anywhere in this cell.
datagen = ImageDataGenerator(rotation_range=10, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True)
datagen.fit(X_train)

# Extract features for training and testing data
print("Extracting features using the pre-trained MobileNet model...")
X_train_features = feature_extractor_model.predict(X_train)
X_test_features = feature_extractor_model.predict(X_test)

# Reshape features for RNN input
# With timesteps = 1 each sample becomes a length-1 sequence of shape (1, n_features).
timesteps = 1  
X_train_rnn = X_train_features.reshape(X_train_features.shape[0], timesteps, -1)
X_test_rnn = X_test_features.reshape(X_test_features.shape[0], timesteps, -1)

# Build the RNN model
print("Building the CNN + RNN hybrid model...")

# Per-step dense layer -> LSTM -> dropout -> softmax over the one-hot classes
rnn_model = Sequential([
    TimeDistributed(Dense(128, activation='relu'), input_shape=(timesteps, X_train_rnn.shape[2])),
    LSTM(128, return_sequences=False),  # LSTM for temporal modeling
    Dropout(0.5),
    Dense(y_train.shape[1], activation='softmax')  # Classification layer
])

# Compile the model
optimizer = AdamW(learning_rate=0.001, weight_decay=1e-4)
rnn_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
rnn_model.summary()

### Train the RNN classifier

In [None]:
# Train the model
print("Training the CNN + RNN hybrid model...")

# Early Stopping to reduce overfitting by stopping training when validation loss does not change for 5 epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Learning Rate Scheduler to reduce LR when validation loss plateaus
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',  # Monitor validation loss
    factor=0.5,          # Reduce LR by a factor of 0.5
    patience=3,          # Wait 3 epochs before reducing LR
    verbose=1,
    min_lr=1e-6          # Minimum learning rate
)

# Hold out part of the TRAINING data for validation. Using the test set as
# validation data lets early stopping and the LR schedule tune on it, which
# makes the test accuracy reported below optimistically biased.
history = rnn_model.fit(
    X_train_rnn, y_train,
    validation_split=0.2,
    epochs=50,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Evaluate the model on the untouched test set
test_loss, test_accuracy = rnn_model.evaluate(X_test_rnn, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

y_pred_probs = rnn_model.predict(X_test_rnn)  # Predicted probabilities
y_pred = np.argmax(y_pred_probs, axis=1)  # Predicted classes
y_true = np.argmax(y_test, axis=1)  # True classes

# Pin the full class list so the report rows and the confusion-matrix axes
# always line up with label_encoder.classes_, even if a class happens to be
# absent from the test split.
class_ids = np.arange(len(label_encoder.classes_))

# Classification Report
print("Classification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=label_encoder.classes_))

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)

# Plot Confusion Matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

### RNN Analysis

talk about how it did and why we think it did well or poorly given our dataset and the strengths and weaknesses of rnns

## SVM classifier

### Feature Extraction and Training

In [None]:
# Load pre-trained MobileNet model
print("Loading pre-trained MobileNet model...")
base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Create a feature extractor from the pre-trained model
feature_extractor_model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu')  # Feature embedding layer
])

# Data augmentation generator.
# NOTE: datagen is configured and fit here but never applied below; features
# are extracted from the un-augmented images.
datagen = ImageDataGenerator(rotation_range=10, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True)
datagen.fit(X_train)

# Extract features for training and testing data
print("Extracting features using the pre-trained MobileNet model...")
X_train_features = feature_extractor_model.predict(X_train)
X_test_features = feature_extractor_model.predict(X_test)

# Train a linear SVM directly on the extracted features (no PCA in this cell);
# labels go from one-hot back to integer ids
print("Training the SVM classifier...")
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
svm_classifier.fit(X_train_features, np.argmax(y_train, axis=1))

# Evaluate SVM
print("Evaluating the SVM classifier...")
y_pred = svm_classifier.predict(X_test_features)
y_test_labels = np.argmax(y_test, axis=1)

# Pin the full class list so the report rows and the confusion-matrix axes
# always line up with label_encoder.classes_, even if a class happens to be
# absent from the test split.
class_ids = np.arange(len(label_encoder.classes_))

# Classification Report
print("Classification Report:")
print(classification_report(y_test_labels, y_pred, labels=class_ids, target_names=label_encoder.classes_))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test_labels, y_pred, labels=class_ids)

# Plot Confusion Matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

### SVM Analysis

talk about how it did and why we think it did well or poorly given our dataset and the strengths and weaknesses of SVMs

## XGBoost classifier

### Feature Extraction and Training

In [None]:
# Load pre-trained MobileNet model
print("Loading pre-trained MobileNet model...")
base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = False

# Create a feature extractor from the pre-trained model
feature_extractor_model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu')  # Feature embedding layer
])

# Data augmentation generator.
# NOTE: datagen is configured and fit here but never applied below; features
# are extracted from the un-augmented images.
datagen = ImageDataGenerator(rotation_range=10, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True)
datagen.fit(X_train)

# Extract features for training and testing data
print("Extracting features using the pre-trained MobileNet model...")
X_train_features = feature_extractor_model.predict(X_train)
X_test_features = feature_extractor_model.predict(X_test)

# Apply PCA: fit on the training features only, then project the test
# features with the same components
print("Applying PCA to extracted features...")
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_features)
X_test_pca = pca.transform(X_test_features)

# Check the number of components selected
print(f"Number of components selected by PCA: {pca.n_components_}")

# Train XGBoost Classifier
print("Training the XGBoost classifier...")
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Multiclass classification
    num_class=y_train.shape[1],  # Number of classes
    use_label_encoder=False,     # Suppress warnings
    eval_metric='mlogloss',      # Log loss for multiclass
    random_state=42
)

# Train on the PCA-reduced features. The projection above used to be
# computed and then ignored in favour of the raw features.
xgb_model.fit(X_train_pca, np.argmax(y_train, axis=1))

# Evaluate the Model on the identically projected test features
print("Evaluating the XGBoost classifier...")
y_pred = xgb_model.predict(X_test_pca)
y_test_labels = np.argmax(y_test, axis=1)

# Pin the full class list so the report rows and the confusion-matrix axes
# always line up with label_encoder.classes_, even if a class happens to be
# absent from the test split.
class_ids = np.arange(len(label_encoder.classes_))

# Classification Report
print("Classification Report:")
print(classification_report(y_test_labels, y_pred, labels=class_ids, target_names=label_encoder.classes_))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test_labels, y_pred, labels=class_ids)

# Plot Confusion Matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

### XGBoost Analysis

talk about how it did and why we think it did well or poorly given our dataset and the strengths and weaknesses of XGBoost

## CNN as feature extractor and Various Classifiers results

put it all together in a table for easy viewing, probably with accuracy as the metric

# Discussion

discuss overall what we saw with all the different models and why some did better than others. also talk about the three methods of feature extraction and why we think the cnn feature extractor did the best. talk about how we could improve the models and what we would do differently if we had more time

# Limitations

compute power (with more compute we could try bigger and more complex models), time (with more time we could try additional approaches), dataset size (more data would likely improve results)

# Conclusion

wrap everything up high level, basically restating the abstract in different words. talk about future directions with this

# References

just cite some links to a paper about each model (like how it works or whatever) so it can look good cuz we'd have like 8 sources 