In [3]:
# basic
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# audio
import librosa, librosa.display
from IPython.display import Audio
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers

# XGBoost
from xgboost import XGBClassifier
# Stacking Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor


In [2]:
b_df = pd.read_pickle("./bird_voice.pkl")
b_df.head()

Unnamed: 0,common_name,scientific_name,xc_id,label,class,recording_length_seconds,audio_array
0,Common Ostrich,Struthio camelus australis,XC516153,Andean Guan,0,53,"[[[[0.18739352], [0.3946095], [0.5361222], [0...."
1,Common Ostrich,Struthio camelus,XC208209,Andean Guan,0,26,"[[[[0.], [0.21585666], [0.30741704], [0.299312..."
2,Common Ostrich,Struthio camelus,XC208128,Andean Guan,0,4,"[[[[0.25650588], [0.0904443], [0.14951989], [0..."
3,Common Ostrich,Struthio camelus,XC46725,Andean Guan,0,11,"[[[[0.27771807], [0.0697942], [0.1943301], [0...."
4,Common Ostrich,Struthio camelus,XC675445,Andean Guan,0,107,"[[[[0.2629077], [0.3041144], [0.41945505], [0...."


In [5]:
# Function to preprocess the spectrograms
def preprocess_audio(audio, sr, target_shape=(128, 128)):
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

    # Add a channel dimension (for grayscale: 1 channel)
    log_spectrogram = np.expand_dims(log_spectrogram, axis=-1)  # Shape (height, width, 1)

    # Resize the spectrogram to the target shape (e.g., 128x128)
    log_spectrogram_resized = tf.image.resize(log_spectrogram, target_shape)

    # Normalize to [0, 1]
    log_spectrogram_resized = (log_spectrogram_resized - np.min(log_spectrogram_resized)) / (np.max(log_spectrogram_resized) - np.min(log_spectrogram_resized))

    return log_spectrogram_resized

In [34]:
# Build the CNN model
def build_cnn_model(input_shape=(128, 128, 1)):
    model = Sequential()
    
    # 1st Convolutional Block
    model.add(Conv2D(50, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D((2, 2)))
    
    # 2nd Convolutional Block
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    
    # 3rd Convolutional Block
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    
    # Flatten and Fully Connected Layers
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(46, activation='softmax'))  # Change the number of classes as needed
    
    # Compile the model
    # Since below using one hot encoding, so change loss function from sparse to CategoricalCrossentropy
    model.compile(optimizer='adam', loss='CategoricalCrossentropy', metrics=['accuracy']) 

    
    return model


In [7]:
features = b_df['audio_array']
labels = b_df['class']

In [8]:
labels_ohe = pd.get_dummies(labels, columns=['class'], dtype=float)
labels_ohe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
print('features shape:',features.shape)
print('labels shape:',labels_ohe.shape)

features shape: (1379,)
labels shape: (1379, 46)


In [10]:
feat_list = np.empty((len(features), 128, 128, 1))  # Assuming 128x128x1 shape for each feature
for i, feature in enumerate(features):
    feat_list[i] = feature

feat_list.shape

(1379, 128, 128, 1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(feat_list, labels_ohe, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [12]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

(827, 128, 128, 1)
(827, 46)
(276, 128, 128, 1)
(276, 46)
(276, 128, 128, 1)
(276, 46)


In [13]:
model = build_cnn_model()
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
model.fit(X_train, y_train, epochs=50, batch_size=32,validation_data=(X_val, y_val))

Epoch 1/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 219ms/step - accuracy: 0.0177 - loss: 3.8554 - val_accuracy: 0.0290 - val_loss: 3.8304
Epoch 2/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 213ms/step - accuracy: 0.0308 - loss: 3.8246 - val_accuracy: 0.0181 - val_loss: 3.8263
Epoch 3/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 214ms/step - accuracy: 0.0329 - loss: 3.8126 - val_accuracy: 0.0254 - val_loss: 3.8222
Epoch 4/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 204ms/step - accuracy: 0.0545 - loss: 3.7361 - val_accuracy: 0.0362 - val_loss: 3.7754
Epoch 5/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 203ms/step - accuracy: 0.0745 - loss: 3.6259 - val_accuracy: 0.0688 - val_loss: 3.6669
Epoch 6/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 206ms/step - accuracy: 0.1176 - loss: 3.5146 - val_accuracy: 0.1087 - val_loss: 3.4384
Epoch 7/50
[1m26/26[0m [3

<keras.src.callbacks.history.History at 0x1f12003d870>

In [15]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
loss_v, accuracy_v = model.evaluate(X_val, y_val, verbose=1)
print("Validation: accuracy = %f  ;  loss_v = %f" % (accuracy_v, loss_v))
print("Test: accuracy = %f  ;  loss = %f" % (accuracy, loss))

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.6694 - loss: 2.1795
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.6948 - loss: 1.9663
Validation: accuracy = 0.677536  ;  loss_v = 1.964250
Test: accuracy = 0.659420  ;  loss = 2.265621


In [16]:
y_pred = model.predict(X_test)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step


In [17]:
# Convert probabilities to one-hot encoded labels (example using thresholding)
threshold = 0.5
y_pred = np.where(y_pred > threshold, 1, 0)  # 1 for probabilities above threshold, 0 otherwise

# Now use classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       1.00      0.44      0.62         9
           2       0.33      0.12      0.18         8
           3       0.40      0.80      0.53         5
           4       1.00      1.00      1.00         2
           5       1.00      0.60      0.75         5
           6       0.67      0.80      0.73         5
           7       0.67      0.25      0.36         8
           8       1.00      0.29      0.44         7
           9       1.00      1.00      1.00         5
          10       0.83      1.00      0.91         5
          11       1.00      0.86      0.92         7
          12       0.75      0.75      0.75         4
          13       0.75      0.75      0.75         4
          14       1.00      0.62      0.77         8
          15       1.00      0.50      0.67         6
          16       1.00      1.00      1.00         5
          17       0.50    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# XGBoost

In [18]:
# !pip install xgboost

In [19]:
# Since xgboostModel asks for 2 dimensional data, so I reshape input data to 2D
# 
XGB_X_train = X_train.reshape(827, 16384)
XGB_X_val = X_val.reshape(276, 16384)

In [None]:
xgboostModel = XGBClassifier(n_estimators=100, learning_rate= 0.3)
xgboostModel.fit(XGB_X_train, y_train)
predicted = xgboostModel.predict(XGB_X_train)

In [21]:
print('Traing Score: ',xgboostModel.score(XGB_X_train,y_train))
print('Testing Score: ',xgboostModel.score(XGB_X_val,y_val))

Traing Score:  0.9879081015719468
Testing Score:  0.5253623188405797


# RNN