In [2]:
# basic
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# audio
import librosa
from IPython.display import Audio
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers

In [8]:
b_df = pd.read_pickle("./bird_voice.pkl")
b_df

Unnamed: 0,common_name,scientific_name,recording_length,xc_id,filename,label,file,class,recording_length_seconds,audio_array
0,Common Ostrich,Struthio camelus australis,0 days 00:00:53,XC516153,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Andean Guan,Andean Guan10.mp3,0,53,"[[[[0.18739352], [0.3946095], [0.5361222], [0...."
1,Common Ostrich,Struthio camelus,0 days 00:00:26,XC208209,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Andean Guan,Andean Guan11.mp3,0,26,"[[[[0.], [0.21585666], [0.30741704], [0.299312..."
2,Common Ostrich,Struthio camelus,0 days 00:00:04,XC208128,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Andean Guan,Andean Guan12.mp3,0,4,"[[[[0.25650588], [0.0904443], [0.14951989], [0..."
3,Common Ostrich,Struthio camelus,0 days 00:00:11,XC46725,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Andean Guan,Andean Guan13.mp3,0,11,"[[[[0.27771807], [0.0697942], [0.1943301], [0...."
4,Common Ostrich,Struthio camelus,0 days 00:01:47,XC675445,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Andean Guan,Andean Guan14.mp3,0,107,"[[[[0.2629077], [0.3041144], [0.41945505], [0...."
...,...,...,...,...,...,...,...,...,...,...
1375,Black-fronted Piping Guan,Pipile jacutinga,0 days 00:00:06,XC228155,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Yellow-legged Tinamou,Yellow-legged Tinamou5.mp3,45,6,"[[[[0.67436016], [0.6806983], [0.68085015], [0..."
1376,Black-fronted Piping Guan,Pipile jacutinga,0 days 00:00:02,XC110142,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Yellow-legged Tinamou,Yellow-legged Tinamou6.mp3,45,2,"[[[[0.67436016], [0.6806983], [0.68085015], [0..."
1377,Black-fronted Piping Guan,Pipile jacutinga,0 days 00:00:04,XC110141,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Yellow-legged Tinamou,Yellow-legged Tinamou7.mp3,45,4,"[[[[0.23947819], [0.39522928], [0.28454277], [..."
1378,Black-fronted Piping Guan,Pipile jacutinga,0 days 00:00:05,XC85349,C:\Users\aqua6\Desktop\DSML\Neural network\Pro...,Yellow-legged Tinamou,Yellow-legged Tinamou8.mp3,45,5,"[[[[0.23947819], [0.39522928], [0.28454277], [..."


In [16]:
# Function to preprocess the spectrograms
def preprocess_audio(audio, sr, target_shape=(128, 128)):
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

    # Add a channel dimension (for grayscale: 1 channel)
    log_spectrogram = np.expand_dims(log_spectrogram, axis=-1)  # Shape (height, width, 1)

    # Resize the spectrogram to the target shape (e.g., 128x128)
    log_spectrogram_resized = tf.image.resize(log_spectrogram, target_shape)

    # Normalize to [0, 1]
    log_spectrogram_resized = (log_spectrogram_resized - np.min(log_spectrogram_resized)) / (np.max(log_spectrogram_resized) - np.min(log_spectrogram_resized))

    return log_spectrogram_resized

In [20]:
# Build the CNN model
def build_cnn_model(input_shape=(128, 128, 1)):
    model = Sequential()
    
    # 1st Convolutional Block
    model.add(Conv2D(50, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D((2, 2)))
    
    # 2nd Convolutional Block
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    
    # 3rd Convolutional Block
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    
    # Flatten and Fully Connected Layers
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(46, activation='softmax'))  # Change the number of classes as needed
    
    # Compile the model
    # Since below using one hot encoding, so change loss function from sparse to CategoricalCrossentropy
    model.compile(optimizer='adam', loss='CategoricalCrossentropy', metrics=['accuracy']) 

    
    return model


In [9]:
features = b_df['audio_array']
labels = b_df['class']

In [10]:
labels_ohe = pd.get_dummies(labels, columns=['class'], dtype=float)
labels_ohe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
print('features shape:',features.shape)
print('labels shape:',labels_ohe.shape)

features shape: (1379,)
labels shape: (1379, 46)


In [13]:
feat_list = np.empty((len(features), 128, 128, 1))  # Assuming 128x128x1 shape for each feature
for i, feature in enumerate(features):
    feat_list[i] = feature

feat_list.shape

(1379, 128, 128, 1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(feat_list, labels_ohe, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [15]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

(827, 128, 128, 1)
(827, 46)
(276, 128, 128, 1)
(276, 46)
(276, 128, 128, 1)
(276, 46)


In [21]:
model = build_cnn_model()
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
model.fit(X_train, y_train, epochs=50, batch_size=32,validation_data=(X_val, y_val))

Epoch 1/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 252ms/step - accuracy: 0.0278 - loss: 3.8373 - val_accuracy: 0.0181 - val_loss: 3.8284
Epoch 2/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 250ms/step - accuracy: 0.0266 - loss: 3.8129 - val_accuracy: 0.0326 - val_loss: 3.8041
Epoch 3/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 241ms/step - accuracy: 0.0550 - loss: 3.7154 - val_accuracy: 0.0326 - val_loss: 3.7616
Epoch 4/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 233ms/step - accuracy: 0.0535 - loss: 3.6599 - val_accuracy: 0.1014 - val_loss: 3.5801
Epoch 5/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 229ms/step - accuracy: 0.1345 - loss: 3.3910 - val_accuracy: 0.1377 - val_loss: 3.3173
Epoch 6/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 232ms/step - accuracy: 0.2015 - loss: 3.1613 - val_accuracy: 0.2790 - val_loss: 2.9367
Epoch 7/50
[1m26/26[0m [3

<keras.src.callbacks.history.History at 0x2146edebb20>

In [23]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
loss_v, accuracy_v = model.evaluate(X_val, y_val, verbose=1)
print("Validation: accuracy = %f  ;  loss_v = %f" % (accuracy_v, loss_v))
print("Test: accuracy = %f  ;  loss = %f" % (accuracy, loss))

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.6760 - loss: 2.1692
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 75ms/step - accuracy: 0.7101 - loss: 2.2068
Validation: accuracy = 0.699275  ;  loss_v = 2.228804
Test: accuracy = 0.677536  ;  loss = 2.216806


In [24]:
y_pred = model.predict(X_test)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step


In [25]:
# Convert probabilities to one-hot encoded labels (example using thresholding)
threshold = 0.5
y_pred = np.where(y_pred > threshold, 1, 0)  # 1 for probabilities above threshold, 0 otherwise

# Now use classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.75      0.67      0.71         9
           2       0.80      0.50      0.62         8
           3       0.29      0.40      0.33         5
           4       0.50      1.00      0.67         2
           5       1.00      0.60      0.75         5
           6       0.56      1.00      0.71         5
           7       1.00      0.25      0.40         8
           8       1.00      0.43      0.60         7
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         5
          11       0.75      0.86      0.80         7
          12       1.00      0.50      0.67         4
          13       1.00      0.75      0.86         4
          14       1.00      0.62      0.77         8
          15       1.00      0.50      0.67         6
          16       0.45      1.00      0.62         5
          17       0.50    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
