In [None]:
#imports
import os
import random
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import mixed_precision

In [None]:
# 2. Configuration
# --- Paths -----------------------------------------------------------------
DATA_DIR = '/content/drive/MyDrive/birdclef-2025/train_audio'
CSV_PATH = '/content/train.xlsx'
SAVE_DIR = '/content/drive/MyDrive/birdclef-2025/spectrograms'  # cached spectrograms (.npy) are written here
MODEL_SAVE_PATH = 'best_model.keras'

# --- Audio / feature parameters ---------------------------------------------
SAMPLE_RATE = 32000
DURATION = 5
N_MELS = 128

# --- Reproducibility ----------------------------------------------------------
SEED = 42

# Global Keras dtype policy; must be set before any model is built.
mixed_precision.set_global_policy('mixed_float16')

# Make sure the spectrogram cache directory exists before anything writes to it.
os.makedirs(SAVE_DIR, exist_ok=True)

# Seed every random number source used in this notebook with the same value.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

In [9]:
# 3. Load Metadata and Filter Species
TOP_N_SPECIES = 10  # number of most frequent species kept for this experiment

df = pd.read_excel(CSV_PATH)
species_counts = df['primary_label'].value_counts()
eligible_species = species_counts.head(TOP_N_SPECIES).index  # Limit to the TOP_N_SPECIES most common species
# .copy() makes the filtered frame independent of the full metadata table, so
# adding columns to `df` later does not trigger pandas' SettingWithCopyWarning.
df = df[df['primary_label'].isin(eligible_species)].copy()

In [10]:
# 4. Sample a Subset
species_list = df['primary_label'].unique().tolist()
species_to_idx = {species: idx for idx, species in enumerate(species_list)}

# First common_name seen for each species code, looked up once instead of
# re-filtering the whole frame for every species.
first_common_name = (
    df.drop_duplicates('primary_label')
      .set_index('primary_label')['common_name']
      .to_dict()
)
idx_to_species = {idx: first_common_name[species] for species, idx in species_to_idx.items()}

# assign() returns a new frame, so this is safe even when `df` is a filtered
# slice of the original metadata (no chained-assignment warning).
df = df.assign(label_idx=df['primary_label'].map(species_to_idx))

# Bootstrapping: draw the same number of rows (with replacement) per species.
# NOTE(review): sampling with replacement duplicates recordings; any
# train/validation split made after this step can place copies of the same
# recording on both sides. Confirm this is acceptable for the evaluation.
n_samples_per_species = 500
bootstrapped_samples = [
    df[df['primary_label'] == label].sample(
        n=n_samples_per_species, replace=True, random_state=SEED
    )
    for label in species_list
]
bootstrapped_files = pd.concat(bootstrapped_samples).reset_index(drop=True)

In [11]:
#6. Audio Preprocessing
def load_audio(path, duration=DURATION, sr=SAMPLE_RATE):
    """Load an audio file and return exactly ``duration`` seconds of samples.

    Args:
        path: Path of the audio file, passed to ``librosa.load``.
        duration: Clip length in seconds (may be fractional).
        sr: Sample rate requested from ``librosa.load``.

    Returns:
        1-D array of ``int(sr * duration)`` samples: the start of the
        recording, zero-padded at the end when the recording is shorter.
    """
    # Integer sample count so padding/slicing also work for fractional durations.
    target_len = int(sr * duration)

    # Ask librosa for the first `duration` seconds only, instead of decoding
    # the whole recording and discarding everything after the clip.
    y, _ = librosa.load(path, sr=sr, duration=duration)

    if len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]
    return y

def audio_to_melspectrogram(y, sr=SAMPLE_RATE, n_mels=N_MELS):
    """Convert a waveform into a mel spectrogram expressed in decibels.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.
        n_mels: Number of mel bands.

    Returns:
        The output of ``librosa.power_to_db`` applied to the mel power
        spectrogram, using the spectrogram's own maximum as the reference.
    """
    power_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(power_spectrogram, ref=np.max)

def add_noise(y, noise_factor=0.005):
    """Add scaled standard-normal noise to a signal and clip to [-1, 1].

    Args:
        y: 1-D sequence of audio samples.
        noise_factor: Multiplier applied to the noise before it is added.

    Returns:
        Array with the same length as ``y``; values are limited to [-1.0, 1.0].
    """
    # One noise value per sample, drawn from NumPy's global random state.
    scaled_noise = np.random.randn(len(y)) * noise_factor
    return np.clip(scaled_noise + y, -1.0, 1.0)

def time_shift(y, shift_max=0.2):
    """Circularly shift a signal by a random amount in a random direction.

    Args:
        y: 1-D array of audio samples.
        shift_max: Upper bound of the shift, as a fraction of ``len(y)``.

    Returns:
        A rolled copy of ``y`` (samples pushed past one end wrap around to
        the other). When the bound rounds down to zero samples, an unshifted
        copy is returned.
    """
    # Derive the bound from the signal itself so clips of any length work,
    # and make it an explicit int for np.random.randint.
    max_shift = int(len(y) * shift_max)
    if max_shift <= 0:
        # randint(0) would raise; nothing to shift by.
        return np.array(y, copy=True)

    shift = np.random.randint(max_shift)
    direction = np.random.choice([-1, 1])
    return np.roll(y, direction * shift)

def spec_augment(mel, time_mask_width=10, freq_mask_width=8):
    """Mask one random time band and one random frequency band of a spectrogram.

    The masked cells are filled with the spectrogram's minimum value. The
    spectrograms in this notebook are in dB relative to their own maximum
    (see ``audio_to_melspectrogram``), so 0 is the loudest value; filling
    with 0 would insert maximum-energy stripes instead of hiding content.

    Args:
        mel: 2-D array shaped (mel channels, time steps). Not modified.
        time_mask_width: Exclusive upper bound of the time-mask width.
        freq_mask_width: Exclusive upper bound of the frequency-mask width.

    Returns:
        A masked copy of ``mel``.
    """
    mel = mel.copy()
    num_mel_channels, num_time_steps = mel.shape
    fill_value = mel.min()

    # Time mask (bound clamped so small spectrograms / zero widths do not raise)
    max_t = min(time_mask_width, num_time_steps)
    if max_t > 0:
        t = np.random.randint(0, max_t)
        t0 = np.random.randint(0, num_time_steps - t)
        mel[:, t0:t0 + t] = fill_value

    # Frequency mask
    max_f = min(freq_mask_width, num_mel_channels)
    if max_f > 0:
        f = np.random.randint(0, max_f)
        f0 = np.random.randint(0, num_mel_channels - f)
        mel[f0:f0 + f, :] = fill_value

    return mel

In [12]:
# 7. Dataset Preparation
X = []
y = []
missing_files = []  # rows with neither a cached spectrogram nor a source audio file

print("\nLoading audio and generating/loading spectrograms with augmentation...")
for _, row in tqdm(bootstrapped_files.iterrows(), total=len(bootstrapped_files)):
    filename_base = row['filename'].replace('/', '_').replace('.ogg', '')
    mel_path = os.path.join(SAVE_DIR, filename_base + '.npy')

    if os.path.exists(mel_path):
        mel = np.load(mel_path)
    else:
        file_path = os.path.join(DATA_DIR, row['filename'])
        if not os.path.exists(file_path):
            # Without this guard `mel` would still hold the previous row's
            # spectrogram (or be undefined on the first row) and would be
            # appended under this row's label.
            missing_files.append(row['filename'])
            continue

        y_audio = load_audio(file_path)
        if random.random() < 0.5:
            y_audio = add_noise(y_audio)
        if random.random() < 0.5:
            y_audio = time_shift(y_audio)
        mel = audio_to_melspectrogram(y_audio)
        # NOTE(review): the cache is keyed by filename and stores the
        # spectrogram *after* waveform augmentation, so the noise/shift drawn
        # here is frozen for every later load of this file. Confirm intended.
        np.save(mel_path, mel)

    if random.random() < 0.5:
        mel = spec_augment(mel)

    X.append(mel)
    y.append(row['label_idx'])

if missing_files:
    print(f"Skipped {len(missing_files)} rows with no cached spectrogram and no audio file.")

X = np.array(X)
y = np.array(y)

# Add a trailing channel axis for the Conv2D input.
X = X[..., np.newaxis]



Loading audio and generating/loading spectrograms with augmentation...


100%|██████████| 5000/5000 [17:46<00:00,  4.69it/s]


In [13]:
# 9. CNN Model
def create_cnn(input_shape, num_classes):
    """Build the small convolutional classifier used in this notebook.

    Args:
        input_shape: Shape of one input sample, without the batch axis.
        num_classes: Number of output classes.

    Returns:
        An uncompiled ``Sequential`` model ending in a softmax over
        ``num_classes``.
    """
    model = models.Sequential([
        # Explicit Input layer instead of passing input_shape to Conv2D,
        # which Keras warns about for Sequential models.
        layers.Input(shape=input_shape),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.GlobalAveragePooling2D(),
        layers.Dense(64, activation='relu'),
        # The notebook sets the global policy to mixed_float16; keep the
        # softmax output in float32 so the probabilities and the loss are
        # not computed in half precision.
        layers.Dense(num_classes, activation='softmax', dtype='float32'),
    ])
    return model

# Build from X (produced by the dataset-preparation cell). X_train is only
# created inside the cross-validation loop further down, so referencing it
# here fails on a fresh kernel. X and X_train share the same per-sample shape.
model = create_cnn(X.shape[1:], len(species_list))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# 8. Stratified K-Fold Cross Validation
from sklearn.model_selection import StratifiedKFold

NUM_FOLDS = 5
kf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
fold_accuracies = []

# Fixed class order for the per-fold report, independent of which classes a
# model happens to predict.
class_indices = list(range(len(species_list)))
class_names = [idx_to_species[idx] for idx in class_indices]

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold + 1} ---")

    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(32).prefetch(tf.data.AUTOTUNE)

    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
    val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

    # A fresh model per fold so no weights carry over between folds.
    model = create_cnn(X_train.shape[1:], len(species_list))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True),
        ModelCheckpoint(f'model_fold{fold+1}.keras', monitor='val_loss', save_best_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
    ]

    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=50,
        callbacks=callbacks,
        verbose=1
    )

    # Evaluation per fold. This must stay inside the loop: previously it was
    # dedented, so only the last fold was scored and fold_accuracies held a
    # single value.
    y_pred = model.predict(val_dataset)
    y_pred_classes = np.argmax(y_pred, axis=1)

    print("\nFold Classification Report:\n")
    print(classification_report(
        y_val,
        y_pred_classes,
        labels=class_indices,
        target_names=class_names,
        zero_division=0,  # classes never predicted get 0 instead of a warning
    ))

    fold_acc = np.mean(y_pred_classes == y_val)
    fold_accuracies.append(fold_acc)
    print(f"Fold {fold+1} Accuracy: {fold_acc:.4f}")

# 9. Final Results
print("\nCross-Validation Results:")
print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}")
print(f"Accuracy per fold: {fold_accuracies}")


--- Fold 1 ---
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.1225 - loss: 4.8443 - val_accuracy: 0.0980 - val_loss: 2.7622 - learning_rate: 0.0010
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.1226 - loss: 2.3543 - val_accuracy: 0.1050 - val_loss: 2.3083 - learning_rate: 0.0010
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.0309 - loss: 2.3589 - val_accuracy: 0.1000 - val_loss: 2.3039 - learning_rate: 0.0010
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.0279 - loss: 2.3330 - val_accuracy: 0.1010 - val_loss: 2.3058 - learning_rate: 0.0010
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.1342 - loss: 2.1782 - val_accuracy: 0.1000 - val_loss: 2.6761 - learning_rate: 0.0010
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
def generate_grad_cam(model, img_array, pred_index=None):
    """Compute a Grad-CAM heatmap from the model's last Conv2D layer.

    Args:
        model: Built Keras model containing at least one ``Conv2D`` layer.
        img_array: Batch holding a single input sample.
        pred_index: Class index to explain; defaults to the class with the
            highest predicted score for the sample.

    Returns:
        2-D float32 NumPy array with the spatial size of the last Conv2D
        output; negative values are clipped to 0 and the map is scaled so its
        maximum is 1 (an all-zero map stays all zero).

    Raises:
        ValueError: If the model has no ``Conv2D`` layer.
    """
    # Locate the last convolutional layer by type. A fixed index such as -3
    # points at GlobalAveragePooling2D in create_cnn, whose output has no
    # spatial axes left to build a heatmap from.
    last_conv = next(
        (layer for layer in reversed(model.layers)
         if isinstance(layer, tf.keras.layers.Conv2D)),
        None,
    )
    if last_conv is None:
        raise ValueError("Grad-CAM requires a model with at least one Conv2D layer.")

    grad_model = tf.keras.models.Model(model.inputs, [last_conv.output, model.output])

    with tf.GradientTape() as tape:
        conv_outputs, predictions = grad_model(img_array)
        if pred_index is None:
            pred_index = tf.argmax(predictions[0])
        class_channel = predictions[:, pred_index]

    # Gradient of the class score w.r.t. the conv feature map, averaged over
    # batch and spatial axes to get one weight per channel.
    grads = tape.gradient(class_channel, conv_outputs)
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))

    conv_outputs = conv_outputs[0]
    heatmap = conv_outputs @ pooled_grads[..., tf.newaxis]
    heatmap = tf.squeeze(heatmap)

    # Work in float32 from here on (activations may be float16 under the
    # mixed-precision policy) and avoid 0/0 when the map is entirely zero.
    heatmap = tf.cast(tf.maximum(heatmap, 0), tf.float32)
    heatmap = tf.math.divide_no_nan(heatmap, tf.math.reduce_max(heatmap))
    return heatmap.numpy()

def display_grad_cam(X_val, y_val, model, idx_to_species, num_samples=5):
    """Plot Grad-CAM overlays for randomly chosen validation samples.

    Args:
        X_val: Array of spectrograms, one per sample, with a trailing
            channel axis.
        y_val: Integer label for each sample in ``X_val``.
        model: Trained model passed to ``generate_grad_cam``.
        idx_to_species: Mapping from label index to a display name.
        num_samples: Number of samples to plot; capped at ``len(X_val)``.
    """
    # random.sample raises when asked for more items than are available.
    num_samples = min(num_samples, len(X_val))
    indices = random.sample(range(len(X_val)), num_samples)
    for idx in indices:
        img = X_val[idx]
        label = y_val[idx]

        img_array = np.expand_dims(img, axis=0)
        heatmap = generate_grad_cam(model, img_array)

        # Upsample the coarse heatmap to the spectrogram's (mel, time) grid so
        # both layers can be drawn with the same specshow axes. Drawing it with
        # imshow in pixel coordinates put it on a different scale than the
        # time/mel axes and flipped it vertically.
        heatmap_full = tf.image.resize(
            heatmap[..., np.newaxis], img.shape[:2]
        ).numpy().squeeze(axis=-1)

        plt.figure(figsize=(10, 4))
        librosa.display.specshow(img.squeeze(), sr=SAMPLE_RATE, x_axis='time', y_axis='mel')
        overlay = librosa.display.specshow(
            heatmap_full, sr=SAMPLE_RATE, x_axis='time', y_axis='mel',
            alpha=0.5, cmap='jet',
        )
        plt.title(f"Grad-CAM for True Label: {idx_to_species[label]}")
        plt.colorbar(overlay)  # colour scale of the heatmap layer
        plt.show()

# X_val, y_val and model are the values left in the notebook namespace by the
# last iteration of the cross-validation loop.
print("\nGenerating Grad-CAM visualizations...")
display_grad_cam(
    X_val=X_val,
    y_val=y_val,
    model=model,
    idx_to_species=idx_to_species,
)


# 13. Binary Confusion Matrix per Species (NEW)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def binary_confusion_matrix(species_idx, y_true, y_pred, idx_to_species):
    """Plot a one-vs-rest confusion matrix for a single species.

    Args:
        species_idx: Label index of the species treated as the positive class.
        y_true: True label indices (array-like).
        y_pred: Predicted label indices (array-like).
        idx_to_species: Mapping from label index to a display name.
    """
    # np.asarray lets plain lists be compared element-wise as well.
    y_true_binary = (np.asarray(y_true) == species_idx).astype(int)
    y_pred_binary = (np.asarray(y_pred) == species_idx).astype(int)

    # Force a 2x2 matrix: without `labels`, a species absent from both vectors
    # yields a 1x1 matrix that does not match the two display labels below.
    cm = confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1])

    species_name = idx_to_species[species_idx]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Other', species_name])
    disp.plot(cmap='Blues')
    plt.title(f'Binary Confusion Matrix: {species_name} vs Rest')
    plt.show()

# Example usage:
# binary_confusion_matrix(species_idx=0, y_true=y_val, y_pred=y_pred_classes, idx_to_species=idx_to_species)


In [None]:
# 14. Inference Function
def predict_species(file_path, model, species_list):
    """Return the five highest-scoring species for one audio file.

    Args:
        file_path: Path of the audio file to classify.
        model: Model whose ``predict`` output is indexed like ``species_list``.
        species_list: Species identifiers, positioned by class index.

    Returns:
        List of ``(species, score)`` tuples ordered from highest to lowest
        score.
    """
    waveform = load_audio(file_path)
    spectrogram = audio_to_melspectrogram(waveform)

    # Add the batch axis in front and the channel axis at the end.
    batch = np.expand_dims(spectrogram, axis=(0, -1))
    scores = model.predict(batch)[0]

    # argsort is ascending: take the last five indices and flip them.
    best_first = scores.argsort()[-5:][::-1]
    return [(species_list[i], scores[i]) for i in best_first]

# Example:
# file_path = '/path/to/new/audio.ogg'
# predictions = predict_species(file_path, model, species_list)
# print(predictions)