In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import torch.nn as nn
from transformers import AutoModel, AutoModelForImageClassification, AutoImageProcessor, AutoConfig
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, get_cosine_schedule_with_warmup
import torch.nn.functional as F
import torchaudio

from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

import random
import matplotlib.pyplot as plt
from pathlib import Path

# Size of the label space; later cells use it for one-hot labels and for the
# width of the stored logits arrays.
NUM_CLASSES = 207

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# Dataset of precomputed spectrogram tensors (a torch Dataset, not a DataLoader)
class BirdTrainDatasetPrecomputed(Dataset):
    """Dataset of precomputed spectrogram tensors stored as individual ``.pt`` files.

    Two indexing modes are supported:

    * ``sample_random_ms=False``: one item per row of ``labels_df``; the row's
      ``file_path`` is loaded and its ``label`` is used.
    * ``sample_random_ms=True``: one item per row of ``counts_df``; the row's
      ``file_path`` is treated as a directory and a random ``<k>.pt`` with
      ``0 <= k < count`` is loaded. Its label is looked up in ``labels_df`` by
      the full file path.

    Optional augmentations are rectangular zero-masking (``xy_masking``) and a
    time-axis cutmix with a randomly drawn second sample (``horizontal_cutmix``).

    Args:
        counts_df: DataFrame with ``file_path`` and ``count`` columns (only read
            when ``sample_random_ms`` is True).
        labels_df: DataFrame with ``file_path`` and ``label`` columns.
        data_path: Stored on ``self.path``; not otherwise used by this class
            (file locations come from the DataFrames).
        use_cutmix: If True, each item is cutmixed with probability 0.5.
        use_masking: If True, ``xy_masking`` is applied to every loaded spectrogram.
        num_classes: Length of the one-hot / soft label vectors.
        sample_random_ms: Selects the indexing mode described above.
    """

    def __init__(self, counts_df, labels_df, data_path='data/precomputed_spectrograms/spectrograms', use_cutmix=True, use_masking=True, num_classes = 206, sample_random_ms = False):
        self.path = data_path
        self.use_cutmix = use_cutmix
        self.use_masking = use_masking
        self.num_classes = num_classes
        self.sample_random_ms = sample_random_ms
        # Path-indexed copy used for label lookup in sample_random_ms mode.
        self.labels_df_indexed = labels_df.set_index('file_path')
        self.labels_df = labels_df
        self.counts_df = counts_df

    def __len__(self):
        """Number of rows in the DataFrame that drives the current indexing mode."""
        if self.sample_random_ms:
            return len(self.counts_df)
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Load one (optionally augmented) sample.

        Negative indices count from the end, as for a list.

        Returns:
            dict with ``pixel_values`` (the spectrogram tensor), ``labels`` (float
            vector of length ``num_classes``; one-hot, or a two-class soft label
            when cutmix was applied), ``file_name`` (str path of the primary
            sample) and ``index`` (the resolved, non-negative index).

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        # Resolve negative indices here. get_path_and_label treats -1 as
        # "pick a random sample", so passing a raw -1 through would silently
        # return a random item instead of the last one.
        size = len(self)
        requested = idx
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index {requested} is out of range for dataset of length {size}")

        path, label = self.get_path_and_label(idx)
        # NOTE: torch.load unpickles the file; only point this at trusted data.
        spec = torch.load(path)

        if self.use_cutmix and random.random() < 0.5:
            # Default argument -> a randomly chosen partner sample.
            mix_path, mix_label = self.get_path_and_label()
            mix_spec = torch.load(mix_path)

            if self.use_masking:
                spec = self.xy_masking(spec)
                mix_spec = self.xy_masking(mix_spec)

            spec, label = self.horizontal_cutmix(spec, label, mix_spec, mix_label)

        else:
            if self.use_masking:
                spec = self.xy_masking(spec)
            label = F.one_hot(torch.tensor(label), self.num_classes).float()

        return {
            "pixel_values": spec,
            "labels": label,
            "file_name": str(path),
            "index": idx
        }

    def get_path_and_label(self, idx = -1):
        """Return ``(path, label)`` for row ``idx``.

        ``idx == -1`` (the default) is a sentinel meaning "draw a uniformly
        random row"; it does NOT mean "last row".
        """
        if idx == -1:
            idx = random.randint(0, len(self) - 1)

        if self.sample_random_ms:
            # Single row lookup; the row names a directory holding `count` files.
            row = self.counts_df.iloc[idx]
            dir_path = Path(row['file_path'])
            filename = random.randint(0, int(row['count']) - 1)
            path = dir_path / f"{filename}.pt"
            label = self.labels_df_indexed.loc[str(path)]['label']
            return path, label

        row = self.labels_df.iloc[idx]
        return row['file_path'], row['label']

    def xy_masking(self, spec, num_x_masks=2, num_y_masks=1, max_width=10, max_height=10):
        """
        Applies vertical (x) and horizontal (y) rectangular zero-masks to the spectrogram.

        ``spec`` must be a 3-D tensor; the last two dimensions are treated as
        (height, width). The input is not modified; a masked clone is returned.
        """
        cloned = spec.clone()
        _, height, width = cloned.shape

        # Apply x-masks (vertical): zero a band of 1..max_width columns.
        for _ in range(num_x_masks):
            w = random.randint(1, max_width)
            x = random.randint(0, max(0, width - w))
            cloned[:, :, x:x+w] = 0.0

        # Apply y-masks (horizontal): zero a band of 1..max_height rows.
        for _ in range(num_y_masks):
            h = random.randint(1, max_height)
            y = random.randint(0, max(0, height - h))
            cloned[:, y:y+h, :] = 0.0

        return cloned

    def horizontal_cutmix(self, spec1, label1, spec2, label2, alpha=1.0):
        """
        Mix two spectrograms horizontally (along the time axis),
        and create soft labels using torch.nn.functional.one_hot.

        The cut column is drawn between 30% and 70% of ``spec1``'s width.
        Columns before the cut come from ``spec1``, the rest from ``spec2``.
        The label weight of ``label1`` is the cut position divided by
        ``spec1``'s width. ``alpha`` is accepted but not used.

        Returns:
            ``(new_spec, mixed_label)``.
        """
        _, _, w = spec1.shape
        cut_point = random.randint(int(0.3 * w), int(0.7 * w))
        lam = cut_point / w

        # Concatenate spectrograms along the time axis (width)
        new_spec = torch.cat((spec1[:, :, :cut_point], spec2[:, :, cut_point:]), dim=2)

        # Convert scalar labels to one-hot vectors
        label1_onehot = F.one_hot(torch.tensor(label1), num_classes=self.num_classes).float()
        label2_onehot = F.one_hot(torch.tensor(label2), num_classes=self.num_classes).float()

        # Mix the labels in proportion to each clip's share of the time axis
        mixed_label = lam * label1_onehot + (1 - lam) * label2_onehot

        return new_spec, mixed_label


# Root folder of the precomputed spectrogram export. Every input of this cell
# lives under it, so relocating the data means editing only this one line.
PRECOMPUTED_DIR = Path('/home/andy/Desktop/BirdClef/customSED/data/precomputed_spectrograms')

labels_df = pd.read_csv(PRECOMPUTED_DIR / 'labels.csv')
counts_df = pd.read_csv(PRECOMPUTED_DIR / 'counts.csv')

# Fail fast if a label cannot be one-hot encoded with NUM_CLASSES entries,
# instead of failing later in the middle of a long inference run.
if not labels_df['label'].between(0, NUM_CLASSES - 1).all():
    raise ValueError(f"labels.csv contains labels outside [0, {NUM_CLASSES - 1}]")

# Augmentations are switched off and indexing is one item per labels_df row,
# so every index always maps to the same file and the same tensor.
full_ds = BirdTrainDatasetPrecomputed(
    counts_df=counts_df,
    labels_df=labels_df,
    data_path=str(PRECOMPUTED_DIR / 'spectrograms'),
    use_cutmix=False,
    use_masking=False,
    num_classes=NUM_CLASSES,
    sample_random_ms=False
)

print(f"Full dataset size: {len(full_ds)}")

Full dataset size: 148905


In [3]:
# Checkpoints that make up the ensemble (RegNet-Y-008 and EfficientNet-B2, folds 0-4).
paths = [
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold0/checkpoint-14000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold1/checkpoint-14000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold2/checkpoint-14000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold3/checkpoint-14000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold4/checkpoint-12000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold0/checkpoint-12000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold1/checkpoint-14000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold2/checkpoint-14000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold3/checkpoint-14000",
    "/home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold4/checkpoint-13000",
]

# Load each checkpoint, switch it to eval mode and keep it on the CPU.
# The inference cell moves models to the compute device one at a time.
models = []
for path in paths:
    # eval() and to() both return the module itself, so the calls can be chained.
    model = AutoModelForImageClassification.from_pretrained(path).eval().to("cpu")
    models.append(model)
    print(f"loaded model {path}")

loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold0/checkpoint-14000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold1/checkpoint-14000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold2/checkpoint-14000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold3/checkpoint-14000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/facebook/regnet-y-008_fold4/checkpoint-12000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold0/checkpoint-12000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold1/checkpoint-14000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold2/checkpoint-14000
loaded model /home/andy/Desktop/BirdClef/customSED/ensemble_runs/google/efficientnet-b2_fold3/checkpoint-14000
loaded

In [4]:
# For every ensemble member: move it to the compute device, run it over the
# whole dataset in fixed-size chunks, and keep its raw logits as an
# (N, NUM_CLASSES) float32 array. models_logits ends up with one array per model.
# Note: each full_ds[i] access loads its spectrogram file again, so every file
# is read once per model.
models_logits = []
batch_size = 400
N = len(full_ds)

for model in models:
    model.to(device)
    model.eval()

    # Pre-allocate the output rows for this model.
    this_logits = np.zeros((N, NUM_CLASSES), dtype=np.float32)

    try:
        # Walk the dataset in consecutive index ranges [start_idx, end_idx).
        for start_idx in tqdm(range(0, N, batch_size), desc=f"Inferencing {model.__class__.__name__}"):
            end_idx = min(start_idx + batch_size, N)

            # Each full_ds[i]['pixel_values'] is one spectrogram tensor;
            # stacking along a new leading dim builds the batch.
            batch_tensor = torch.stack(
                [full_ds[i]["pixel_values"] for i in range(start_idx, end_idx)],
                dim=0
            ).to(device)

            with torch.no_grad():
                outputs = model(pixel_values=batch_tensor)
                logits_batch = outputs.logits.cpu().numpy()

            # Rows are written at the dataset indices they were computed from.
            this_logits[start_idx:end_idx, :] = logits_batch
    finally:
        # Always hand the device memory back, even if inference failed part-way.
        # Otherwise a crashed run leaves this model on the GPU for the retry.
        model.to("cpu")
        torch.cuda.empty_cache()

    models_logits.append(this_logits)

Inferencing RegNetForImageClassification: 100%|██████████| 373/373 [03:05<00:00,  2.01it/s]
Inferencing RegNetForImageClassification: 100%|██████████| 373/373 [03:08<00:00,  1.98it/s]
Inferencing RegNetForImageClassification: 100%|██████████| 373/373 [03:03<00:00,  2.04it/s]
Inferencing RegNetForImageClassification: 100%|██████████| 373/373 [03:13<00:00,  1.93it/s]
Inferencing RegNetForImageClassification: 100%|██████████| 373/373 [03:16<00:00,  1.90it/s]
Inferencing EfficientNetForImageClassification: 100%|██████████| 373/373 [04:24<00:00,  1.41it/s]
Inferencing EfficientNetForImageClassification: 100%|██████████| 373/373 [04:37<00:00,  1.35it/s]
Inferencing EfficientNetForImageClassification: 100%|██████████| 373/373 [04:41<00:00,  1.33it/s]
Inferencing EfficientNetForImageClassification: 100%|██████████| 373/373 [04:39<00:00,  1.34it/s]
Inferencing EfficientNetForImageClassification: 100%|██████████| 373/373 [04:39<00:00,  1.34it/s]


In [5]:
print(np.array(models_logits).shape)

(10, 148905, 207)


In [None]:
# Sanity check: re-run the first ensemble member on ten consecutive samples and
# compare its prediction with the one recovered from the stored logits and with
# the label stored in the dataset.
start = 427
model = models[0]  # loop-invariant, so looked up once
model_device = next(model.parameters()).device  # run wherever the model currently lives

for i in range(start, start + 10):
    # Load the sample once; every full_ds[i] access reads the file from disk.
    sample = full_ds[i]
    spec = sample["pixel_values"].unsqueeze(0).to(model_device)  # add batch dimension
    with torch.no_grad():
        outputs = model(pixel_values=spec)
        logits = outputs.logits.cpu().numpy()
        pred = np.argmax(logits, axis=1)

    stored_pred = np.argmax(models_logits[0][i], axis=0)
    true_label = np.argmax(sample["labels"].numpy())

    print(f"Model prediction for sample {i}: {pred[0]}, stored prediction: {stored_pred}, true label: {true_label}")



In [7]:
# Classes with fewer than `threshold` samples are treated as rare.
threshold = 8
class_counts = labels_df['label'].value_counts()
rare_classes = class_counts.loc[class_counts.lt(threshold)].index.to_numpy()

In [8]:
# Strategy 1: take the arg-max class of the (averaged) model logits for every
# sample. Where it disagrees with the given label, relabel the sample as
# "no-call", unless the given label belongs to a rare class.

# 1.1) Choose which ensemble members vote. (0,) uses only the first model,
#      which is what this cell has always computed. Set it to
#      tuple(range(len(models_logits))) to average the whole ensemble.
FILTER_MODEL_INDICES = (0,)
if not FILTER_MODEL_INDICES:
    raise ValueError("FILTER_MODEL_INDICES must name at least one model")

# Accumulate instead of stacking so only one extra (N, NUM_CLASSES) array is held.
avg_logits = sum(models_logits[m] for m in FILTER_MODEL_INDICES) / len(FILTER_MODEL_INDICES)
final_predictions = np.argmax(avg_logits, axis=1)  # shape (N,)

# 1.2) Pull out the given labels as a NumPy array
true_labels = labels_df['label'].to_numpy()  # shape (N,)

# Predictions are row-aligned with labels_df; a length mismatch means the
# logits were computed for a different dataset than the labels loaded here.
if final_predictions.shape != true_labels.shape:
    raise ValueError(
        f"predictions {final_predictions.shape} and labels {true_labels.shape} are not aligned"
    )

# 1.3) "no-call" is the last class index (NUM_CLASSES - 1)
NO_CALL_CLASS = NUM_CLASSES - 1

# 1.4) Rows whose prediction disagrees with the given label ...
mismatch_mask = (final_predictions != true_labels)

# ... except rows of rare classes, which always keep their label so the few
# examples of those classes are not relabelled away.
protect_rare = np.isin(true_labels, rare_classes)
mismatch_mask[protect_rare] = False

# 1.5) Copy the labels and overwrite the remaining mismatches with no-call
new_targets = true_labels.copy()  # shape (N,)
new_targets[mismatch_mask] = NO_CALL_CLASS

# 1.6) Store the result in a copy of labels_df and save it to CSV
new_labels_df = labels_df.copy()
new_labels_df['label'] = new_targets
new_labels_df.to_csv('/home/andy/Desktop/BirdClef/customSED/data/precomputed_spectrograms/filtered_labels_base_models.csv', index=False)

In [9]:
# Share of samples whose predicted class equals the original label.
accuracy = accuracy_score(y_true=true_labels, y_pred=final_predictions)
print("Overall accuracy: {:.4f}".format(accuracy))

Overall accuracy: 0.8765


In [10]:
# View some basic ds stats
def print_label_stats(df, name="Dataset"):
    """Print summary statistics of how many rows each label has.

    Counts the rows per distinct value of ``df['label']`` and prints the
    largest, smallest, mean and standard deviation of those counts, followed
    by a blank line.

    Args:
        df: DataFrame with a ``label`` column.
        name: Heading used in the first printed line.
    """
    per_label = df['label'].value_counts()

    # Assemble the whole report first, then emit it with a single print call.
    # The trailing empty entry produces the blank separator line.
    report = [
        f"{name} label statistics:",
        f"  Max:  {per_label.max()}",
        f"  Min:  {per_label.min()}",
        f"  Mean: {per_label.mean():.2f}",
        f"  Std:  {per_label.std():.2f}",
        "",
    ]
    print("\n".join(report))

# Compare the label distribution before and after relabelling.
for frame, title in ((labels_df, "Labels_ds"), (new_labels_df, "Relabelled_ds")):
    print_label_stats(frame, title)

# Same statistics for the relabelled data with the no-call rows left out.
# Despite its name, no_call_mask is True for the rows that are NOT no-call.
no_call_mask = new_labels_df['label'].ne(NO_CALL_CLASS)
print_label_stats(new_labels_df.loc[no_call_mask], "Relabelled_ds (no no-call)")

Labels_ds label statistics:
  Max:  4105
  Min:  2
  Mean: 595.80
  Std:  768.75

Relabelled_ds label statistics:
  Max:  15549
  Min:  2
  Mean: 601.70
  Std:  1266.22

Relabelled_ds (no no-call) label statistics:
  Max:  3923
  Min:  2
  Mean: 527.34
  Std:  699.04

