In [2]:
import pandas as pd
import os

directory = "segment_audio"
dataset_files = [os.path.join(directory, subdir, file) for subdir in os.listdir(directory) for file in os.listdir(os.path.join(directory, subdir))]

In [9]:
import json

with open("map.txt","r") as file:
    map = file.read()
map = json.loads(map)
map_class_to_idx = {cl:idx for idx,cl in map.items()}

In [11]:
import random 

proxy = random.sample(dataset_files, 700)
classes = []
idx = []
file_paths = []
for file_path in proxy:
    class_name = os.path.basename(os.path.dirname(file_path))
    classes.append(class_name)
    idx.append(map_class_to_idx[class_name])
    file_paths.append(file_path)

In [12]:
file_paths

['segment_audio/compau/XC240013_segment_5.wav',
 'segment_audio/whtdov/XC635817_segment_1.wav',
 'segment_audio/bubwre1/XC654364_segment_7.wav',
 'segment_audio/trsowl/iNat692065_segment_11.wav',
 'segment_audio/speowl1/XC238949_segment_16.wav',
 'segment_audio/bubwre1/XC395982_segment_7.wav',
 'segment_audio/colcha1/XC716443_segment_25.wav',
 'segment_audio/41663/iNat1127617_segment_14.wav',
 'segment_audio/whtdov/XC703709_segment_50.wav',
 'segment_audio/grekis/iNat1004218_segment_3.wav',
 'segment_audio/yebsee1/XC844246_segment_1.wav',
 'segment_audio/banana/XC196936_segment_8.wav',
 'segment_audio/grbhaw1/XC123648_segment_21.wav',
 'segment_audio/crcwoo1/XC418618_segment_10.wav',
 'segment_audio/speowl1/XC75051_segment_10.wav',
 'segment_audio/yeofly1/XC479384_segment_20.wav',
 'segment_audio/yecspi2/XC648685_segment_7.wav',
 'segment_audio/ywcpar/XC345028_segment_4.wav',
 'segment_audio/palhor2/XC298940_segment_1.wav',
 'segment_audio/yebsee1/XC198798_segment_2.wav',
 'segment_aud

In [None]:
import torch
import torchaudio
#from birdset import ConvNextBirdSet
from bird_model import BirdsetModule  # assuming this is the training script filename
from torchaudio.transforms import Resample
from sklearn.metrics import roc_auc_score
import os

def load_model(checkpoint_path, num_classes=206, device='cpu'):
    model = BirdsetModule(num_classes=num_classes).to(device)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    return model

def preprocess_audio(audio_path, target_sample_rate=32000):
    waveform, sample_rate = torchaudio.load(audio_path)

    if sample_rate != target_sample_rate:
        resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # convert to mono
    return waveform.squeeze(0)  # shape: [time]

def infer_single_audio(model, audio_tensor, device='cpu'):
    model.eval()
    with torch.no_grad():
        audio_tensor = audio_tensor.to(device)
        logits = model(audio_tensor.unsqueeze(0))  # add batch dim
        probs = torch.softmax(logits, dim=1)
        #top_prob, top_idx = torch.max(probs, dim=1)
        return probs#top_idx.item(), top_prob.item(), probs.squeeze(0).cpu().numpy()  # return full probs for AUCROC

def main(file_paths, ground_truths):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint_path = "checkpoints/birdset_epoch3.pt"
    model = load_model(checkpoint_path, device=device)  

    y_true = []
    y_scores = []

    for audio_path, true_label in zip(file_paths, ground_truths):
        audio_tensor = preprocess_audio(audio_path)
        probs = infer_single_audio(model, audio_tensor, device=device)
        prob_vector = probs.squeeze(0).cpu().numpy()

        # One-hot true label vector
        y_true.append([1 if i == true_label else 0 for i in range(len(prob_vector))])
        y_scores.append(prob_vector)

        print(f"Predicted class: {prob_vector.argmax()}, Confidence: {prob_vector.max():.4f}, True class: {true_label}")

    if len(y_true) > 1:  # only compute if more than 1 sample
        from sklearn.metrics import roc_auc_score
        auc = roc_auc_score(
            y_true, y_scores, average='macro', multi_class='ovr'
        )
        print(f"\nAUC-ROC Score: {auc:.4f}")
    else:
        print("Not enough samples to compute ROC AUC (need >1).")


if __name__ == "__main__":
    main(file_paths=file_paths, ground_truths=idx)




Some weights of ConvNextForImageClassification were not initialized from the model checkpoint at DBD-research-group/ConvNeXT-Base-BirdSet-XCL and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9736]) in the checkpoint and torch.Size([206]) in the model instantiated
- classifier.weight: found shape torch.Size([9736, 1024]) in the checkpoint and torch.Size([206, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted class: 97, Confidence: 1.0000, True class: 97
Predicted class: 190, Confidence: 0.9966, True class: 190
Predicted class: 84, Confidence: 0.9974, True class: 84
Predicted class: 181, Confidence: 0.3033, True class: 181
Predicted class: 168, Confidence: 0.9470, True class: 168
Predicted class: 84, Confidence: 0.9634, True class: 84
Predicted class: 96, Confidence: 0.9859, True class: 96
Predicted class: 19, Confidence: 0.9984, True class: 19
Predicted class: 190, Confidence: 0.9586, True class: 190
Predicted class: 113, Confidence: 0.9879, True class: 113
Predicted class: 197, Confidence: 0.8405, True class: 197
Predicted class: 66, Confidence: 1.0000, True class: 66
Predicted class: 109, Confidence: 0.8921, True class: 109
Predicted class: 101, Confidence: 1.0000, True class: 101
Predicted class: 168, Confidence: 0.9992, True class: 168
Predicted class: 203, Confidence: 0.9999, True class: 203
Predicted class: 198, Confidence: 0.9998, True class: 198
Predicted class: 205, Conf

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [30]:
import torch
import torchaudio
#from birdset import ConvNextBirdSet
from bird_model import BirdsetModule  # assuming this is the training script filename
from torchaudio.transforms import Resample
import os

def load_model(checkpoint_path, num_classes=206, device='cpu'):
    model = BirdsetModule(num_classes=num_classes).to(device)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    return model

def preprocess_audio(audio_path, target_sample_rate=32000):
    waveform, sample_rate = torchaudio.load(audio_path)

    if sample_rate != target_sample_rate:
        resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    # shape: [1, time] -> remove channel if needed
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # convert to mono
    return waveform.squeeze(0)  # [time]

def infer_single_audio(model, audio_tensor, device='cpu', top_k=5):
    model.eval()
    with torch.no_grad():
        audio_tensor = audio_tensor.to(device)
        logits = model(audio_tensor.unsqueeze(0))  # add batch dim
        probs = torch.softmax(logits, dim=1)
        top_probs, top_indices = torch.topk(probs, k=top_k)
        return top_indices.squeeze(0), top_probs.squeeze(0)

def main(file_paths, ground_truths):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint_path = "checkpoints/birdset_epoch3.pt"
    model = load_model(checkpoint_path, device=device)  

    y_true = []
    y_scores = []

    for audio_path, true_label in zip(file_paths, ground_truths):
        audio_tensor = preprocess_audio(audio_path)
        indices,probs = infer_single_audio(model, audio_tensor, device=device)
        print(indices,probs)
        prob_vector = probs.squeeze(0).cpu().numpy()

        # One-hot true label vector
        y_true.append([1 if i == true_label else 0 for i in range(len(prob_vector))])
        y_scores.append(prob_vector)

        print(f"Predicted class: {prob_vector.argmax()}, Confidence: {prob_vector.max():.4f}, True class: {true_label}")
        
    if len(y_true) > 1:  # only compute if more than 1 sample
        from sklearn.metrics import roc_auc_score
        auc = roc_auc_score(
            y_true, y_scores, average='macro', multi_class='ovr'
        )
        print(f"\nAUC-ROC Score: {auc:.4f}")
    else:
        print("Not enough samples to compute ROC AUC (need >1).")


if __name__ == "__main__":
    main(file_paths=file_paths, ground_truths=idx)

Some weights of ConvNextForImageClassification were not initialized from the model checkpoint at DBD-research-group/ConvNeXT-Base-BirdSet-XCL and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9736]) in the checkpoint and torch.Size([206]) in the model instantiated
- classifier.weight: found shape torch.Size([9736, 1024]) in the checkpoint and torch.Size([206, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([ 97,  98, 181, 172, 114], device='cuda:0') tensor([9.9999e-01, 4.3750e-06, 9.5475e-07, 3.7356e-07, 1.0508e-07],
       device='cuda:0')
Predicted class: 0, Confidence: 1.0000, True class: 97
tensor([190, 132,  80, 124, 203], device='cuda:0') tensor([9.9665e-01, 6.9529e-04, 4.1019e-04, 2.4513e-04, 2.0689e-04],
       device='cuda:0')
Predicted class: 0, Confidence: 0.9966, True class: 190
tensor([ 84, 171,  70, 186, 136], device='cuda:0') tensor([9.9744e-01, 6.6111e-04, 2.5077e-04, 2.1998e-04, 2.0753e-04],
       device='cuda:0')
Predicted class: 0, Confidence: 0.9974, True class: 84
tensor([181, 190,  61, 161,  97], device='cuda:0') tensor([0.3033, 0.1975, 0.1537, 0.0663, 0.0371], device='cuda:0')
Predicted class: 0, Confidence: 0.3033, True class: 181
tensor([168,  97, 175, 114,  98], device='cuda:0') tensor([0.9470, 0.0168, 0.0156, 0.0136, 0.0028], device='cuda:0')
Predicted class: 0, Confidence: 0.9470, True class: 168
tensor([ 84,  70,  74, 155, 148], device='cuda:0') tenso

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.