In [11]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import numpy as np
import os

import tensorflow_hub as hub
import tensorflow as tf

import torchaudio
import torch
from torch.utils.data import DataLoader, Dataset


df = pd.read_csv('/kaggle/input/birdclef-2024/train_metadata.csv')
AUDIO_PATH = Path('/kaggle/input/birdclef-2024/train_audio')

model_path = 'https://kaggle.com/models/google/bird-vocalization-classifier/frameworks/TensorFlow2/variations/bird-vocalization-classifier/versions/4'
model = hub.load(model_path)
model_labels_df = pd.read_csv(hub.resolve(model_path) + "/assets/label.csv")

SAMPLE_RATE = 32000
WINDOW = 5*SAMPLE_RATE

In [2]:
index_to_label = sorted(df.primary_label.unique())
label_to_index = {v: k for k, v in enumerate(index_to_label)}
model_labels = {v: k for k, v in enumerate(model_labels_df.ebird2021)}
model_bc_indexes = [model_labels[label] if label in model_labels else -1 for label in index_to_label]

# filter out birds that the model doesn't predict
missing_birds = set(np.array(index_to_label)[np.array(model_bc_indexes) == -1])
missing_birds

{'bkrfla1', 'indrol2'}

In [None]:
# Save embeddings and predictions for every 5 sec non-overlapping audio¶


# use a torch dataloader to decode audio in parallel on CPU while GPU is running

class AudioDataset(Dataset):
    def __len__(self):
        return len(df)
    def __getitem__(self, i):
        filename = df.filename[i]
        audio = torchaudio.load(AUDIO_PATH / filename)[0].numpy()[0]
        return audio, filename
        
dataloader = DataLoader(AudioDataset(), batch_size=1, num_workers=os.cpu_count())


# embeddings are formated like {"filename": np.array(nx1280)} 
all_embeddings = {}

# predictiones formated like {"filename": np.array(nx264)} 
all_predictions = {}

# (where n = the number of non overlapping 5 sec chunks in the audio)

with tf.device('/gpu:0'):
    for audio, filename in tqdm(dataloader):
        audio = audio[0]
        filename = filename[0]
        file_embeddings = []
        file_predictions = []
        for i in range(0, len(audio), WINDOW):
            clip = audio[i:i+WINDOW]
            if len(clip) < WINDOW:
                clip = np.concatenate([clip, np.zeros(WINDOW - len(clip))])
            result = model.infer_tf(clip[None, :])
            file_embeddings.append(result[1][0].numpy())
            prediction = np.concatenate([result[0].numpy(), -100], axis=None) # add -100 logit for unpredicted birds
            file_predictions.append(prediction[model_bc_indexes])
        all_embeddings[filename] = np.stack(file_embeddings)
        all_predictions[filename] = np.stack(file_predictions)
        break

torch.save(all_embeddings, 'embeddings.pt')
torch.save(all_predictions, 'predictions.pt')

In [59]:
# Scores of predictions on the first 5 seconds of each recording¶

predicted_classes = torch.tensor([row[0].argmax() for row in all_predictions.values()])
actual_classes = torch.tensor([label_to_index[label] for label in df.primary_label])
correct = predicted_classes == actual_classes
accuracy = correct.float().mean()
accuracy

tensor(0.0043)