# In [None]:
import time
START = time.time()
from concurrent.futures import ThreadPoolExecutor
import glob
import librosa
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as hub
import os
tf.experimental.numpy.experimental_enable_numpy_behavior()

# Wall-clock deadline, 5300 seconds after START. Once it has passed,
# bvc_result stops running inference and returns placeholder rows instead.
TERMINATE_TIME = START + 5300

# Load the trained species classifier (Keras .h5 format).
trained_model = tf.keras.models.load_model('/kaggle/input/yamnwave/other/default/1/yamnet_species_classifier.h5')

# Load the YAMNet model from TensorFlow Hub.
# NOTE(review): the handle is a URL, so loading presumably needs network
# access or a pre-populated TF Hub cache — confirm this works where the
# script is actually executed.
yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
yamnet_model = hub.load(yamnet_model_handle)

# Submission columns: every column of the sample submission after the first,
# plus a label -> column-position lookup.
primary_labels = pd.read_csv('/kaggle/input/birdclef-2025/sample_submission.csv').columns[1:].to_list()
primary_labels_indices = range(len(primary_labels))
primary_labels_map = dict(zip(primary_labels, primary_labels_indices))

# Taxonomy: common name -> primary label, and common name -> column position.
taxonomy = pd.read_csv('/kaggle/input/birdclef-2025/taxonomy.csv', index_col='common_name')['primary_label']
taxonomy_map = taxonomy.map(primary_labels_map)
common_names = taxonomy.index.to_list()

# Assuming the classifier's output labels are the primary labels, in the same
# order; replace bc_labels with the model's own label list if it differs.
bc_labels = primary_labels
bc_labels_indices = range(len(bc_labels))
primary_labels_map_bc = dict(zip(bc_labels, bc_labels_indices))

# Index one past the classifier's last output. bvc_result appends a zero
# column at this position, so labels the classifier does not know read as 0.
birdclassifier_last = len(bc_labels)

# For each submission column, the classifier output column to read from.
# The lookup is against the classifier's label map, so a label missing from
# bc_labels really does fall back to the padded zero column.
birdclassifier_indices = [primary_labels_map_bc.get(pl, birdclassifier_last) for pl in primary_labels]

# Count only the submission labels that map to a real classifier output.
total_predicted_species = sum(idx != birdclassifier_last for idx in birdclassifier_indices)
print(f'Note: we can predict {total_predicted_species} species only!')

# Get all the data files
def get_oggs(test_dir='/kaggle/input/birdclef-2025/test_soundscapes',
             train_dir='/kaggle/input/birdclef-2025/train_soundscapes'):
    """List the soundscape recordings to run inference on.

    The test soundscapes are used when ``test_dir`` contains at least one
    ``.ogg`` file; otherwise the training soundscapes in ``train_dir`` are
    used instead.

    Args:
        test_dir: Directory searched first for ``*.ogg`` files.
        train_dir: Fallback directory, used when ``test_dir`` has no
            ``*.ogg`` files.

    Returns:
        A list of ``(index, path, soundscape_id)`` tuples in sorted path
        order, where ``soundscape_id`` is the file name without its
        ``.ogg`` extension. The list is empty when neither directory
        contains a matching file.
    """
    # Glob each directory once, and sort both branches: glob's order is
    # arbitrary, and a stable order keeps the output rows reproducible.
    oggs = sorted(glob.glob(os.path.join(test_dir, '*.ogg')))
    if not oggs:
        oggs = sorted(glob.glob(os.path.join(train_dir, '*.ogg')))

    # Take the id from the base name rather than a '/'-anchored regex, which
    # would fail on paths that do not use '/' as their separator.
    return [
        (n, ogg, os.path.splitext(os.path.basename(ogg))[0])
        for n, ogg in enumerate(oggs)
    ]

# Soundscapes to score, as (index, path, soundscape id) tuples.
oggs = get_oggs()

# Process the files in threads
def bvc_result(ogg):
    """Score one soundscape in twelve consecutive 5-second windows.

    Args:
        ogg: An ``(index, path, soundscape_id)`` tuple as produced by
            ``get_oggs``; the index is not used here.

    Returns:
        A ``(row_ids, scores)`` pair. ``row_ids`` is the list
        ``'<soundscape_id>_5'`` ... ``'<soundscape_id>_60'`` and ``scores``
        is a NumPy array with one row per row id and one column per entry
        of ``primary_labels``. When the global deadline has passed, or when
        processing raises, every score is ``-1000``.
    """
    _, fname, ss_id = ogg
    # NOTE(review): the TF Hub YAMNet documentation specifies 16 kHz mono
    # input. Confirm that 32 kHz matches how the classifier's training
    # embeddings were produced before changing this value.
    sr = 32_000
    n_segments = 12
    segment_len = 5 * sr

    row_ids = [f'{ss_id}_{n}' for n in range(5, 5 * n_segments + 1, 5)]
    placeholder = -1000 * np.ones((n_segments, len(primary_labels)))

    # Out of time budget: skip inference but still emit the expected rows.
    if time.time() > TERMINATE_TIME:
        return row_ids, placeholder

    try:
        data, _ = librosa.load(fname, sr=sr)

        # Force exactly n_segments windows so the score matrix always has one
        # row per row id. Longer recordings are truncated and shorter ones
        # are zero-padded; otherwise the row ids and scores go out of step.
        total_len = n_segments * segment_len
        data = data[:total_len]
        if len(data) < total_len:
            data = np.pad(data, (0, total_len - len(data)))

        embeddings = []
        for segment in data.reshape(n_segments, segment_len):
            _, embedding, _ = yamnet_model(segment)
            # Average the embedding output over its first axis to get a
            # single vector for the whole window.
            embeddings.append(embedding.numpy().mean(axis=0))
        embeddings = np.array(embeddings)

        model_outputs = np.asarray(trained_model.predict(embeddings))
        # Append one all-zero column at index birdclassifier_last, so labels
        # the classifier does not cover read as a score of 0.
        model_outputs = np.pad(model_outputs, ((0, 0), (0, 1)))
        result = model_outputs[:, birdclassifier_indices]
        return row_ids, result

    except Exception as e:
        # Deliberate best-effort: one unreadable or failing file must not
        # abort the whole run, so log it and emit placeholder rows.
        print(f"Error processing {ss_id}: {e}")
        return row_ids, placeholder

# Score every soundscape on a small thread pool and collect the row ids and
# the per-file score blocks.
row_ids = []
result = []

with ThreadPoolExecutor(max_workers=4) as executor:
    # executor.map yields results in the order of `oggs`, so row ids and
    # score blocks stay aligned.
    for ogg_row_ids, ogg_result in executor.map(bvc_result, oggs):
        row_ids.extend(ogg_row_ids)
        result.append(ogg_result)

# np.concatenate raises ValueError on an empty list, so fall back to a
# zero-row score matrix when no soundscapes were found. That still produces
# a well-formed, header-only submission file.
scores = np.concatenate(result) if result else np.empty((0, len(primary_labels)))

submission = pd.DataFrame(scores, columns=primary_labels)
submission['row_id'] = row_ids
# Put row_id first, followed by the label columns in submission order.
submission = submission[['row_id'] + primary_labels]

# Write CSV
submission.to_csv('submission.csv', index=False)

# Display submission DataFrame (`display` is provided by the notebook).
display(submission.head(20))
display(submission.tail(20))