In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torchvision
import librosa as lb
import torch.nn as nn
from torchvision.models import efficientnet_v2_m

class CFG:
    """Settings shared by every inference step in this notebook.

    All values are plain instance attributes set in ``__init__``; the rest
    of the notebook reads them through the module-level ``config`` object.
    """

    def __init__(self):
        # --- runtime ---
        self.device = torch.device("cpu")
        # Size of the classifier head's output layer.
        self.num_classes = 182

        # --- audio framing ---
        # Rate passed to the audio loader as ``sr``.
        self.sample_rate = 32000
        # Multiplied by sample_rate to get the chunk size in samples.
        self.max_length_s = 5

        # --- mel-spectrogram parameters (passed straight to librosa) ---
        self.n_fft = 2048
        self.hop_length = 512
        self.n_mels = 224
        self.f_min = 20
        self.f_max = 16000

        # --- filesystem locations ---
        self.model_path = "model.pth"
        self.data_path = "data/unlabeled_soundscapes/"
        self.output_path = "submission.csv"

config = CFG()

# Build the EfficientNetV2-M backbone with default constructor arguments and
# swap its final linear layer for one that emits `config.num_classes` outputs.
model = efficientnet_v2_m()
model.classifier[1] = nn.Linear(
    in_features=model.classifier[1].in_features,
    out_features=config.num_classes,
)

# Restore the weights stored at `config.model_path`, mapped onto the
# configured device.
# NOTE(review): torch.load relies on pickle — only load checkpoints from a
# trusted source.
model.load_state_dict(
    torch.load(config.model_path, map_location=config.device)
)

# Move to the target device and switch to evaluation mode for inference.
model.to(config.device)
model.eval()


In [2]:
def list_audio_files(data_dir):
    """List the ``.ogg`` files located directly inside ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        A single ``"filepath"`` column holding ``data_dir`` joined with each
        matching file name, ordered by file name. Zero rows if nothing
        matches.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``data_dir`` cannot be read.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the output row order) reproducible.
    ogg_names = sorted(name for name in os.listdir(data_dir) if name.endswith(".ogg"))
    filepaths = [os.path.join(data_dir, name) for name in ogg_names]
    return pd.DataFrame({"filepath": filepaths})

def process_audio(file_path):
    """Load an audio file and cut it into fixed-length chunks.

    The file is loaded as mono at ``config.sample_rate`` and split into
    consecutive, non-overlapping windows of
    ``config.sample_rate * config.max_length_s`` samples.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array per window, each exactly one window long. A trailing
        partial window is zero-padded to full length so every chunk yields a
        spectrogram of the same width. Empty list if the file has no samples.
    """
    audio, _ = lb.load(file_path, sr=config.sample_rate, mono=True)
    chunk_size = config.sample_rate * config.max_length_s
    chunks = [audio[start:start + chunk_size] for start in range(0, len(audio), chunk_size)]

    # Only the last window can be short; pad it with trailing zeros (silence)
    # rather than feeding the model an input of a different width.
    if chunks and len(chunks[-1]) < chunk_size:
        missing = chunk_size - len(chunks[-1])
        chunks[-1] = np.pad(chunks[-1], (0, missing))
    return chunks

def mel_spectrogram(chunk):
    """Turn one audio chunk into a 3-channel log-mel spectrogram.

    Parameters
    ----------
    chunk : numpy.ndarray
        Audio samples for a single window.

    Returns
    -------
    numpy.ndarray
        The dB-scaled mel spectrogram repeated three times along a new
        leading axis, i.e. shape ``(3, *spectrogram.shape)``.
    """
    power_spec = lb.feature.melspectrogram(
        y=chunk,
        sr=config.sample_rate,
        n_fft=config.n_fft,
        hop_length=config.hop_length,
        n_mels=config.n_mels,
        fmin=config.f_min,
        fmax=config.f_max,
    )
    # dB values are taken relative to this chunk's own maximum power.
    db_spec = lb.power_to_db(power_spec, ref=np.max)

    # Three identical copies stacked on axis 0 to form a 3-channel input.
    return np.stack([db_spec, db_spec, db_spec], axis=0)


def load_species_labels(metadata_path):
    """Return the distinct ``primary_label`` values of a metadata CSV, sorted.

    Parameters
    ----------
    metadata_path : str
        Path to a CSV file that contains a ``primary_label`` column.

    Returns
    -------
    list
        The unique values of ``primary_label`` in ascending order.
    """
    metadata = pd.read_csv(metadata_path)
    distinct_labels = metadata['primary_label'].unique()
    return sorted(distinct_labels)

# Sorted class names; used as the prediction column names when writing the CSV.
# NOTE(review): presumably this order matches the model's output order — confirm
# against the training code.
LABELS = load_species_labels('data/train_metadata.csv')

In [3]:
def generate_predictions(chunks, model, config):
    """Run the model on every chunk and collect per-chunk probabilities.

    Parameters
    ----------
    chunks : iterable of numpy.ndarray
        Audio chunks, each converted with ``mel_spectrogram``.
    model : torch.nn.Module
        Network applied to a batch containing one spectrogram.
    config : CFG
        Supplies ``device``, where each input tensor is placed.

    Returns
    -------
    list of numpy.ndarray
        Sigmoid of the model output for each chunk, in input order. Every
        entry keeps the leading batch dimension of size 1.
    """
    outputs = []
    # Inference only: no gradient tracking for any of the forward passes.
    with torch.no_grad():
        for chunk in chunks:
            features = mel_spectrogram(chunk)
            batch = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
            batch = batch.to(config.device)
            probabilities = torch.sigmoid(model(batch))
            outputs.append(probabilities.cpu().numpy())
    return outputs

def save_predictions_to_csv(filename, predictions):
    """Append one soundscape's predictions to ``config.output_path``.

    Parameters
    ----------
    filename : str
        Soundscape identifier embedded in each ``row_id``.
    predictions : list of numpy.ndarray
        Per-chunk model outputs in chunk order. Each entry needs a
        singleton axis 1 once stacked (e.g. shape ``(1, len(LABELS))``).

    Notes
    -----
    Rows are labelled ``soundscape_<filename>_<end>``, where ``<end>`` is
    ``(chunk_index + 1) * config.max_length_s``. The header is written only
    when the output file is missing or empty. Nothing is written for an
    empty ``predictions`` list.
    """
    # A file that produced no chunks has no rows to write; squeezing an
    # empty array on axis 1 would raise instead.
    if len(predictions) == 0:
        return

    predictions = np.squeeze(np.array(predictions), axis=1)

    row_ids = [f"soundscape_{filename}_{(i+1)*config.max_length_s}" for i in range(len(predictions))]

    pred_df = pd.DataFrame(predictions, columns=LABELS)
    pred_df.insert(0, "row_id", row_ids)

    # Treat a zero-byte file like a missing one so the CSV always starts
    # with a header line.
    needs_header = (
        not os.path.exists(config.output_path)
        or os.path.getsize(config.output_path) == 0
    )
    pred_df.to_csv(config.output_path, mode="a", index=False, header=needs_header)

In [4]:
# One row per .ogg file found directly under config.data_path.
audio_df = list_audio_files(config.data_path)

In [None]:
# save_predictions_to_csv appends to the output file, so a file left over
# from a previous run would end up with duplicated rows. Start clean so that
# re-running this cell always yields the same CSV.
if os.path.exists(config.output_path):
    os.remove(config.output_path)

for file_path in audio_df["filepath"]:
    # Soundscape id = file name without directory or extension. os.path copes
    # with any path separator and strips only the trailing extension.
    filename = os.path.splitext(os.path.basename(file_path))[0]

    chunks = process_audio(file_path)

    predictions = generate_predictions(chunks, model, config)

    save_predictions_to_csv(filename, predictions)

print(f"Predictions saved to {config.output_path}")