In [None]:
# Run-mode switches read by the cells below.
# kaggle: selects the Kaggle input directory instead of the local data directory.
kaggle = True
submission = True # set to True before submitting: trains on the whole subset (no train/val split) and predicts the files listed in the sample submission

## Imports

In [None]:
import os
import numpy as np
import pandas as pd
# from tqdm.notebook import tqdm # loading bar
from tqdm import tqdm # loading bar

import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

## Training

In [None]:
# Root of the competition data: Kaggle input mount when `kaggle` is set, local checkout otherwise.
DATA_DIR = '../input/birdclef-2024/' if kaggle else "../../data/raw"

# Metadata files
train_csv_path = os.path.join(DATA_DIR, "train_metadata.csv")
sample_submision_path = os.path.join(DATA_DIR, "sample_submission.csv")

# Audio folders
TRAIN_AUDIO_DIR = os.path.join(DATA_DIR, "train_audio/")
TEST_AUDIO_DIR = os.path.join(DATA_DIR, "test_soundscapes/")
UNLABELED_AUDIO_DIR = os.path.join(DATA_DIR, "unlabeled_soundscapes/")

In [None]:
train_df = pd.read_csv(train_csv_path)

# Add complete filepath.
# Column-wise Series.map replaces the row-wise DataFrame.apply(axis=1): it avoids
# building a Series per row and still yields a Series when the frame is empty.
train_df['filepath'] = train_df['filename'].map(
    lambda filename: os.path.join(TRAIN_AUDIO_DIR, filename)
)

# Filter out large files: only recordings strictly smaller than this many bytes are kept.
MAX_FILESIZE_BYTES = 1e6
train_df['filesize'] = train_df['filepath'].map(os.path.getsize)
train_df = train_df[train_df['filesize'] < MAX_FILESIZE_BYTES]

In [None]:
# Preview the filtered metadata. The sampled subset (`train_subset_df`) is only
# built in a later cell, so it cannot be displayed here on a fresh kernel.
train_df.head()

In [None]:
random_state = 43

# Subset configuration: a submission run keeps every class, a development run
# keeps only the most frequent ones and samples them more aggressively.
if submission:
    num_classes_to_keep = train_df['primary_label'].nunique()
    # Define the fraction of data to keep for classes with more labels
    fraction_to_keep = 0.1
else:
    num_classes_to_keep = 100
    fraction_to_keep = 0.05

# Classes with fewer than `min_count` recordings are kept whole; larger ones are down-sampled.
min_count = 50

# Recordings per class, most frequent first (despite the name these are raw counts, not weights).
class_weights = train_df['primary_label'].value_counts()

# Select the top classes to keep based on their frequencies
top_classes = class_weights.head(num_classes_to_keep).index.tolist()

# Collect one frame per kept class and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
class_subsets = []
for label, count in class_weights.head(num_classes_to_keep).items():
    class_rows = train_df[train_df['primary_label'] == label]
    if count >= min_count:
        # Randomly sample a fraction for classes with more labels.
        # NOTE(review): with `min`, a class keeps at most ~min_count rows and classes
        # just above min_count keep fewer rows than classes just below it — confirm
        # this is intended (rather than `max`).
        fraction = min(fraction_to_keep, min_count / count)
        class_rows = class_rows.sample(frac=fraction, random_state=random_state)
    class_subsets.append(class_rows)

# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
train_subset_df = pd.concat(class_subsets) if class_subsets else train_df.iloc[0:0]

# Shuffle the final DataFrame to mix the classes
train_subset_df = train_subset_df.sample(frac=1, random_state=random_state).reset_index(drop=True)


In [None]:
# Number of sampled recordings per species in the training subset
train_subset_df.primary_label.value_counts()

In [None]:
if submission: # No train val split: fit on the whole sampled subset
    X_train_files = train_subset_df.filepath
    # `train_train_df` only exists in the else-branch; the labels must come from
    # the same frame as the file paths.
    y_train = train_subset_df.primary_label
else:
    # Train val split (70/30, stratified on the species label)
    train_train_df, val_df = train_test_split(
        train_subset_df,
        test_size=0.3,
        stratify=train_subset_df.primary_label,
        random_state=random_state,
    )
    X_train_files = train_train_df.filepath
    X_val_files = val_df.filepath

    y_train = train_train_df.primary_label
    y_val = val_df.primary_label

### Features

In [None]:
def extract_features(audio_data, sample_rate=32000, mfcc=True, chroma=True, mel=True):
    """Summarise one audio signal as a single 1-D feature vector.

    Each enabled feature group is computed with librosa and averaged over time
    frames; the groups are concatenated in the order MFCC, chroma, mel.

    Parameters
    ----------
    audio_data : audio samples passed to librosa as `y`.
    sample_rate : sampling rate passed to librosa as `sr`.
    mfcc : include the time-averaged 40 MFCCs.
    chroma : include the time-averaged chroma STFT.
    mel : include the time-averaged mel spectrogram.

    Returns
    -------
    numpy.ndarray
        1-D vector; empty when every group is disabled.
    """
    # Start from an empty float array so the result has the same dtype and shape
    # as repeatedly stacking onto `np.array([])`, even with no group enabled.
    pieces = [np.array([])]

    if mfcc:  # Mel-Frequency Cepstral Coefficients
        mfcc_frames = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)
        pieces.append(np.mean(mfcc_frames.T, axis=0))
    if chroma:
        chroma_frames = librosa.feature.chroma_stft(y=audio_data, sr=sample_rate)
        pieces.append(np.mean(chroma_frames.T, axis=0))
    if mel:
        mel_frames = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
        pieces.append(np.mean(mel_frames.T, axis=0))

    return np.hstack(pieces)

In [None]:
def extract_features_filepaths(X_files, sample_rate=32000):
    """Load every audio file in `X_files` and build the feature matrix.

    Parameters
    ----------
    X_files : sized iterable of audio file paths.
    sample_rate : rate the audio is loaded at, also forwarded to `extract_features`.

    Returns
    -------
    numpy.ndarray
        One row of features per file, in the order of `X_files`.
    """
    def _features_for(filepath):
        # Decode the file at the requested rate, then summarise it.
        waveform, _ = librosa.load(filepath, sr=sample_rate)
        return extract_features(waveform, sample_rate)

    # tqdm shows a progress bar while the files are processed.
    progress = tqdm(X_files, desc='Processing files', total=len(X_files))
    return np.array([_features_for(filepath) for filepath in progress])


In [None]:
# Build the training feature matrix: one row per file in X_train_files
# (every audio file is loaded and processed here, with a progress bar).
X_train = extract_features_filepaths(X_train_files)

In [None]:
# Learn the label -> integer id mapping from the training labels, then encode them.
# `label_encoder.classes_` later provides the column names of the results table.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

In [None]:
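# Validation features/labels (disabled). Only usable when `submission` is False,
# because X_val_files and y_val are created only in that branch of the split cell.
#X_val = extract_features_filepaths(X_val_files)

# y_val_encoded = label_encoder.transform(y_val)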

### Model

In [None]:
# Fit the random forest on the extracted training features.
# The hyper-parameters are fixed here; the forest's seed (42) is set independently
# of the notebook-level `random_state`.
rf_params = dict(max_depth=20, min_samples_leaf=2, n_estimators=300, random_state=42)
best_classifier = RandomForestClassifier(**rf_params)
best_classifier.fit(X_train, y_train_encoded)

In [None]:
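# Validation predictions (disabled): requires X_val from the commented-out validation feature cell.
# y_val_pred_proba = best_classifier.predict_proba(X_val)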

## Testing and submission

In [None]:
def extract_numbers(row_id):
    """Return the second '_'-separated field of `row_id`.

    Raises IndexError when `row_id` contains no underscore.
    """
    return row_id.split('_')[1]

# Build the list of soundscape ids to predict on.
# Submission run: the ids are taken from the row ids of the sample submission.
# Development run: a single hard-coded soundscape id is used.
if not submission:
    file_list = ['1000170626']
else:
    test_df = pd.read_csv(sample_submision_path)
    test_df['numbers'] = test_df['row_id'].map(extract_numbers)
    file_list = test_df['numbers'].unique().tolist()
    print('Number of test soundscapes:', len(file_list))

In [None]:
# Function to split audio file into chunks of given duration
def split_audio(path, duration, sr):
    """Load an audio file and cut it into consecutive chunks.

    Parameters
    ----------
    path : audio file to load with librosa.
    duration : chunk length in seconds; may be fractional.
    sr : sampling rate forwarded to `librosa.load`.

    Returns
    -------
    list
        Consecutive slices of the signal. Every slice holds
        `int(duration * rate)` samples except possibly the last, which holds
        whatever remains.

    Raises
    ------
    ValueError
        If the chunk length works out to less than one sample.
    """
    sig, rate = librosa.load(path, sr=sr)
    # range() and slicing need an integer step; a float duration such as 5.0
    # would otherwise fail with a TypeError.
    chunk_size = int(duration * rate)
    if chunk_size <= 0:
        raise ValueError(
            f"duration * sample rate must be at least one sample, got {duration} * {rate}"
        )
    return [sig[start:start + chunk_size] for start in range(0, len(sig), chunk_size)]

In [None]:
# This is where we will store our results: one row id and one probability
# vector (ordered like best_classifier's classes) per audio chunk.
pred = {'row_id': [], 'proba': []}

# Length of one prediction window in seconds; used for both splitting and row ids.
chunk_seconds = 5

# Process audio files and make predictions
for afile in file_list:

    if submission:
        # NOTE(review): per the competition data description, test files are named
        # soundscape_<id>.ogg (singular), matching the row-id prefix that
        # extract_numbers parses — confirm against the populated test folder.
        filename = 'soundscape_' + afile + '.ogg'
        path = os.path.join(TEST_AUDIO_DIR, filename)
    else:
        filename = afile + '.ogg'
        path = os.path.join(UNLABELED_AUDIO_DIR, filename)

    # Split audio file into 5-second chunks
    audio_chunks = split_audio(path, duration=chunk_seconds, sr=32000)
    if not audio_chunks:
        # An empty recording yields no chunks and therefore no rows.
        continue

    # Row ids follow the sample submission layout soundscape_<id>_<end_time>,
    # where end_time is the end of the chunk in seconds.
    for i in range(len(audio_chunks)):
        chunk_end_time = (i + 1) * chunk_seconds
        pred['row_id'].append(f"soundscape_{afile}_{chunk_end_time}")

    # Score all chunks of the file in one predict_proba call instead of one call
    # per chunk; the per-chunk probabilities are identical, the overhead is not.
    X_test = np.stack([extract_features(chunk) for chunk in audio_chunks])
    y_pred_proba = best_classifier.predict_proba(X_test)
    pred['proba'].extend(y_pred_proba)

In [None]:
# One probability column per class known to the label encoder, in encoder order
species_columns = list(label_encoder.classes_)
results = pd.DataFrame(pred['proba'], columns=species_columns)

# Put the 'row_id' column in front of the probability columns
results.insert(0, 'row_id', pred['row_id'])

In [None]:
# Preview the first rows of the prediction table (row_id followed by one column per class)
results.head()

In [None]:
# Write the prediction table to submission.csv in the working directory,
# without the DataFrame index column.
results.to_csv("submission.csv", index=False)  