## Imports

In [43]:
# Copy the precomputed feature pickle from the attached input folder into the current working directory.
# NOTE(review): this runs unconditionally and assumes ../input/features exists — presumably Kaggle-only; confirm before running locally.
!cp ../input/features/extracted_features.pickle extracted_features.pickle

In [44]:
import os
import pickle
import numpy as np
import pandas as pd
# from tqdm.notebook import tqdm # loading bar
from tqdm import tqdm # loading bar
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import RandomOverSampler

In [45]:
# Treat the presence of KAGGLE_KERNEL_RUN_TYPE in the environment as the
# signal that the notebook is executing on Kaggle.
kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None
print("Running on Kaggle!" if kaggle else "Not running on Kaggle.")

Running on Kaggle!


In [46]:
def extract_numbers(filename):
    """Return the identifier part of a soundscape file name.

    Everything from the first '.' onward is discarded, and what remains is
    split on '_'. If that yields at least two pieces, the second piece is
    returned (e.g. 'soundscape_123.ogg' -> '123'); otherwise the whole
    remaining name is returned (e.g. '123.ogg' -> '123').
    """
    stem, _, _ = filename.partition('.')  # drop the extension
    pieces = stem.split('_')
    return pieces[1] if len(pieces) > 1 else pieces[0]

In [47]:
# Choose the input and output locations that match the current runtime.
if not kaggle:  # local work
    DATA_DIR = "../../data/raw/"
    OUTPUT_DIR = "../../data/processed/"
else:
    DATA_DIR = '../input/birdclef-2024/'
    OUTPUT_DIR = '/kaggle/working/'

# Training
train_csv_path = os.path.join(DATA_DIR, "train_metadata.csv")
TRAIN_AUDIO_DIR = os.path.join(DATA_DIR, "train_audio/")

# Testing
TEST_AUDIO_DIR = os.path.join(DATA_DIR, "test_soundscapes/")

# Sorted names of the .ogg files found in the test soundscape folder
test_file_list = sorted(name for name in os.listdir(TEST_AUDIO_DIR) if name.endswith('.ogg'))

if not test_file_list:
    # No test audio present: fall back to the unlabeled soundscapes and keep
    # only the first 5 files to speed up debugging.
    TEST_AUDIO_DIR = os.path.join(DATA_DIR, "unlabeled_soundscapes/")
    test_file_list = sorted(name for name in os.listdir(TEST_AUDIO_DIR) if name.endswith('.ogg'))[:5]

# Identifier of each test file, index-aligned with test_file_list
test_number_list = [extract_numbers(name) for name in test_file_list]

print(f'Directory used for testing: {TEST_AUDIO_DIR}')
print(f'Number of test files: {len(test_file_list)}')

Directory used for testing: ../input/birdclef-2024/unlabeled_soundscapes/
Number of test files: 5


In [48]:
# Sampling rate handed to librosa when loading audio and computing features.
SAMPLE_RATE = 32_000

## Data collection

In [None]:
# Load the training metadata table.
data = pd.read_csv(train_csv_path)

# Add complete filepath.
# Only the 'filename' column is needed, so map over that column instead of
# running a row-wise DataFrame.apply(axis=1): it avoids building a Series per
# row and always yields a Series, even for an empty table.
data['filepath'] = data['filename'].map(lambda name: os.path.join(TRAIN_AUDIO_DIR, name))

# # Filter out large files
# data['filesize'] = data.apply(lambda row: os.path.getsize(row['filepath']), axis=1)

In [None]:
# Inputs are the audio file paths; targets are the primary labels.
X_files, y = data['filepath'], data['primary_label']

In [None]:
# Sorted distinct primary labels; reused later as the submission column order.
list_species = sorted(data['primary_label'].unique())

## Features

In [None]:
def extract_features(audio_data, sample_rate=SAMPLE_RATE, n_mfcc=40):
    """Summarise an audio signal as the per-coefficient mean of its MFCCs.

    Parameters
    ----------
    audio_data : array-like
        Audio samples, passed to ``librosa.feature.mfcc`` as ``y``.
    sample_rate : int, optional
        Sampling rate of ``audio_data``. Defaults to ``SAMPLE_RATE``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40, the value that
        was previously hard-coded. NOTE(review): features cached on disk were
        presumably computed with 40 — keep the default when mixing with them.

    Returns
    -------
    numpy.ndarray
        Mean of the MFCC matrix along axis 1. The caller stacks these as 1-D
        vectors, one per clip (assumes a mono signal — TODO confirm for
        multi-channel input).
    """
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc)
    # Collapse axis 1 so clips of different lengths give equally sized vectors.
    return np.mean(mfccs, axis=1)

def load_data_and_extract_features(X_files, sample_rate=SAMPLE_RATE):
    """Load every audio file in ``X_files`` and stack their feature vectors.

    Each path is loaded with librosa at ``sample_rate`` and passed through
    ``extract_features``; a tqdm bar reports progress. The per-file results
    are returned as a single NumPy array, in the order of ``X_files``.
    """
    progress = tqdm(X_files, desc='Processing files', total=len(X_files))
    rows = [
        # librosa.load returns (samples, rate); only the samples are needed here
        extract_features(librosa.load(path, sr=sample_rate)[0], sample_rate=sample_rate)
        for path in progress
    ]
    return np.array(rows)

In [None]:
# Set to True to recompute the features from the audio files; when False the
# cached pickle at preprocessed_data_path is used instead.
preprocess = False

# Location of the cached feature array inside OUTPUT_DIR, and whether that
# file already exists at the time this cell runs.
preprocessed_data_path = os.path.join(OUTPUT_DIR, 'extracted_features.pickle')
preprocessed_data_exists = os.path.exists(preprocessed_data_path)

In [None]:
# Recompute the features when explicitly requested, or when there is no cache
# to fall back on (preprocess=False with a missing cache file used to raise
# FileNotFoundError).
if preprocess or not preprocessed_data_exists:
    X = load_data_and_extract_features(X_files)

    if not preprocessed_data_exists:  # avoid overwriting an existing cache file
        with open(preprocessed_data_path, "wb") as file:   # Pickling
            pickle.dump(X, file)

else:
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # was produced by this notebook or another trusted source.
    with open(preprocessed_data_path, "rb") as file:   # Unpickling
        X = pickle.load(file)

# Catch a stale or mismatched cache here rather than in a later cell.
assert len(X) == len(X_files), (
    f"Feature rows ({len(X)}) do not match the number of audio files ({len(X_files)}); "
    "rerun with preprocess = True"
)
    

## Label Encoding

In [None]:
# Fit the encoder on the primary labels and keep it around: label_encoder.classes_
# is used later to name the probability columns of the submission.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Histogram of class sizes: how many primary labels have a given number of
# training rows. Explicit axes plus a title and labels keep the figure
# readable on its own.
fig, ax = plt.subplots()
sns.histplot(data['primary_label'].value_counts(), bins='rice', ax=ax)
ax.set(
    title='Class balance of the training set',
    xlabel='Training rows per primary label',
    ylabel='Number of primary labels',
);

## Balancing

In [None]:
# ros = RandomOverSampler(random_state=42)

# X_resampled, y_encoded_resampled = ros.fit_resample(X, y_encoded)

In [None]:
# pd.Series(y_encoded_resampled).value_counts()

In [None]:
# X_resampled.shape

## Model Training

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_encoded_resampled, test_size=0.2, random_state=42)

# # Train the classifier with the best parameters
# rf = RandomForestClassifier(max_depth=20, min_samples_leaf=2, n_estimators=300, random_state=42, verbose=2, n_jobs=-1)
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [None]:
# # Split the data into train, validation, and test sets
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_encoded_resampled, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Two-stage stratified split: hold out 20% of the data for testing, then 20%
# of what is left for validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42, stratify=y_trainval
)

In [None]:
# Set to True to train a new model even when a saved model file already exists.
retrain = False

In [None]:
model_path = os.path.join(OUTPUT_DIR, 'best_rf.joblib')

if not os.path.exists(model_path) or retrain:

    # Grow the forest incrementally: with warm_start=True each fit() call keeps
    # the trees already trained and only fits the newly requested ones.
    rf = RandomForestClassifier(n_estimators=1, warm_start=True, random_state=42)

    max_epochs = 300
    patience = 20             # epochs without improvement tolerated before stopping
    impatience_count = 0
    best_val_accuracy = -1.0  # below any real accuracy, so the first epoch always counts as best
    best_n_estimators = 0

    for epoch in range(max_epochs):
        # Train the Random Forest with one more tree.
        # (The counter starts at 1 and is incremented before the first fit,
        # so epoch k evaluates a forest of k + 1 trees.)
        rf.n_estimators += 1
        rf.fit(X_train, y_train)

        # Validate the model
        val_pred = rf.predict(X_val)
        val_accuracy = accuracy_score(y_val, val_pred)

        print(f"Epoch {epoch+1}, Validation Accuracy: {val_accuracy}")

        # Check if validation accuracy improved
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # Record the forest size instead of keeping a reference to `rf`:
            # `rf` keeps growing, so a plain reference would end up including
            # every tree added during the non-improving patience window.
            best_n_estimators = rf.n_estimators
            impatience_count = 0  # Reset impatience count
        else:
            impatience_count += 1

        # Check if to stop training
        if impatience_count >= patience:
            print("Stopping training due to lack of improvement.")
            break

    # Roll the forest back to its best validated size. Warm-start fitting only
    # appends trees, so the first `best_n_estimators` entries of estimators_
    # are exactly the forest that achieved best_val_accuracy.
    rf.estimators_ = rf.estimators_[:best_n_estimators]
    rf.n_estimators = best_n_estimators
    best_rf = rf

    # Save the classifier to a file
    joblib.dump(best_rf, model_path)

else:
    # Load the classifier from the file.
    # NOTE: joblib files are pickle-based; only load model files you trust.
    best_rf = joblib.load(model_path)
    print(f"Model loaded from file '{model_path}'")

# Evaluate the best Random Forest model on the test set
test_predictions = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy}")

## Testing and submission

In [None]:
# Function to split audio file into chunks of given duration
def split_audio(path, duration, sr):
    """Load an audio file and cut it into consecutive chunks of ``duration`` seconds.

    Parameters
    ----------
    path : str
        Audio file to load with ``librosa.load``.
    duration : int or float
        Chunk length in seconds. Fractional values are supported; the chunk
        length in samples is truncated to an integer.
    sr : int or None
        Sampling rate passed to ``librosa.load``.

    Returns
    -------
    list
        Slices of the loaded signal, in order. The last slice is shorter than
        the others when the signal length is not a multiple of the chunk size.

    Raises
    ------
    ValueError
        If ``duration`` results in a chunk size of zero or fewer samples.
    """
    sig, rate = librosa.load(path, sr=sr)
    # range() and slicing need an integer step; a float duration would otherwise fail.
    chunk_size = int(duration * rate)
    if chunk_size <= 0:
        raise ValueError(f"duration must give at least one sample per chunk, got {duration!r}")
    return [sig[start:start + chunk_size] for start in range(0, len(sig), chunk_size)]

In [None]:
# Collected per chunk: the submission row id and the matching feature vector.
# NOTE: X_test is rebound here; it previously held the held-out split features.
row_id_list = []
X_test = []

chunk_seconds = 5  # soundscapes are scored in 5-second windows

# Walk through the test files with a tqdm progress bar
for audio_file, file_number in tqdm(zip(test_file_list, test_number_list), total=len(test_file_list), desc='Processing test files'):
    path = os.path.join(TEST_AUDIO_DIR, audio_file)

    # Cut the recording into consecutive 5-second chunks
    audio_chunks = split_audio(path, duration=chunk_seconds, sr=SAMPLE_RATE)

    for chunk_number, chunk in enumerate(audio_chunks, start=1):
        # Row ids are keyed by the time (in seconds) at which the chunk ends
        chunk_end_time = chunk_number * chunk_seconds
        row_id_list.append(f"soundscape_{file_number}_{chunk_end_time}")
        X_test.append(extract_features(chunk))

X_test = np.array(X_test) # convert list of 1D arrays to 2D array

In [None]:
# Prediction probabilities: class probabilities from best_rf for the
# soundscape chunk features currently stored in X_test.
y_test_pred_proba = best_rf.predict_proba(X_test)

In [None]:
# The i-th probability row belongs to the i-th row id; with mismatched lengths
# pd.concat below would pad with NaN instead of failing.
assert len(row_id_list) == len(y_test_pred_proba), "row ids and predictions are misaligned"

results = pd.DataFrame({'row_id': row_id_list})

# predict_proba orders its columns by best_rf.classes_ (the encoded labels the
# model was fitted on), so name the columns from the model's own classes
# rather than assuming it saw every label known to the encoder.
result_probs = pd.DataFrame(
    y_test_pred_proba,
    columns=label_encoder.inverse_transform(best_rf.classes_),
)
results = pd.concat([results, result_probs], axis=1)

# Reorder the columns to have 'row_id' first, followed by list_species;
# any species the model has no column for is filled with probability 0.
columns_order = ['row_id'] + list_species
results = results.reindex(columns=columns_order).fillna(0)

# Preview only the first rows instead of rendering the whole table
results.head()


In [None]:
# Write the results table to submission.csv in the current working directory,
# leaving out the DataFrame index.
results.to_csv("submission.csv", index=False)