In [1]:
kaggle = True
submission = True # change to True before submitting

## Imports

In [2]:
import os
import numpy as np
import pandas as pd
# from tqdm.notebook import tqdm # loading bar
from tqdm import tqdm # loading bar

import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

## Training

In [3]:
if kaggle:
    DATA_DIR = '../input/birdclef-2024/'
else:
    DATA_DIR = "../../data/raw" # local work
    
TRAIN_AUDIO_DIR = os.path.join(DATA_DIR, "train_audio/")

if submission:
    TEST_AUDIO_DIR = os.path.join(DATA_DIR,"test_soundscapes/")
    
else:
    TEST_AUDIO_DIR = os.path.join(DATA_DIR,"unlabeled_soundscapes/")

train_csv_path = os.path.join(DATA_DIR, "train_metadata.csv")

In [4]:
train_df = pd.read_csv(train_csv_path)

# Add complete filepath
train_df['filepath'] = train_df.apply(lambda row: os.path.join(TRAIN_AUDIO_DIR, row['filename']), axis=1)

# Filter out large files
train_df['filesize'] = train_df.apply(lambda row: os.path.getsize(row['filepath']), axis=1)
train_df = train_df[train_df['filesize'] < 1e6]

In [5]:
list_species = sorted(train_df.primary_label.unique())

In [6]:
random_state = 43

if submission:
    num_classes_to_keep = len(list_species)
    fraction_to_keep = 0.5
else:
    num_classes_to_keep = 10
    fraction_to_keep = 0.05

# Calculate the minimum number of instances to keep for classes with fewer labels
min_count = 50

# Calculate weights to balance the classes
class_weights = train_df['primary_label'].value_counts()

# Select the top classes to keep based on their frequencies
top_classes = class_weights.head(num_classes_to_keep).index.tolist()

# Initialize an empty DataFrame to store the sampled subset
train_subset_df = pd.DataFrame()

# Iterate over each class
for label, count in class_weights.items():
    # Check if the class is in the top classes to keep
    if label in top_classes:
        # Check if the class has fewer labels than the minimum count
        if count < min_count:
            # Keep all instances for classes with fewer labels
            subset = train_df[train_df['primary_label'] == label]
        else:
            # Randomly sample a fraction for classes with more labels
            fraction = min(fraction_to_keep, min_count / count)  # Adjust fraction if necessary
            subset = train_df[train_df['primary_label'] == label].sample(frac=fraction, random_state=random_state)
        # Append the subset to the final DataFrame
        train_subset_df = pd.concat([train_subset_df, subset])

# Shuffle the final DataFrame to mix the classes
train_subset_df = train_subset_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [7]:
train_subset_df['primary_label'].value_counts()

primary_label
eurcoo     50
grewar3    50
wemhar1    50
comgre     50
labcro1    50
           ..
blaeag1     6
wynlau1     6
integr      5
asiope1     5
niwpig1     5
Name: count, Length: 182, dtype: int64

In [8]:
if submission: # No train val split
    X_train_files = train_subset_df.filepath
    y_train = train_subset_df.primary_label
else:
    # Train val split
    train_train_df, val_df = train_test_split(train_subset_df, test_size=0.3, stratify = train_subset_df.primary_label, random_state=random_state) 
    X_train_files = train_train_df.filepath
    X_val_files = val_df.filepath

    y_train = train_train_df.primary_label
    y_val = val_df.primary_label

### Features

In [9]:
def extract_features(audio_data, sample_rate=32000, mfcc=True, chroma=True, mel=True):
    result = np.array([])
    if mfcc: # Mel-Frequency Cepstral Coefficients
        mfccs = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(y=audio_data, sr=sample_rate).T,axis=0)
        result = np.hstack((result, chroma))
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=audio_data, sr=sample_rate).T,axis=0)
        result = np.hstack((result, mel))
    
    return result

In [10]:
def extract_features_filepaths(X_files, sample_rate=32000):
    features = []
    
    for filepath in tqdm(X_files, desc='Processing files', total=len(X_files)):
        # Process data with tqdm
        audio_data, _ = librosa.load(filepath, sr=sample_rate)
        audio_features = extract_features(audio_data, sample_rate)

        # Append features and label
        features.append(audio_features)
            
    X = np.array(features)  
    
    return X

In [11]:
X_train = extract_features_filepaths(X_train_files)

  return pitch_tuning(
Processing files: 100%|██████████| 6615/6615 [26:52<00:00,  4.10it/s]


In [12]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

In [13]:
# X_val = extract_features_filepaths(X_val_files)
# y_val_encoded = label_encoder.transform(y_val)

### Model

In [14]:
# Train the classifier with the best parameters
best_classifier = RandomForestClassifier(max_depth=20, min_samples_leaf=2, n_estimators=300, random_state=42)
best_classifier.fit(X_train, y_train_encoded)

In [15]:
# y_val_pred_proba = best_classifier.predict_proba(X_val)

## Testing and submission

In [16]:
def extract_numbers(filename):
    filename = filename.split('.')[0] # remove extension
    
    split = filename.split('_')
    
    if len(split) > 1:
        return split[1]
    elif len(split) == 1:
        return split[0]

In [17]:
# First, load list of audio files by parsing the test_soundscape folder.
test_file_list = sorted(os.listdir(TEST_AUDIO_DIR))
test_file_list = [file for file in test_file_list if file.endswith('.ogg')] # filter only ogg files

if len(test_file_list) == 0:  # for debugging purposes when not submitting
    TEST_AUDIO_DIR = os.path.join(DATA_DIR,"unlabeled_soundscapes/")
    test_file_list = sorted(os.listdir(TEST_AUDIO_DIR))
    test_file_list = [file for file in test_file_list if file.endswith('.ogg')] # filter only ogg files
    test_file_list = test_file_list[:5] # take only 5 elements to go faster on debugging

test_number_list = [extract_numbers(file) for file in test_file_list]
    
print(f'Number of test files: {len(test_file_list)}')

Number of test files: 5


In [18]:
# Function to split audio file into chunks of given duration
def split_audio(path, duration, sr):
    sig, rate = librosa.load(path, sr=sr)
    chunk_size = duration * rate
    chunks = [sig[i:i+chunk_size] for i in range(0, len(sig), chunk_size)]
    return chunks

In [19]:
# This is where we will store our results
row_id_list = []
X_test = []

# Process audio files and make predictions with tqdm progress bar
for audio_file, file_number in tqdm(zip(test_file_list, test_number_list), total=len(test_file_list), desc='Processing test files'):
    path = os.path.join(TEST_AUDIO_DIR, audio_file)
        
    # Split audio file into 5-second chunks
    audio_chunks = split_audio(path, duration=5, sr=32000)
    
    for i in range(48): # assuming files of 4 minutes = 240 seconds
    # for i, chunk in enumerate(audio_chunks):
        chunk = audio_chunks[i]
        chunk_end_time = (i + 1) * 5
        row_id = f"soundscape_{file_number}_{chunk_end_time}"
        row_id_list.append(row_id)
        
        features = extract_features(chunk)
        
        X_test.append(features)

Processing test files: 100%|██████████| 5/5 [00:11<00:00,  2.32s/it]


In [20]:
X_test = np.array(X_test) # convert list of 1D arrays to 2D array

if len(X_test) > 0:
    y_pred_proba = best_classifier.predict_proba(X_test)
else:
    y_pred_proba = [0 for class_ in label_encoder.classes_] # just for debugging

In [21]:
results = pd.DataFrame({'row_id':row_id_list})
results[label_encoder.classes_] = y_pred_proba

# Update the order of the columns to be ordered, and to contain all species
columns_order = ['row_id'] + list_species

results = results.reindex(columns=columns_order).fillna(0)
results

  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_encoder.classes_] = y_pred_proba
  results[label_enco

Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whbwoo2,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1
0,soundscape_1000170626_5,0.003522,0.014063,0.007287,0.004493,0.002801,0.000398,0.002887,0.003057,0.001515,...,0.002275,0.001502,0.005354,0.006488,0.003125,0.043256,0.001793,0.001186,0.000993,0.002770
1,soundscape_1000170626_10,0.004504,0.016342,0.006306,0.005184,0.002700,0.000282,0.003746,0.003297,0.001310,...,0.002079,0.001343,0.005722,0.006560,0.002991,0.047011,0.001760,0.001184,0.000922,0.004085
2,soundscape_1000170626_15,0.003797,0.006256,0.009692,0.004934,0.003045,0.000520,0.003754,0.003779,0.001646,...,0.002446,0.001597,0.006192,0.003488,0.002805,0.030601,0.002155,0.001180,0.000906,0.004471
3,soundscape_1000170626_20,0.003871,0.008815,0.005320,0.005506,0.003091,0.000325,0.003213,0.004353,0.001644,...,0.002293,0.001596,0.006329,0.004611,0.003217,0.032571,0.002249,0.001323,0.001093,0.004561
4,soundscape_1000170626_25,0.003751,0.006400,0.009537,0.003829,0.002991,0.000310,0.003582,0.004513,0.001564,...,0.002325,0.001517,0.006196,0.003595,0.002965,0.039973,0.002356,0.001179,0.000906,0.004596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,soundscape_1000450112_220,0.006019,0.004152,0.005088,0.003037,0.003189,0.004419,0.002925,0.001760,0.001321,...,0.006437,0.006090,0.005595,0.005764,0.006686,0.012849,0.001482,0.001302,0.001378,0.004145
236,soundscape_1000450112_225,0.006126,0.005980,0.005381,0.002514,0.003258,0.004439,0.002801,0.001893,0.001335,...,0.005600,0.006335,0.005407,0.002706,0.006509,0.011552,0.001284,0.001064,0.001549,0.004105
237,soundscape_1000450112_230,0.006413,0.004018,0.005351,0.002567,0.003249,0.004431,0.002854,0.001914,0.001563,...,0.005558,0.006351,0.005500,0.002433,0.006769,0.011246,0.001221,0.001033,0.001630,0.004062
238,soundscape_1000450112_235,0.006401,0.003869,0.005128,0.002459,0.003222,0.003320,0.002704,0.001883,0.001296,...,0.005407,0.006365,0.005437,0.002634,0.008286,0.010563,0.001243,0.001060,0.001533,0.004041


In [22]:
# Convert our results to csv
results.to_csv("submission.csv", index=False)  