# Imports

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import noisereduce as nr

# Config

In [None]:
# Project-local configuration hook, invoked once before the rest of the
# notebook runs.
# NOTE(review): run_config is defined in config.py, which is not visible
# here — confirm what it sets up (paths, seeds, logging, ...).
from config import run_config

run_config()

# Helper Functions

In [None]:
def load_audio(path, sr=16000):
    """Load an audio file with librosa, returning None when that fails.

    Args:
        path: Location of the audio file, passed straight to ``librosa.load``.
        sr: Sample rate handed to ``librosa.load``.

    Returns:
        The sample array produced by ``librosa.load``, or None if any
        exception was raised while loading (the error is printed, not
        re-raised, so one bad file does not stop a batch run).
    """
    try:
        samples, _rate = librosa.load(path, sr=sr)
    except Exception as err:
        print(f"Error loading {path}: {err}")
        return None
    else:
        return samples

In [None]:
def preprocess_audio(y, sr=16000, duration=5, top_db=20):
    """Clean a waveform and force it to a fixed number of samples.

    Steps, in order: remove silent stretches, normalize the volume, apply
    noise reduction, then truncate or zero-pad (at the end) so the result
    is exactly ``int(sr * duration)`` samples long.

    Args:
        y: Array of audio samples (assumed 1-D / mono), or None when
            loading failed upstream.
        sr: Sample rate of ``y``; used for noise reduction and to convert
            ``duration`` into a sample count.
        duration: Target clip length in seconds.
        top_db: Silence threshold forwarded to ``librosa.effects.split``.

    Returns:
        The processed array of ``int(sr * duration)`` samples, or None when
        ``y`` is None, empty, or contains no non-silent interval.
    """
    # Nothing to process: signal failure the same way a failed load does so
    # callers can drop the row instead of crashing mid-batch.
    if y is None or np.size(y) == 0:
        return None

    # Remove silence
    intervals = librosa.effects.split(y, top_db=top_db)
    if len(intervals) == 0:
        # np.concatenate raises ValueError on an empty sequence.
        return None
    y_trimmed = np.concatenate([y[start:end] for start, end in intervals])

    # Normalize volume
    y_norm = librosa.util.normalize(y_trimmed)

    # Noise reduction
    y_denoised = nr.reduce_noise(y=y_norm, sr=sr)

    # Truncate or right-pad with zeros to the fixed target length
    desired_length = int(sr * duration)
    if len(y_denoised) >= desired_length:
        return y_denoised[:desired_length]
    return np.pad(y_denoised, (0, desired_length - len(y_denoised)))

# Preprocessing

In [None]:
# Read the labeled metadata table; the file is tab-separated.
# Later cells read its 'audio' column (presumably file paths — verify against
# the TSV) and its 'label' column.
df = pd.read_csv("data/filtered_data_labeled.tsv", sep='\t')
# Preview the first rows (rendered as the cell output in a notebook).
df.head()

In [None]:
# Run every entry of the 'audio' column through load_audio -> preprocess_audio.
# Clips that cannot be loaded come back as None.
df['audio_data'] = df['audio'].apply(
    lambda clip_path: preprocess_audio(load_audio(clip_path))
)

# Keep only the rows that produced audio and renumber the index from 0.
df = df.loc[df['audio_data'].notna()].reset_index(drop=True)

In [None]:
# Summarise each clip as a single 40-value vector: compute 40 MFCCs and
# average them along axis 1 of librosa's output. Swap in another feature
# extractor here if needed.
df['features'] = df['audio_data'].apply(
    lambda clip: np.mean(librosa.feature.mfcc(y=clip, sr=16000, n_mfcc=40), axis=1)
)

# Feature matrix (one row per clip) and the matching labels.
X = np.stack(df['features'].tolist())
y = df['label'].values

In [None]:
# Train/val/test split: hold out 15% of the samples, then cut that hold-out in
# half to get the validation and test sets (roughly 85% / 7.5% / 7.5%).
# Both calls use random_state=0 so the split is reproducible.
# train_test_split was never imported anywhere in this notebook, so this cell
# raised NameError; import it here so the cell runs on its own.
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.15, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)