# Imports

In [1]:
import numpy as np
import pandas as pd

import psutil
from pathlib import Path
from tqdm import tqdm

from modules.preprocessing import AudioPreprocessor
from modules.feature_extraction import FeatureExtractor
from modules.pipelines import ModelPipeline
from modules.evaluate import PerformanceAnalyzer

from models.catboost import CatBoostModel
from models.lightgbm import LightGBMModel
from models.xgboost import XGBoostModel
from models.gmboost import GradientBoostingModel

from concurrent.futures import ThreadPoolExecutor, as_completed

import warnings
# NOTE(review): with no category or module filter this hides every warning for the rest of
# the session, including deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Config

In [2]:
# Project settings used further down: NUM_WORKERS sizes the thread pool,
# DATA_DIR locates the metadata TSV and AUDIO_PATH is the folder the clip names are joined to.
from config import run_config, NUM_WORKERS, DATA_DIR, AUDIO_PATH

# Apply the project configuration (implemented in the `config` module; its side effects
# are not visible from this notebook).
run_config()

# Load Dataset

In [3]:
# Read the labelled, filtered metadata table (tab-separated) from the data directory
# and preview its first row.
labeled_tsv = DATA_DIR / "filtered_data_labeled.tsv"
df = pd.read_csv(labeled_tsv, sep="\t")
df.head(1)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,label
0,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687170.mp3,He associated with the Formists.,2,1,fifties,female,us,3


In [4]:
# Point every clip at its location under AUDIO_PATH.
# Only the final path component is joined, so re-running this cell (which overwrites
# df['path'] in place) does not prepend AUDIO_PATH a second time.
# NOTE(review): assumes the TSV stores bare file names, as the preview row shows —
# confirm the column never contains sub-directories.
df['path'] = df['path'].apply(lambda clip: AUDIO_PATH / Path(clip).name)
df.head(1)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,label
0,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,data\audios\common_voice_en_19687170.mp3,He associated with the Formists.,2,1,fifties,female,us,3


# Features

In [5]:
from typing import Optional, Tuple, List

# === Parallel Processing ===
def process_sample(
    row: pd.Series,
    idx: int,
    mode: str,
    preprocessor: AudioPreprocessor,
    extractor: FeatureExtractor,
    force_update: bool
) -> Optional[Tuple[int, np.ndarray, str]]:
    """Turn one dataset row into an ``(idx, features, label)`` triple.

    Args:
        row: Record providing the ``'path'`` of the audio clip and its ``'label'``.
        idx: Key used for the preprocessed-audio cache and echoed back in the result.
        mode: Feature mode forwarded to ``extractor.extract``.
        preprocessor: Object that loads, preprocesses and caches audio.
        extractor: Object that converts the preprocessed signal into features.
        force_update: When true the preprocessed-audio cache is not consulted and
            the clip is loaded and preprocessed again.

    Returns:
        ``(idx, features, row['label'])``, or ``None`` when the audio could not be loaded.
    """
    # Consult the cache only when a refresh was not requested.
    signal: Optional[np.ndarray] = None
    if not force_update:
        signal = preprocessor.load_cached_preprocessed(idx)

    # Cache miss (or forced refresh): load from disk, preprocess and store the result.
    if signal is None:
        waveform: Optional[np.ndarray] = preprocessor.load_audio(row['path'])
        if waveform is None:
            return None
        signal = preprocessor.preprocess(waveform)
        preprocessor.cache_preprocessed(idx, signal, force_update)

    # Features are always extracted at a 16 kHz sample rate.
    return idx, extractor.extract(signal, sr=16000, mode=mode), row['label']


def process_batch(
    batch_df: pd.DataFrame,
    mode: str,
    preprocessor: AudioPreprocessor,
    extractor: FeatureExtractor,
    force_update: bool,
    offset: int = 0
) -> List[Tuple[int, np.ndarray, str]]:
    """Run ``process_sample`` over every row of one batch, in row order.

    Args:
        batch_df: Slice of the dataset; each row's index label is passed on as the sample key.
        mode: Feature mode forwarded to ``process_sample``.
        preprocessor: Audio preprocessor shared by all rows of the batch.
        extractor: Feature extractor shared by all rows of the batch.
        force_update: Forwarded to ``process_sample``.
        offset: Only used to label this batch's progress bar.

    Returns:
        The truthy results of ``process_sample``; rows for which it returned ``None`` are omitted.
    """
    progress = tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Batch {offset}", leave=False)
    outcomes = (
        process_sample(sample, label_idx, mode, preprocessor, extractor, force_update)
        for label_idx, sample in progress
    )
    # Drop samples that produced nothing (audio failed to load).
    return [outcome for outcome in outcomes if outcome]

In [6]:
def prepare_features_parallel(
    df: pd.DataFrame,
    mode: str = "traditional",
    force_update_preprocessing: bool = False,
    force_update_features: bool = False,
    batch_size: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Build the feature matrix and label vector for ``df``, reusing cached features when possible.

    The frame is split into batches that are processed by ``process_batch`` on a
    thread pool of ``NUM_WORKERS`` workers. Results are collected by row index and
    returned in ascending index order.

    Args:
        df: Dataset rows; each row is handed to ``process_batch`` / ``process_sample``.
        mode: Feature mode passed to the extractor; ``"traditional"`` uses the smaller
            per-sample memory estimate when sizing batches.
        force_update_preprocessing: Forwarded to ``process_batch`` as its ``force_update`` flag.
        force_update_features: When true, cached features are ignored and recomputed.
        batch_size: Rows per batch. When falsy it is derived from total system memory
            and the worker count.

    Returns:
        ``(X, y)``: either the cached pair, or freshly built arrays of the extracted
        features and their labels. Both are empty if no sample could be processed.
    """
    print(f"🔄 Preparing features in {mode} mode...")
    extractor = FeatureExtractor()
    X_cached, y_cached = extractor.load_cached_features(mode)
    if X_cached is not None and not force_update_features:
        return X_cached, y_cached

    print("🔄 Loading and preprocessing audio...")
    preprocessor = AudioPreprocessor()
    features_dict, labels_dict = {}, {}

    # Auto-select the batch size from the machine's total RAM: budget 40% of it and
    # divide by a rough per-sample memory estimate for the chosen mode.
    total_memory_gb = psutil.virtual_memory().total / (1024 ** 3)
    est_mem_per_sample = 0.01 if mode == "traditional" else 0.2
    est_batch_size = max(10, int((total_memory_gb * 0.4) / est_mem_per_sample))
    batch_size = batch_size or min(est_batch_size, len(df) // NUM_WORKERS)
    if total_memory_gb < 2:
        print("⚠️ Warning: Low memory detected. Reducing batch size to avoid OOM errors.")
        batch_size = min(batch_size, 10)
    # len(df) // NUM_WORKERS is 0 whenever there are fewer rows than workers (or df is
    # empty); a zero step would make range() below raise ValueError, so clamp to 1.
    batch_size = max(1, batch_size)
    print(f"🧠 Auto-selected batch size: {batch_size} (Estimated memory per sample: {est_mem_per_sample:.2f} GB, Total RAM: {total_memory_gb:.2f} GB)")

    batches = [df.iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]
    print(f"🔄 Total batches: {len(batches)}")

    print("📦 Processing batches:")
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        futures = {
            executor.submit(process_batch, batch, mode, preprocessor, extractor, force_update_preprocessing, i): i
            for i, batch in enumerate(batches)
        }
        # Batches finish in arbitrary order; keying by row index lets us restore order afterwards.
        for future in tqdm(as_completed(futures), total=len(futures), desc="📊 Batches Done"):
            for idx, feat, label in future.result():
                features_dict[idx] = feat
                labels_dict[idx] = label

    print("🔄 Finished processing batches.")

    sorted_indices = sorted(features_dict)
    X = np.array([features_dict[i] for i in sorted_indices])
    y = np.array([labels_dict[i] for i in sorted_indices])
    if sorted_indices:
        extractor.cache_features(X, y, mode=mode, force_update=force_update_features)
    else:
        # Do not persist an empty result: it would be returned as a valid cache hit next time.
        print("⚠️ Warning: No features were extracted; skipping feature cache update.")
    return X, y

In [7]:
# Build the "traditional" feature matrix and labels, reusing both caches when present.
# Pass batch_size=<int> (e.g. 250) to override the automatically selected batch size.
X, y = prepare_features_parallel(
    df,
    mode="traditional",
    force_update_features=False,
    force_update_preprocessing=False,
)

🔄 Preparing features in traditional mode...


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Sanity check: show the feature-matrix and label-vector shapes (first dimensions should agree).
X.shape, y.shape

((172158, 147), (172158,))

# Models

In [10]:
# LightGBM
model = ModelPipeline(model=LightGBMModel)
metrics = model.train(X, y, use_optuna=True, n_trials=10)
metrics

[I 2025-04-26 13:06:26,450] A new study created in memory with name: no-name-a3e01ce7-1125-444a-894b-a497387e6cac
[I 2025-04-26 13:07:05,744] Trial 9 finished with value: 0.7918796468401487 and parameters: {'learning_rate': 0.02351149351726154, 'num_leaves': 102, 'max_depth': 4}. Best is trial 9 with value: 0.7918796468401487.
[I 2025-04-26 13:07:29,884] Trial 0 finished with value: 0.6922049256505576 and parameters: {'learning_rate': 0.0015588977578208834, 'num_leaves': 23, 'max_depth': 8}. Best is trial 9 with value: 0.7918796468401487.
[I 2025-04-26 13:07:30,521] Trial 7 finished with value: 0.774860594795539 and parameters: {'learning_rate': 0.010801054674113093, 'num_leaves': 48, 'max_depth': 5}. Best is trial 9 with value: 0.7918796468401487.
[I 2025-04-26 13:07:44,459] Trial 1 finished with value: 0.8702369888475836 and parameters: {'learning_rate': 0.18408099003672337, 'num_leaves': 36, 'max_depth': 8}. Best is trial 1 with value: 0.8702369888475836.
[I 2025-04-26 13:08:26,083]

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     11917\n           1       1.00      1.00      1.00      1920\n           2       1.00      0.99      0.99      1744\n           3       1.00      1.00      1.00      1635\n\n    accuracy                           1.00     17216\n   macro avg       1.00      1.00      1.00     17216\nweighted avg       1.00      1.00      1.00     17216\n'

In [11]:
# XGBoost
model = ModelPipeline(model=XGBoostModel)
metrics = model.train(X, y, use_optuna=True, n_trials=10)
metrics

[I 2025-04-26 13:09:36,602] A new study created in memory with name: no-name-59532b30-dfeb-4fb8-8b2a-05200871f29c
[I 2025-04-26 13:14:05,651] Trial 4 finished with value: 0.7794493494423792 and parameters: {'learning_rate': 0.0016911611836628866, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.912039132908897, 'colsample_bytree': 0.7886613496002368}. Best is trial 4 with value: 0.7794493494423792.
[I 2025-04-26 13:14:06,066] Trial 2 finished with value: 0.8363150557620818 and parameters: {'learning_rate': 0.057761785377502387, 'max_depth': 7, 'n_estimators': 106, 'subsample': 0.6532688090324347, 'colsample_bytree': 0.8209852684032211}. Best is trial 2 with value: 0.8363150557620818.
[I 2025-04-26 13:25:57,601] Trial 9 finished with value: 0.7950743494423792 and parameters: {'learning_rate': 0.006273522111194924, 'max_depth': 4, 'n_estimators': 763, 'subsample': 0.7495534446454752, 'colsample_bytree': 0.6465407785925488}. Best is trial 2 with value: 0.8363150557620818.
[I 2025-04-26

'              precision    recall  f1-score   support\n\n           0       0.97      1.00      0.98     11917\n           1       0.99      0.99      0.99      1920\n           2       1.00      0.80      0.89      1744\n           3       0.99      0.98      0.99      1635\n\n    accuracy                           0.98     17216\n   macro avg       0.99      0.94      0.96     17216\nweighted avg       0.98      0.98      0.97     17216\n'

In [9]:
# CatBoost
model = ModelPipeline(model=CatBoostModel)
metrics = model.train(X, y, use_optuna=True, n_trials=10)
metrics

[I 2025-04-30 16:03:33,723] A new study created in memory with name: no-name-7be00644-8876-4fa2-864f-6b48f99e1d33
[I 2025-04-30 16:03:57,990] Trial 0 finished with value: 0.8123257434944238 and parameters: {'iterations': 617, 'learning_rate': 0.007630827081460776, 'depth': 8, 'l2_leaf_reg': 0.009735441086428834}. Best is trial 0 with value: 0.8123257434944238.
[I 2025-04-30 16:04:08,591] Trial 1 finished with value: 0.7638824349442379 and parameters: {'iterations': 933, 'learning_rate': 0.037320437338869826, 'depth': 10, 'l2_leaf_reg': 1.241515815615068e-06}. Best is trial 0 with value: 0.8123257434944238.
[I 2025-04-30 16:04:15,004] Trial 2 finished with value: 0.7466310408921933 and parameters: {'iterations': 604, 'learning_rate': 0.003970742882116975, 'depth': 5, 'l2_leaf_reg': 1.4860765108197247e-07}. Best is trial 0 with value: 0.8123257434944238.
[I 2025-04-30 16:04:33,596] Trial 3 finished with value: 0.8805762081784386 and parameters: {'iterations': 934, 'learning_rate': 0.0887

'              precision    recall  f1-score   support\n\n           0       0.91      0.99      0.95     11917\n           1       0.90      0.91      0.90      1920\n           2       0.96      0.46      0.62      1744\n           3       0.91      0.83      0.87      1635\n\n    accuracy                           0.91     17216\n   macro avg       0.92      0.80      0.83     17216\nweighted avg       0.91      0.91      0.90     17216\n'

LightGBM
                  precision    recall  f1-score   support

           0       1.00      1.00      1.00     11917
           1       1.00      1.00      1.00      1920
           2       1.00      0.99      0.99      1744
           3       1.00      1.00      1.00      1635

    accuracy                           1.00     17216
   macro avg       1.00      1.00      1.00     17216
weighted avg       1.00      1.00      1.00     17216

XGBoost
                  precision    recall  f1-score   support

           0       0.97      1.00      0.98     11917
           1       0.99      0.99      0.99      1920
           2       1.00      0.80      0.89      1744
           3       0.99      0.98      0.99      1635

    accuracy                           0.98     17216
   macro avg       0.99      0.94      0.96     17216
weighted avg       0.98      0.98      0.97     17216

CatBoost
                  precision    recall  f1-score   support

           0       0.91      0.99      0.95     11917
           1       0.90      0.91      0.90      1920
           2       0.96      0.46      0.62      1744
           3       0.91      0.83      0.87      1635

    accuracy                           0.91     17216
   macro avg       0.92      0.80      0.83     17216
weighted avg       0.91      0.91      0.90     17216

# Test Inference

In [None]:
# === Batch Inference Utility ===
def run_batch_inference(model, input_folder, output_path, sr=16000, feature_mode="traditional", pattern="*.wav"):
    """Predict a label for every matching audio file under ``input_folder`` and save the results as CSV.

    Args:
        model: Object exposing ``predict``; called with a single-row 2-D feature array.
        input_folder: Directory searched recursively for audio files.
        output_path: Destination of the CSV (columns ``file`` and ``prediction``).
        sr: Sample rate passed to both the audio loader and the feature extractor.
        feature_mode: Feature mode passed to the extractor.
        pattern: Glob pattern selecting the files to process (default ``"*.wav"``).

    Returns:
        The DataFrame that was written to ``output_path``. Files whose audio could not
        be loaded or preprocessed are left out.
    """
    extractor = FeatureExtractor()
    preprocessor = AudioPreprocessor()
    results = []

    # Sorted so the CSV row order is reproducible; rglob yields entries in arbitrary order.
    for file in sorted(Path(input_folder).rglob(pattern)):
        # load_audio may return None for unreadable files (process_sample guards the same
        # way), so check before handing the signal to preprocess().
        raw = preprocessor.load_audio(str(file), sr=sr)
        if raw is None:
            continue
        y = preprocessor.preprocess(raw)
        if y is None:
            continue
        x = extractor.extract(y, sr=sr, mode=feature_mode).reshape(1, -1)
        pred = model.predict(x)[0]
        results.append({"file": file.name, "prediction": pred})

    # Explicit columns keep the CSV header intact even when no file was processed.
    predictions = pd.DataFrame(results, columns=["file", "prediction"])
    predictions.to_csv(output_path, index=False)
    print(f"✅ Batch inference saved to {output_path}")
    return predictions
