# Imports

In [1]:
import numpy as np
import pandas as pd

import psutil
from pathlib import Path
from tqdm import tqdm

from modules.preprocessing import AudioPreprocessor
from modules.feature_extraction import FeatureExtractor
from modules.pipelines import ModelPipeline
# from modules.evaluate import PerformanceAnalyzer

from models.catboost import CatBoostModel
from models.lightgbm import LightGBMModel
from models.xgboost import XGBoostModel
# from models.gmboost import GradientBoostingModel

from concurrent.futures import ThreadPoolExecutor, as_completed

import warnings
warnings.filterwarnings("ignore")

# Config

In [2]:
# Worker count and data locations are taken from the project's `config` module.
from config import run_config, NUM_WORKERS, DATA_DIR, AUDIO_PATH

# NOTE(review): run_config() is opaque from this notebook; presumably it applies
# global runtime settings — confirm what it changes before relying on it.
run_config()

# Load Dataset

In [3]:
# Read the labeled, tab-separated metadata table from DATA_DIR.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which looks
# like an index that was written out when the TSV was saved — confirm whether it
# should be dropped. The frame's index labels are later passed as `idx` to the
# preprocessing cache, so changing the index here would change those keys.
df = pd.read_csv(DATA_DIR / "filtered_data_labeled.tsv", sep='\t')
df.head(1)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,label
0,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,common_voice_en_19687170.mp3,He associated with the Formists.,2,1,fifties,female,us,3


In [4]:
# Turn each clip file name into a full path under AUDIO_PATH.
# Entries that are already Path objects (i.e. this cell has run before) are left
# untouched, so re-running the cell does not prefix AUDIO_PATH a second time.
df['path'] = df['path'].apply(lambda x: x if isinstance(x, Path) else AUDIO_PATH / x)
df.head(1)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,label
0,5001d9a0d3f8f5aae6f386f70713b2d5d046edc7ba0068...,data\audios\common_voice_en_19687170.mp3,He associated with the Formists.,2,1,fifties,female,us,3


# Features

In [5]:
from typing import Optional, Tuple, List

# === Parallel Processing ===
def process_sample(
    row: pd.Series,
    idx: int,
    mode: str,
    preprocessor: AudioPreprocessor,
    extractor: FeatureExtractor,
    force_update: bool
) -> Optional[Tuple[int, np.ndarray, str]]:
    """Produce the feature vector for one dataset row.

    The preprocessed waveform is taken from the preprocessor's cache unless
    ``force_update`` is set or nothing is cached for ``idx``; in that case the
    audio at ``row['path']`` is loaded, preprocessed and written to the cache.

    Returns:
        ``(idx, features, row['label'])``, or ``None`` when the audio could not
        be loaded.
    """
    # A forced update skips the cache lookup entirely.
    cached = None if force_update else preprocessor.load_cached_preprocessed(idx)

    if cached is not None:
        waveform = cached
    else:
        raw_audio: Optional[np.ndarray] = preprocessor.load_audio(row['path'])
        if raw_audio is None:
            # Unreadable audio: let the caller drop this sample.
            return None
        waveform = preprocessor.preprocess(raw_audio)
        preprocessor.cache_preprocessed(idx, waveform, force_update)

    features = extractor.extract(waveform, sr=16000, mode=mode)
    return idx, features, row['label']


def process_batch(
    batch_df: pd.DataFrame,
    mode: str,
    preprocessor: AudioPreprocessor,
    extractor: FeatureExtractor,
    force_update: bool,
    offset: int = 0
) -> List[Tuple[int, np.ndarray, str]]:
    """Run ``process_sample`` over every row of ``batch_df``.

    Each row's index label is passed on as the sample index. ``offset`` is only
    used to label the progress bar. Rows for which ``process_sample`` yields a
    falsy result (e.g. ``None``) are left out of the returned list.
    """
    progress = tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Batch {offset}", leave=False)
    outcomes = (
        process_sample(row, label, mode, preprocessor, extractor, force_update)
        for label, row in progress
    )
    return [outcome for outcome in outcomes if outcome]

In [6]:
def prepare_features_parallel(df, mode="traditional", version: Optional[int] = None, force_update_preprocessing=False, force_update_features=False, batch_size=None):
    """Return features and labels for ``df``, using the feature cache when possible.

    If the extractor has cached features for ``mode``/``version`` and
    ``force_update_features`` is false, whatever the extractor cached is
    returned unchanged. Otherwise ``df`` is split into batches, each batch is
    run through ``process_batch`` on a thread pool of ``NUM_WORKERS`` workers,
    and the combined result is cached under ``version + 1``.

    Args:
        df: Table with the columns read by ``process_sample`` (``path``, ``label``).
        mode: Feature mode forwarded to the extractor.
        version: Cached feature version to look up. Defaults to the latest
            version the extractor reports for ``mode``.
        force_update_preprocessing: Forwarded to ``process_batch`` as ``force_update``.
        force_update_features: Recompute even when cached features exist.
        batch_size: Rows per batch. When falsy, a size is derived from total
            system RAM and ``len(df) // NUM_WORKERS``. Always clamped to >= 1.

    Returns:
        ``(X, y)``. When recomputed, both are NumPy arrays ordered by ascending
        DataFrame index label; rows that ``process_batch`` dropped are absent.
    """
    print(f"🔄 Preparing features in {mode} mode...")
    extractor = FeatureExtractor()
    # Get current version of features
    if version is None:
        version = extractor.get_latest_version(mode)
    X_cached, y_cached = extractor.load_cached_features(mode, version=version)
    if X_cached is not None and not force_update_features:
        return X_cached, y_cached

    print("🔄 Loading and preprocessing audio...")
    preprocessor = AudioPreprocessor()
    features_dict, labels_dict = {}, {}

    # Auto-select batch size based on total system memory
    total_memory_gb = psutil.virtual_memory().total / (1024 ** 3)
    est_mem_per_sample = 0.01 if mode == "traditional" else 0.2
    est_batch_size = max(10, int((total_memory_gb * 0.4) / est_mem_per_sample))
    batch_size = batch_size or min(est_batch_size, len(df) // NUM_WORKERS)
    if total_memory_gb < 2:
        print("⚠️ Warning: Low memory detected. Reducing batch size to avoid OOM errors.")
        batch_size = min(batch_size, 10)
    # len(df) // NUM_WORKERS is 0 when there are fewer rows than workers, and a
    # zero step would make the range() below raise ValueError.
    batch_size = max(1, batch_size)
    print(f"🧠 Auto-selected batch size: {batch_size} (Estimated memory per sample: {est_mem_per_sample:.2f} GB, Total RAM: {total_memory_gb:.2f} GB)")

    batches = [df.iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]
    print(f"🔄 Total batches: {len(batches)}")

    print("📦 Processing batches:")
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        futures = {
            executor.submit(process_batch, batch, mode, preprocessor, extractor, force_update_preprocessing, i): i
            for i, batch in enumerate(batches)
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="📊 Batches Done"):
            # result() re-raises any exception raised inside the worker.
            for idx, feat, label in future.result():
                features_dict[idx] = feat
                labels_dict[idx] = label

    print("🔄 Finished processing batches.")
    # for i in range(len(batches)): extractor.remove_cached_features(mode, index=i)

    # Batches complete in arbitrary order; restore a stable order by index label.
    sorted_indices = sorted(features_dict.keys())
    X = np.array([features_dict[i] for i in sorted_indices])
    y = np.array([labels_dict[i] for i in sorted_indices])

    if not sorted_indices:
        # Do not write an empty result to the cache as a new feature version.
        print("⚠️ Warning: No features were extracted; skipping feature cache update.")
        return X, y

    # NOTE(review): assumes `version` is an int here, i.e. get_latest_version()
    # never returns None — confirm for the "no cache yet" case.
    extractor.cache_features(X, y, version=version + 1, mode=mode, force_update=force_update_features)
    return X, y

In [7]:
# Build (or load cached) traditional-mode features.
# Pass batch_size=250 to override the automatic batch sizing.
X, y = prepare_features_parallel(
    df,
    mode="traditional",
    force_update_preprocessing=False,
    force_update_features=False,
)

🔄 Preparing features in traditional mode...


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Sanity check: display the shapes of the feature matrix and the label vector.
X.shape, y.shape

((172158, 147), (172158,))

# Models

In [9]:
# LightGBM
# Optuna search over 10 trials; train_size/val_size are passed explicitly here
# (presumably fractions of the dataset — confirm against ModelPipeline.train).
model = ModelPipeline(model=LightGBMModel)
metrics = model.train(X, y, use_optuna=True, n_trials=10, train_size=0.85, val_size=0.1)
# `metrics` is a multi-line text report; print it so the table renders with real
# line breaks instead of a single-line repr full of "\n" escapes.
print(metrics)

[I 2025-05-05 04:48:51,817] A new study created in memory with name: no-name-f4193e22-865c-4b27-ba84-e3dfaecff8cf
[I 2025-05-05 04:50:42,472] Trial 1 finished with value: 0.828125 and parameters: {'learning_rate': 0.2196257654306618, 'num_leaves': 36, 'max_depth': 12}. Best is trial 1 with value: 0.828125.
[I 2025-05-05 04:50:59,335] Trial 4 finished with value: 0.6585153345724907 and parameters: {'learning_rate': 0.010608566573098189, 'num_leaves': 38, 'max_depth': 11}. Best is trial 1 with value: 0.828125.
[I 2025-05-05 04:51:05,096] Trial 7 finished with value: 0.7166589219330854 and parameters: {'learning_rate': 0.024830550597483316, 'num_leaves': 43, 'max_depth': 12}. Best is trial 1 with value: 0.828125.
[I 2025-05-05 04:51:51,408] Trial 0 finished with value: 0.8427044609665427 and parameters: {'learning_rate': 0.11599343496127819, 'num_leaves': 71, 'max_depth': 12}. Best is trial 0 with value: 0.8427044609665427.
[I 2025-05-05 04:51:52,721] Trial 8 finished with value: 0.703705

'              precision    recall  f1-score   support\n\n           0       0.94      0.90      0.92      5975\n           1       0.81      0.89      0.85       989\n           2       0.61      0.71      0.66       874\n           3       0.78      0.82      0.80       770\n\n    accuracy                           0.87      8608\n   macro avg       0.79      0.83      0.81      8608\nweighted avg       0.88      0.87      0.88      8608\n'

In [None]:
# XGBoost
# Optuna search over 10 trials with ModelPipeline's default data split.
model = ModelPipeline(model=XGBoostModel)
metrics = model.train(X, y, use_optuna=True, n_trials=10)
# `metrics` is a multi-line text report; print it so the table renders with real
# line breaks instead of a single-line repr full of "\n" escapes.
print(metrics)

[I 2025-05-05 03:11:53,846] A new study created in memory with name: no-name-e90c4890-9f6e-4562-a646-940736c415da
[I 2025-05-05 03:13:38,816] Trial 1 finished with value: 0.8283573420074349 and parameters: {'learning_rate': 0.03153289111783931, 'max_depth': 8, 'n_estimators': 119, 'subsample': 0.7969329049076028, 'colsample_bytree': 0.5048800147939041, 'min_child_weight': 2}. Best is trial 1 with value: 0.8283573420074349.
[I 2025-05-05 03:14:06,913] Trial 0 finished with value: 0.8014637546468402 and parameters: {'learning_rate': 0.0024288482682295618, 'max_depth': 8, 'n_estimators': 159, 'subsample': 0.6623757693058729, 'colsample_bytree': 0.5478323744040681, 'min_child_weight': 10}. Best is trial 1 with value: 0.8283573420074349.
[I 2025-05-05 03:15:50,994] Trial 9 finished with value: 0.8838870817843866 and parameters: {'learning_rate': 0.07720111435762671, 'max_depth': 9, 'n_estimators': 288, 'subsample': 0.6615885828119653, 'colsample_bytree': 0.9453776353439376, 'min_child_weigh

'              precision    recall  f1-score   support\n\n           0       0.90      0.99      0.94     18009\n           1       0.84      0.86      0.85      2915\n           2       0.97      0.43      0.60      2579\n           3       0.88      0.76      0.82      2321\n\n    accuracy                           0.90     25824\n   macro avg       0.90      0.76      0.80     25824\nweighted avg       0.90      0.90      0.89     25824\n'

In [10]:
# CatBoost
# Optuna search over 10 trials with ModelPipeline's default data split.
# NOTE(review): the recorded output for this cell stops after the first trial,
# so the run looks incomplete — confirm before comparing against other models.
model = ModelPipeline(model=CatBoostModel)
metrics = model.train(X, y, use_optuna=True, n_trials=10)
# Print the report so it renders as a table, matching the other model cells
# (assumes train() returns a text report as it does for them).
print(metrics)

[I 2025-05-05 05:26:34,466] A new study created in memory with name: no-name-7fb5e60e-8fd7-4883-ab16-9297318a8746
[I 2025-05-05 05:26:55,381] Trial 0 finished with value: 0.7687616171003717 and parameters: {'iterations': 574, 'learning_rate': 0.05824000746304977, 'depth': 12, 'l2_leaf_reg': 4.2769925577537305e-06}. Best is trial 0 with value: 0.7687616171003717.


: 

LightGBM
precision    recall  f1-score   support

           0       0.96      1.00      0.97     11917
           1       0.98      0.98      0.98      1920
           2       0.99      0.71      0.83      1744
           3       0.98      0.96      0.97      1635

    accuracy                           0.96     17216
   macro avg       0.97      0.91      0.94     17216
weighted avg       0.96      0.96      0.96     17216

precision    recall  f1-score   support

           0       0.91      0.98      0.94     18009
           1       0.85      0.85      0.85      2915
           2       0.91      0.47      0.62      2579
           3       0.87      0.77      0.82      2321

    accuracy                           0.90     25824
   macro avg       0.88      0.77      0.81     25824
weighted avg       0.90      0.90      0.89     25824

precision    recall  f1-score   support

           0       0.94      0.90      0.92      5975
           1       0.81      0.89      0.85       989
           2       0.61      0.71      0.66       874
           3       0.78      0.82      0.80       770

    accuracy                           0.87      8608
   macro avg       0.79      0.83      0.81      8608
weighted avg       0.88      0.87      0.88      8608



XGBoost
precision    recall  f1-score   support

           0       0.91      0.99      0.95     18009
           1       0.86      0.87      0.86      2915
           2       0.97      0.48      0.64      2579
           3       0.89      0.79      0.84      2321

    accuracy                           0.91     25824
   macro avg       0.91      0.78      0.82     25824
weighted avg       0.91      0.91      0.90     25824

precision    recall  f1-score   support

           0       1.00      1.00      1.00     11917
           1       1.00      1.00      1.00      1920
           2       1.00      1.00      1.00      1744
           3       1.00      1.00      1.00      1635

    accuracy                           1.00     17216
   macro avg       1.00      1.00      1.00     17216
weighted avg       1.00      1.00      1.00     17216

precision    recall  f1-score   support

           0       0.90      0.99      0.94     18009
           1       0.84      0.86      0.85      2915
           2       0.97      0.43      0.60      2579
           3       0.88      0.76      0.82      2321

    accuracy                           0.90     25824
   macro avg       0.90      0.76      0.80     25824
weighted avg       0.90      0.90      0.89     25824


CatBoost


## Load Models

In [None]:
# Reload a saved LightGBM model, selected via the "weighted avg_f1-score" metric,
# and produce a classification report on X, y.
# NOTE(review): the recorded output of this cell is
# "ValueError: No runs found in experiment 'None' ..." — the experiment name
# appears to be unset; confirm how ModelPipeline is told which experiment to search.
# NOTE(review): X, y are the full feature set, which presumably includes the
# rows used for training — confirm this evaluation is intended.
lightgbm_model = ModelPipeline(model=LightGBMModel)
lightgbm_model.load_model(best_metric="weighted avg_f1-score")
lightgbm_model.classification_report(X, y)

ValueError: No runs found in experiment 'None' with the specified criteria.

# Test Inference

In [None]:
# === Batch Inference Utility ===
def run_batch_inference(model, input_folder, output_path, sr=16000, feature_mode="traditional"):
    """Predict a label for every ``*.wav`` file under ``input_folder`` and save a CSV.

    Files are discovered recursively and processed in sorted path order so the
    output is reproducible. Files whose audio cannot be loaded or preprocessed
    are skipped.

    Args:
        model: Object exposing ``predict(x)`` for a ``(1, n_features)`` input.
        input_folder: Directory searched recursively for ``*.wav`` files.
        output_path: Destination CSV path; columns are ``file`` and ``prediction``.
        sr: Sample rate passed to the audio loader and the feature extractor.
        feature_mode: Feature mode forwarded to the extractor.
    """
    extractor = FeatureExtractor()
    preprocessor = AudioPreprocessor()
    results = []

    for file in sorted(Path(input_folder).rglob("*.wav")):
        raw = preprocessor.load_audio(str(file), sr=sr)
        if raw is None:
            # load_audio can return None for unreadable audio (process_sample
            # guards against the same case); never hand None to preprocess().
            continue
        signal = preprocessor.preprocess(raw)
        if signal is None:
            continue
        x = extractor.extract(signal, sr=sr, mode=feature_mode).reshape(1, -1)
        pred = model.predict(x)[0]
        results.append({"file": file.name, "prediction": pred})

    # Explicit columns keep the CSV header even when no file produced a prediction.
    predictions = pd.DataFrame(results, columns=["file", "prediction"])
    predictions.to_csv(output_path, index=False)
    print(f"✅ Batch inference saved to {output_path}")
