# Model Training

## Выборочные статистики

### Baseline

In [7]:
import joblib
import librosa
import json
import pandas as pd
import optuna
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import seaborn as sns
from typing import Any, Sequence
from utils import RAW_DATA_PATH, DATA_PATH, CSV_PATH
from sklearn.pipeline import Pipeline
from numpy.typing import NDArray

type FloatArray = NDArray[np.floating[Any]]

sns.set_style("whitegrid")

In [None]:
train_df = pd.read_feather(RAW_DATA_PATH / 'train.feather.lz4')

In [None]:
target = 'Pronunciation'
X_train, y_train = train_df.drop(columns=[target]), train_df[target]

In [11]:
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [6]:
import importlib
import baseline_transformer as btf
importlib.reload(btf)

categorical_features = ['Word ID']
numeric_features = X_train.drop(columns=categorical_features).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('btf', btf.BaselineTransformer(), numeric_features),
    ]
)

In [11]:
metrics = pd.DataFrame(index=['auc_pr'], columns=['Dummy', 'LogisticRegression', 'RandomForest', 'XGBoost', 'CatBoost'])
metrics['Dummy'] = (y_train > 0).sum() / len(X_train)
metrics

Unnamed: 0,Dummy,LogisticRegression,RandomForest,XGBoost,CatBoost
auc_pr,0.143026,,,,


In [None]:
from sklearn.preprocessing import StandardScaler


def objective(trial: optuna.Trial) -> float:
    C = trial.suggest_float('C', 1e-5, 1e5, log=True)
    max_iter = trial.suggest_int('max_iter', 3000, 5000)
    k = trial.suggest_int('k', 100, 300)

    log_reg_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('standardize', StandardScaler()),
        ('selector', SelectKBest(k=k)),
        ('classifier', LogisticRegression(C=C, max_iter=max_iter, class_weight='balanced', n_jobs=-1)),
    ])

    log_reg_pipeline.fit(X_train_, y_train_)
    trial.set_user_attr("model", log_reg_pipeline)

    y_pred = log_reg_pipeline.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

best_log_reg = study.best_trial.user_attrs["model"]

y_pred = best_log_reg.predict_proba(X_val)[:, 1]
metrics['LogisticRegression'] = average_precision_score(y_val, y_pred)
metrics

[I 2025-04-24 16:57:50,204] A new study created in memory with name: no-name-5d15aa5c-9339-45c3-9673-903125eac618
[I 2025-04-24 16:58:11,947] Trial 0 finished with value: 0.24268868637021523 and parameters: {'C': 10251.117794754602, 'max_iter': 4805, 'k': 166}. Best is trial 0 with value: 0.24268868637021523.
[I 2025-04-24 16:58:31,281] Trial 1 finished with value: 0.23452529207291806 and parameters: {'C': 99132.11476085639, 'max_iter': 4340, 'k': 142}. Best is trial 0 with value: 0.24268868637021523.
[I 2025-04-24 17:04:45,209] Trial 2 finished with value: 0.27100842287241084 and parameters: {'C': 13334.648885594384, 'max_iter': 3446, 'k': 263}. Best is trial 2 with value: 0.27100842287241084.
[I 2025-04-24 17:05:04,646] Trial 3 finished with value: 0.24639376347679082 and parameters: {'C': 1.4625062091225385, 'max_iter': 4367, 'k': 162}. Best is trial 2 with value: 0.27100842287241084.
[I 2025-04-24 17:05:23,797] Trial 4 finished with value: 0.25765637704643485 and parameters: {'C': 

Unnamed: 0,Dummy,LogisticRegression,RandomForest,XGBoost,CatBoost
auc_pr,0.143026,0.279926,,,


In [None]:
from sklearn.ensemble import RandomForestClassifier

def objective(trial: optuna.Trial) -> float:
    max_depth = trial.suggest_int('max_iter', 5, 50)
    n_estimators = trial.suggest_int('max_iter', 20, 100)
    k = trial.suggest_int('k', 100, 300)

    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.3, 0.5, 0.7])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    forest_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('standardize', StandardScaler()),
        ('selector', SelectKBest(k=k)),
        ('classifier', RandomForestClassifier(
            max_depth=max_depth, 
            n_jobs=-1, 
            class_weight='balanced', 
            n_estimators=n_estimators,
            max_features=max_features,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf)),
    ])

    forest_pipeline.fit(X_train_, y_train_)
    trial.set_user_attr("model", forest_pipeline)

    y_pred = forest_pipeline.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2025-04-24 10:02:02,207] A new study created in memory with name: no-name-0d98c5ae-3173-4b59-b51b-3bf2de01cdb5
[I 2025-04-24 10:02:20,502] Trial 0 finished with value: 0.25301297938030143 and parameters: {'max_iter': 25, 'k': 112, 'max_features': 0.3, 'min_samples_split': 7, 'min_samples_leaf': 11}. Best is trial 0 with value: 0.25301297938030143.
[I 2025-04-24 10:02:38,054] Trial 1 finished with value: 0.26596898482286163 and parameters: {'max_iter': 6, 'k': 161, 'max_features': 0.7, 'min_samples_split': 18, 'min_samples_leaf': 15}. Best is trial 1 with value: 0.26596898482286163.
[I 2025-04-24 10:02:55,335] Trial 2 finished with value: 0.31126774246449385 and parameters: {'max_iter': 28, 'k': 259, 'max_features': 0.5, 'min_samples_split': 3, 'min_samples_leaf': 14}. Best is trial 2 with value: 0.31126774246449385.
[I 2025-04-24 10:03:12,337] Trial 3 finished with value: 0.26183123767654615 and parameters: {'max_iter': 23, 'k': 238, 'max_features': 'log2', 'min_samples_split': 16, 

KeyboardInterrupt: 

In [None]:
best_forest = study.best_trial.user_attrs["model"]

y_pred = best_forest.predict_proba(X_val)[:, 1]
metrics['RandomForest'] = average_precision_score(y_val, y_pred)
metrics

Unnamed: 0,Dummy,LogisticRegression,RandomForest,XGBoost,CatBoost
auc_pr,0.143026,,0.306642,,


In [None]:
from xgboost import XGBClassifier

def objective(trial: optuna.Trial) -> float:
    max_depth = trial.suggest_int('max_iter', 5, 50)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    k = trial.suggest_int('k', 100, 300)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    gamma = trial.suggest_float('gamma', 0, 5)

    xgb_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('selector', SelectKBest(k=k)),
        ('classifier', XGBClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            min_child_weight=min_child_weight,
            gamma=gamma,
            eval_metric='logloss',
            n_jobs=-1
        )),
    ])

    xgb_pipeline.fit(X_train_, y_train_)
    trial.set_user_attr("model", xgb_pipeline)

    y_pred = xgb_pipeline.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

best_xgb = study.best_trial.user_attrs["model"]

y_pred = best_xgb.predict_proba(X_val)[:, 1]
metrics['XGBoost'] = average_precision_score(y_val, y_pred)
metrics

[I 2025-04-24 01:59:01,637] A new study created in memory with name: no-name-de8d8b91-bdb9-4baa-a114-ad8fce0abe00
[I 2025-04-24 01:59:20,301] Trial 0 finished with value: 0.2510038780259412 and parameters: {'max_iter': 29, 'n_estimators': 201, 'learning_rate': 0.013896389498100903, 'k': 188, 'subsample': 0.5213797120458701, 'colsample_bytree': 0.9179400026754023, 'min_child_weight': 6, 'gamma': 0.5097243069986784}. Best is trial 0 with value: 0.2510038780259412.
[I 2025-04-24 01:59:38,544] Trial 1 finished with value: 0.2572292831401991 and parameters: {'max_iter': 15, 'n_estimators': 133, 'learning_rate': 0.01761037553053986, 'k': 169, 'subsample': 0.79525016450707, 'colsample_bytree': 0.5041528089576843, 'min_child_weight': 8, 'gamma': 3.761582527341921}. Best is trial 1 with value: 0.2572292831401991.
[I 2025-04-24 01:59:57,261] Trial 2 finished with value: 0.2658321042980574 and parameters: {'max_iter': 24, 'n_estimators': 293, 'learning_rate': 0.06691429009454082, 'k': 132, 'subsa

Unnamed: 0,Dummy,LogisticRegression,RandomForest,XGBoost,CatBoost
auc_pr,0.143026,,0.306642,0.279541,


In [None]:
from catboost import CatBoostClassifier


def objective(trial: optuna.Trial) -> float:
    depth = trial.suggest_int('depth', 4, 10)
    iterations = trial.suggest_int('iterations', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    k = trial.suggest_int('k', 100, 300)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True)
    bagging_temperature = trial.suggest_float('bagging_temperature', 0.0, 1.0)

    catboost_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('selector', SelectKBest(k=k)),
        ('classifier', CatBoostClassifier(
            depth=depth,
            iterations=iterations,
            learning_rate=learning_rate,
            l2_leaf_reg=l2_leaf_reg,
            bagging_temperature=bagging_temperature,
            verbose=0,
            eval_metric='Logloss',
            thread_count=-1
        )),
    ])

    catboost_pipeline.fit(X_train_, y_train_)
    trial.set_user_attr("model", catboost_pipeline)

    y_pred = catboost_pipeline.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

best_catboost = study.best_trial.user_attrs["model"]

y_pred = best_catboost.predict_proba(X_val)[:, 1]
metrics['CatBoost'] = average_precision_score(y_val, y_pred)
metrics


[I 2025-04-24 10:05:06,331] A new study created in memory with name: no-name-5a56b8c9-0376-4dda-a7c9-8194cf6924e0
[I 2025-04-24 10:05:27,171] Trial 0 finished with value: 0.25796128468608587 and parameters: {'depth': 8, 'iterations': 216, 'learning_rate': 0.11932286511631263, 'k': 158, 'l2_leaf_reg': 4.472944085952919, 'bagging_temperature': 0.888220556274977}. Best is trial 0 with value: 0.25796128468608587.
[I 2025-04-24 10:05:46,083] Trial 1 finished with value: 0.252859450255697 and parameters: {'depth': 7, 'iterations': 89, 'learning_rate': 0.2710959753985784, 'k': 273, 'l2_leaf_reg': 0.5870308645113466, 'bagging_temperature': 0.528271354822106}. Best is trial 0 with value: 0.25796128468608587.
[I 2025-04-24 10:06:08,026] Trial 2 finished with value: 0.27824276665562975 and parameters: {'depth': 9, 'iterations': 184, 'learning_rate': 0.14679752633464507, 'k': 128, 'l2_leaf_reg': 0.5638371181468764, 'bagging_temperature': 0.9234763385638954}. Best is trial 2 with value: 0.278242766

Unnamed: 0,Dummy,LogisticRegression,RandomForest,XGBoost,CatBoost
auc_pr,0.143026,,0.306642,0.279541,0.285581


In [36]:
metrics

Unnamed: 0,Dummy,LogisticRegression,RandomForest,XGBoost,CatBoost
baseline,0.143026,0.235981,0.306642,0.279541,0.285581


## Улучшение модели

In [None]:
# --- 0. зависимости ---
# !pip install opensmile==2.5.0 optuna catboost

import importlib, optuna
import numpy as np
import pandas as pd
import baseline_transformer as btf
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.base import BaseEstimator, TransformerMixin

# --- 1. OpenSMILE-трансформер ---
import opensmile as sm
_smile = sm.Smile(
    feature_set=sm.FeatureSet.eGeMAPSv02,
    feature_level=sm.FeatureLevel.Functionals        # → 88 признаков
)

class OpenSmileTransformer(BaseEstimator, TransformerMixin):
    """ (Audio, SR)  →  матрица (n_samples, 88) """
    def fit(self, X, y=None):
        return self
    def transform(self, X: pd.DataFrame) -> np.ndarray:
        feats = []
        for audio, sr in zip(X["Audio"], X["SR"]):
            df = _smile.process_signal(audio, sr)
            feats.append(df.values.squeeze())
        return np.vstack(feats).astype(np.float32)

# --- 2. разбор колонок ---
categorical_features = ['Word ID']          # как и было
audio_cols           = ['Audio', 'SR']      # нужны обоим аудио-блокам
numeric_other        = (
    X_train
    .drop(columns=categorical_features + audio_cols)
    .columns
    .tolist()
)

# --- 3. общий препроцессор ---
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',   OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num',   'passthrough',                           numeric_other),
        ('btf',   btf.BaselineTransformer(),              audio_cols),
        ('smile', OpenSmileTransformer(),                 audio_cols),
    ],
    remainder='drop'      # всё нужное и так перечислили
)

# --- 4. Optuna-цель (+ мелкий фикс названий гиперпараметров) ---
def objective(trial: optuna.Trial) -> float:
    max_depth          = trial.suggest_int('max_depth', 5, 50)
    n_estimators       = trial.suggest_int('n_estimators', 20, 100)
    k                  = trial.suggest_int('k', 100, 400)
    max_features       = trial.suggest_categorical(
                            'max_features', ['sqrt', 'log2', 0.3, 0.5, 0.7])
    min_samples_split  = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf   = trial.suggest_int('min_samples_leaf', 1, 20)

    forest_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        # масштабирем только непризнаковые столбцы
        ('selector',     SelectKBest(score_func=f_classif, k=k)),
        ('classifier',   RandomForestClassifier(
                            max_depth=max_depth,
                            n_estimators=n_estimators,
                            max_features=max_features,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            n_jobs=-1,
                            class_weight='balanced',
                            random_state=42
                        )),
    ])

    forest_pipeline.fit(X_train_, y_train_)
    y_pred = forest_pipeline.predict_proba(X_val)[:, 1]
    score  = average_precision_score(y_val, y_pred)
    trial.set_user_attr("model", forest_pipeline)
    return score

study = optuna.create_study(storage="sqlite:///optuna_study.db", direction='maximize')
study.optimize(objective, n_trials=20)
print('Лучшее PR-AUC:', study.best_value)
print('Гиперпараметры:', study.best_params)


[I 2025-04-24 20:12:04,190] A new study created in memory with name: no-name-9fb75d10-6fef-4d91-a837-4dffed02480f
[I 2025-04-24 20:13:40,818] Trial 0 finished with value: 0.2567013207877382 and parameters: {'max_depth': 31, 'n_estimators': 86, 'k': 222, 'max_features': 'sqrt', 'min_samples_split': 5, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.2567013207877382.
[I 2025-04-24 20:15:16,444] Trial 1 finished with value: 0.29561026142147373 and parameters: {'max_depth': 41, 'n_estimators': 51, 'k': 282, 'max_features': 0.5, 'min_samples_split': 3, 'min_samples_leaf': 14}. Best is trial 1 with value: 0.29561026142147373.
[I 2025-04-24 20:16:50,730] Trial 2 finished with value: 0.255652631911168 and parameters: {'max_depth': 17, 'n_estimators': 24, 'k': 174, 'max_features': 0.5, 'min_samples_split': 12, 'min_samples_leaf': 17}. Best is trial 1 with value: 0.29561026142147373.
[I 2025-04-24 20:18:25,598] Trial 3 finished with value: 0.28567049069786465 and parameters: {'max_depth':

Лучшее PR-AUC: 0.3188345601535713
Гиперпараметры: {'max_depth': 7, 'n_estimators': 100, 'k': 300, 'max_features': 0.3, 'min_samples_split': 2, 'min_samples_leaf': 6}


In [24]:
from all_stats_transformer import AllStatsTransformer
from catboost import CatBoostClassifier

def objective(trial: optuna.Trial) -> float:
    preprocessor = ColumnTransformer(
    transformers=[
        ('btf', AllStatsTransformer(n_mfcc=50), numeric_features),
    ])

    max_depth          = trial.suggest_int('max_depth', 5, 50)
    n_estimators       = trial.suggest_int('n_estimators', 20, 100)
    k                  = trial.suggest_int('k', 300, 2000)
    max_features       = trial.suggest_categorical(
                            'max_features', ['sqrt', 'log2', 0.3, 0.5, 0.7])
    min_samples_split  = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf   = trial.suggest_int('min_samples_leaf', 1, 20)

    forest_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        # масштабирем только непризнаковые столбцы
        ('selector',     SelectKBest(k=k)),
        ('classifier',   RandomForestClassifier(
                            max_depth=max_depth,
                            n_estimators=n_estimators,
                            max_features=max_features,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            n_jobs=-1,
                            class_weight='balanced',
                            random_state=42
                        )),
    ])

    forest_pipeline.fit(X_train_, y_train_)
    y_pred = forest_pipeline.predict_proba(X_val)[:, 1]
    score  = average_precision_score(y_val, y_pred)
    trial.set_user_attr("model", forest_pipeline)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)
print('Лучшее PR-AUC:', study.best_value)
print('Гиперпараметры:', study.best_params)

[I 2025-04-25 11:39:43,379] A new study created in memory with name: no-name-00fc4a59-4197-41a0-8473-46599f59de65
  f = msb / msw
[I 2025-04-25 11:40:13,623] Trial 0 finished with value: 0.20373352704671743 and parameters: {'max_depth': 39, 'n_estimators': 77, 'k': 1849, 'max_features': 'sqrt', 'min_samples_split': 19, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.20373352704671743.
  f = msb / msw
[I 2025-04-25 11:40:42,858] Trial 1 finished with value: 0.23144022661095204 and parameters: {'max_depth': 9, 'n_estimators': 21, 'k': 1273, 'max_features': 0.5, 'min_samples_split': 14, 'min_samples_leaf': 19}. Best is trial 1 with value: 0.23144022661095204.
  f = msb / msw
[I 2025-04-25 11:41:12,920] Trial 2 finished with value: 0.19625040803867505 and parameters: {'max_depth': 8, 'n_estimators': 41, 'k': 1053, 'max_features': 'log2', 'min_samples_split': 3, 'min_samples_leaf': 19}. Best is trial 1 with value: 0.23144022661095204.
  f = msb / msw
[I 2025-04-25 11:41:42,154] Trial 

Лучшее PR-AUC: 0.2609897596823534
Гиперпараметры: {'max_depth': 31, 'n_estimators': 68, 'k': 1497, 'max_features': 0.3, 'min_samples_split': 14, 'min_samples_leaf': 14}
