# Hyperparameters Tuning

In [1]:
import os

os.chdir("..")

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import polars as pl

from src.constants import RANDOM_STATE, TEST_SIZE, TRAIN_SAMPLE_SIZE
from src.datatypes import BaseSchema, BaseSchemaN, ExtendedSchema, TrainSchema, filepaths

In [5]:
# Alias
S = TrainSchema
E = ExtendedSchema

train_schema = BaseSchema.__annotations__ | TrainSchema.__annotations__

In [6]:
data = pl.scan_parquet(filepaths.train_unique, schema=train_schema, cast_options=pl.ScanCastOptions(integer_cast='upcast')).head(TRAIN_SAMPLE_SIZE)
data = data.cast({S.click_time: pl.Datetime('ms')})
data = data.sort(S.click_time, maintain_order=True)

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    data.collect(),
    test_size=TEST_SIZE,
    shuffle=False,
)
train: pl.DataFrame
test: pl.DataFrame

train = train.sort([BaseSchemaN.ip, BaseSchemaN.click_time], maintain_order=True)
test = test.sort([BaseSchemaN.ip, BaseSchemaN.click_time], maintain_order=True)

X_train: pl.DataFrame = train.drop(S.attributed_time, S.label())
X_test: pl.DataFrame = test.drop(S.attributed_time, S.label())

y_train: pl.DataFrame = train.select(S.label())
y_test: pl.DataFrame = test.select(S.label())

In [8]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

from src.feature_engineering import make_derived_columns

ordinal_columns = [S.ip, S.app, S.device, S.os, S.channel]
no_op_columns = list(ExtendedSchema.__annotations__.keys())
used_columns = ordinal_columns + no_op_columns

column_extender = FunctionTransformer(func=make_derived_columns)
ct_ordinal = ColumnTransformer(
    [
        ("raw", FunctionTransformer(), no_op_columns),
        ("ordinal_encoder", OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value', dtype=np.int16), ordinal_columns),
    ]
)
model = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)

pipeline = Pipeline(
    [
        ("column_extender", column_extender),
        ("column_transformer", ct_ordinal),
        ("model", model),
    ]
)
pipeline.set_output(transform='polars')

0,1,2
,steps,"[('column_extender', ...), ('column_transformer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function mak...x7bff0b09a160>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('raw', ...), ('ordinal_encoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int16'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [1]:
param_grid = {
    "n_estimators": [250, 500],
    "class_weight": [None, "balanced"],
    "max_features": [3, 4, 5, 6],
    "min_samples_leaf": [20],
}

In [None]:
from dataclasses import dataclass, field
from sklearn import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid


@dataclass(kw_only=True)
class ExperimentResult:
    hyperparameters: dict
    train_auc: float
    test_auc: float
    pipeline: RandomForestClassifier = field(repr=False)


experiment_results: list[ExperimentResult] = []

for _hyperparams in ParameterGrid(param_grid):
    print(f"Training model with hyperparameters: {_hyperparams}")
    _pipeline = clone(pipeline)
    _model: RandomForestClassifier = pipeline[-1]
    _model.set_params(**_hyperparams)
    
    _pipeline.fit(X_train, y_train)
    y_train_proba = _pipeline.predict_proba(X_train)[:, 1]
    y_test_proba  = _pipeline.predict_proba(X_test)[:, 1]

    _train_auc: float = roc_auc_score(y_train, y_train_proba)
    _test_auc: float = roc_auc_score(y_test,  y_test_proba)

    _experiment = ExperimentResult(
        hyperparameters=_hyperparams,
        train_auc=_train_auc,
        test_auc=_test_auc,
        pipeline=_pipeline
    )
    print(f"Current run finished:\n{_experiment}")
    experiment_results.append(_experiment)
    
experiment_results = sorted(experiment_results, key=lambda x: x.test_auc, reverse=True)
best_experiment = experiment_results[0]
print(f"Best experiment:]n{best_experiment}")

Training model with hyperparameters: {'class_weight': None, 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 250}, train_auc=0.9989776293271907, test_auc=0.9510746399886993)
Training model with hyperparameters: {'class_weight': None, 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 500}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 500}, train_auc=0.99828364271854, test_auc=0.9492180138623411)
Training model with hyperparameters: {'class_weight': None, 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 250}, train_auc=0.9982829527807061, test_auc=0.9496814842461228)
Training model with hyperparameters: {'class_weight': None, 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 500}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 500}, train_auc=0.9984079716003434, test_auc=0.9512577782352818)
Training model with hyperparameters: {'class_weight': None, 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 250}, train_auc=0.998413051646163, test_auc=0.9507912197237933)
Training model with hyperparameters: {'class_weight': None, 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 500}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 500}, train_auc=0.9984728371367553, test_auc=0.9488668348824096)
Training model with hyperparameters: {'class_weight': None, 'max_features': 6, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 6, 'min_samples_leaf': 20, 'n_estimators': 250}, train_auc=0.9984754695152147, test_auc=0.9496741505338827)
Training model with hyperparameters: {'class_weight': None, 'max_features': 6, 'min_samples_leaf': 20, 'n_estimators': 500}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': None, 'max_features': 6, 'min_samples_leaf': 20, 'n_estimators': 500}, train_auc=0.998506818008885, test_auc=0.949731438684757)
Training model with hyperparameters: {'class_weight': 'balanced', 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': 'balanced', 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 250}, train_auc=0.9985117754824437, test_auc=0.949965120377755)
Training model with hyperparameters: {'class_weight': 'balanced', 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 500}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': 'balanced', 'max_features': 3, 'min_samples_leaf': 20, 'n_estimators': 500}, train_auc=0.9989776293271908, test_auc=0.9510746399886993)
Training model with hyperparameters: {'class_weight': 'balanced', 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': 'balanced', 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 250}, train_auc=0.9989796530895387, test_auc=0.9489201697939741)
Training model with hyperparameters: {'class_weight': 'balanced', 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 500}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': 'balanced', 'max_features': 4, 'min_samples_leaf': 20, 'n_estimators': 500}, train_auc=0.9990433659084053, test_auc=0.9464463019578402)
Training model with hyperparameters: {'class_weight': 'balanced', 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': 'balanced', 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 250}, train_auc=0.9990465970350471, test_auc=0.9469093422528687)
Training model with hyperparameters: {'class_weight': 'balanced', 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 500}


  return fit_method(estimator, *args, **kwargs)


Current run finished:
ExperimentResult(hyperparameters={'class_weight': 'balanced', 'max_features': 5, 'min_samples_leaf': 20, 'n_estimators': 500}, train_auc=0.9990810635008169, test_auc=0.9458958362555178)
Training model with hyperparameters: {'class_weight': 'balanced', 'max_features': 6, 'min_samples_leaf': 20, 'n_estimators': 250}


  return fit_method(estimator, *args, **kwargs)
