# BaseExperiments

In [1]:
import os

os.chdir("..")

In [None]:
import polars as pl

from src.constants import RANDOM_STATE, TRAIN_SAMPLE_SIZE
from src.datatypes import BaseSchema, TrainSchema, filepaths

In [3]:
# Alias
S = TrainSchema

train_schema = BaseSchema.__annotations__ | TrainSchema.__annotations__

In [4]:
data = pl.scan_parquet(filepaths.train, schema=train_schema, cast_options=pl.ScanCastOptions(integer_cast='upcast')).head(TRAIN_SAMPLE_SIZE).collect()
data_lf = data.lazy()

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

TEST_SIZE = 0.10

X = data_lf.drop(S.ip, S.label(), S.attributed_time, S.click_time)
y = data_lf.select(S.label())

In [None]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

used_columns = [S.app, S.device, S.os, S.channel]
ct_ordinal = ColumnTransformer(
    [
        ("ordinal_encoder", OrdinalEncoder(min_frequency=0.000008, unknown_value=-1, handle_unknown='use_encoded_value', dtype=np.int16), used_columns)
    ]
)
ct_onehot = ColumnTransformer(
    [
        ("onehot_encoder", OneHotEncoder(min_frequency=0.00001, handle_unknown='ignore', dtype=np.bool, sparse_output=False), [S.app, S.device, S.os, S.channel])
    ],
)

pipeline_ordinal = Pipeline(
    [
        ("column_transformer", ct_ordinal)
    ]
)
pipeline_ordinal.set_output(transform='polars')


pipeline_onehot = Pipeline(
    [
        ("column_transformer", ct_onehot)
    ]
)
pipeline_onehot.set_output(transform='polars')
# takes 40s, ~430 columns
# X_proc: pl.DataFrame = pipeline_onehot.fit_transform(X.collect())

0,1,2
,steps,"[('column_transformer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot_encoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.bool'>
,handle_unknown,'ignore'
,min_frequency,1e-05
,max_categories,
,feature_name_combiner,'concat'


I decided not to use models that rely on OneHot columns, because due to high number of columns my RAM goes over 9000.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X.collect(), y.collect(),
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y.collect()
)

In [8]:
import gc
from time import perf_counter
from typing import Any

from sklearn import clone
from sklearn.base import BaseEstimator


def do_experiment(classifiers: dict[str, tuple[Pipeline, BaseEstimator]]) -> list[dict[str, Any]]:
    results: list[dict[str, Any]] = []

    for classifier_name, (pipeline, clf) in classifiers.items():
        time_started = perf_counter()
        print(f"Training {classifier_name}...")
        pipeline = clone(pipeline)
        pipeline.steps.append(
            (classifier_name, clf)
        )
        pipeline.set_output(transform='polars')

        pipeline.fit(X_train, y_train)
    
        y_train_proba = pipeline.predict_proba(X_train)[:, 1]
        y_test_proba  = pipeline.predict_proba(X_test)[:, 1]
    
        auc_train: float = roc_auc_score(y_train, y_train_proba)
        auc_test: float = roc_auc_score(y_test,  y_test_proba)
    
        time_ended = perf_counter()
        time_taken = time_ended - time_started

        _result = {
        "Classifier": classifier_name,
        "AUC (Train)": auc_train,
        "AUC (Test)": auc_test,
        "Time taken": time_taken,
    }
        print(_result)
        print()
        results.append(_result)
        gc.collect()
    return results

# Experiment 1: Default hyperparameters

In [None]:
classifiers_1: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "DecisionTree": (pipeline_ordinal, DecisionTreeClassifier(random_state=RANDOM_STATE)),
    "RandomForest": (pipeline_ordinal, RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline_ordinal, AdaBoostClassifier(
        n_estimators=50, random_state=RANDOM_STATE,
    )),
    "XGBoost": (pipeline_ordinal, XGBClassifier(
        n_estimators=100,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

experiment_1_results = do_experiment(classifiers_1)
print(pl.DataFrame(experiment_1_results))

Training DecisionTree...
{'Classifier': 'DecisionTree', 'AUC (Train)': 0.9779992602629549, 'AUC (Test)': 0.935006639946952, 'Time taken': 32.46685731299999}

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.9779280159907966, 'AUC (Test)': 0.9477337855682034, 'Time taken': 436.9135770500001}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9269714655292639, 'AUC (Test)': 0.9214952808060439, 'Time taken': 149.03375881700003}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.6874446778038102, 'AUC (Test)': 0.6925844556854701, 'Time taken': 32.05036624600007}

shape: (4, 4)
┌──────────────┬─────────────┬────────────┬────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken │
│ ---          ┆ ---         ┆ ---        ┆ ---        │
│ str          ┆ f64         ┆ f64        ┆ f64        │
╞══════════════╪═════════════╪════════════╪════════════╡
│ DecisionTree ┆ 0.977999    ┆ 0.935007   ┆ 32.466857  │
│ RandomForest ┆ 0.977928    ┆ 0.947734   ┆ 436.913577 │
│ AdaBoost     ┆ 0.926971    ┆ 0.921495   ┆ 149.033759 │
│ XGBoost      ┆ 0.687445    ┆ 0.692584   ┆ 32.050366  │
└──────────────┴─────────────┴────────────┴────────────┘


In [29]:
experiment_1_results[0]['Hyperparameters'] = 'default'
experiment_1_results[1]['Hyperparameters'] = 'n_estimators=100'
experiment_1_results[2]['Hyperparameters'] = 'n_estimators=50'
experiment_1_results[3]['Hyperparameters'] = 'n_estimators=100'

# Experiment 2: Default hyperparameters 2x

In [10]:
classifiers_2: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline_ordinal, RandomForestClassifier(
        n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline_ordinal, AdaBoostClassifier(
        n_estimators=100, random_state=RANDOM_STATE,
    )),
    "XGBoost": (pipeline_ordinal, XGBClassifier(
        n_estimators=200,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

experiment_2_results = do_experiment(classifiers_2)
print(pl.DataFrame(experiment_2_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.97793920646442, 'AUC (Test)': 0.9479617608595885, 'Time taken': 860.0192983580001}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.934559837481279, 'AUC (Test)': 0.9282696388947766, 'Time taken': 294.105632024}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.7493822624659577, 'AUC (Test)': 0.7521011896407518, 'Time taken': 63.57696490199987}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken │
│ ---          ┆ ---         ┆ ---        ┆ ---        │
│ str          ┆ f64         ┆ f64        ┆ f64        │
╞══════════════╪═════════════╪════════════╪════════════╡
│ RandomForest ┆ 0.977939    ┆ 0.947962   ┆ 860.019298 │
│ AdaBoost     ┆ 0.93456     ┆ 0.92827    ┆ 294.105632 │
│ XGBoost      ┆ 0.749382    ┆ 0.752101   ┆ 63.576965  │
└──────────────┴─────────────┴────────────┴────────────┘


In [31]:
experiment_2_results[0]['Hyperparameters'] = 'n_estimators=200'
experiment_2_results[1]['Hyperparameters'] = 'n_estimators=100'
experiment_2_results[2]['Hyperparameters'] = 'n_estimators=200'

# Experiment 3: Default hyperparameters 3x

In [11]:
classifiers_3: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline_ordinal, RandomForestClassifier(
        n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline_ordinal, AdaBoostClassifier(
        n_estimators=150, random_state=RANDOM_STATE,
    )),
    "XGBoost": (pipeline_ordinal, XGBClassifier(
        n_estimators=300,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

experiment_3_results = do_experiment(classifiers_3)
print(pl.DataFrame(experiment_3_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.9779412826935199, 'AUC (Test)': 0.9483099273263864, 'Time taken': 1286.846314917}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9394338954702874, 'AUC (Test)': 0.9345694161190964, 'Time taken': 429.4363373269998}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.7828127409535687, 'AUC (Test)': 0.7805573310615768, 'Time taken': 95.78262418099985}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬─────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---          ┆ ---         ┆ ---        ┆ ---         │
│ str          ┆ f64         ┆ f64        ┆ f64         │
╞══════════════╪═════════════╪════════════╪═════════════╡
│ RandomForest ┆ 0.977941    ┆ 0.94831    ┆ 1286.846315 │
│ AdaBoost     ┆ 0.939434    ┆ 0.934569   ┆ 429.436337  │
│ XGBoost      ┆ 0.782813    ┆ 0.780557   ┆ 95.782624   │
└──────────────┴─────────────┴────────────┴─────────────┘


In [32]:
experiment_3_results[0]['Hyperparameters'] = 'n_estimators=300'
experiment_3_results[1]['Hyperparameters'] = 'n_estimators=150'
experiment_3_results[2]['Hyperparameters'] = 'n_estimators=300'

# Experiment 4: Default hyperparameters 4x

In [12]:
classifiers_4: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline_ordinal, RandomForestClassifier(
        n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline_ordinal, AdaBoostClassifier(
        n_estimators=200, random_state=RANDOM_STATE,
    )),
    "XGBoost": (pipeline_ordinal, XGBClassifier(
        n_estimators=400,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

experiment_4_results = do_experiment(classifiers_4)
print(pl.DataFrame(experiment_4_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.977942367129656, 'AUC (Test)': 0.9488703745740855, 'Time taken': 1703.541537866}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9403945637507991, 'AUC (Test)': 0.9354708659472665, 'Time taken': 567.5284954560002}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.8149289652865905, 'AUC (Test)': 0.8105505045333288, 'Time taken': 121.19032256699938}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬─────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---          ┆ ---         ┆ ---        ┆ ---         │
│ str          ┆ f64         ┆ f64        ┆ f64         │
╞══════════════╪═════════════╪════════════╪═════════════╡
│ RandomForest ┆ 0.977942    ┆ 0.94887    ┆ 1703.541538 │
│ AdaBoost     ┆ 0.940395    ┆ 0.935471   ┆ 567.528495  │
│ XGBoost      ┆ 0.814929    ┆ 0.810551   ┆ 121.190323  │
└──────────────┴─────────────┴────────────┴─────────────┘


In [33]:
experiment_4_results[0]['Hyperparameters'] = 'n_estimators=400'
experiment_4_results[1]['Hyperparameters'] = 'n_estimators=200'
experiment_4_results[2]['Hyperparameters'] = 'n_estimators=400'

# Experiment 5: Default hyperparameters 5x

In [13]:
classifiers_5: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline_ordinal, RandomForestClassifier(
        n_estimators=500, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline_ordinal, AdaBoostClassifier(
        n_estimators=250, random_state=RANDOM_STATE,
    )),
    "XGBoost": (pipeline_ordinal, XGBClassifier(
        n_estimators=500,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

experiment_5_results = do_experiment(classifiers_5)
print(pl.DataFrame(experiment_5_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.977943761887393, 'AUC (Test)': 0.9491106793484663, 'Time taken': 2122.916024326999}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9422210902631721, 'AUC (Test)': 0.9372030878926004, 'Time taken': 705.6097684269989}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.8344045990238719, 'AUC (Test)': 0.8291526020571841, 'Time taken': 157.69365388900042}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬─────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---          ┆ ---         ┆ ---        ┆ ---         │
│ str          ┆ f64         ┆ f64        ┆ f64         │
╞══════════════╪═════════════╪════════════╪═════════════╡
│ RandomForest ┆ 0.977944    ┆ 0.949111   ┆ 2122.916024 │
│ AdaBoost     ┆ 0.942221    ┆ 0.937203   ┆ 705.609768  │
│ XGBoost      ┆ 0.834405    ┆ 0.829153   ┆ 157.693654  │
└──────────────┴─────────────┴────────────┴─────────────┘


In [None]:
experiment_5_results[0]['Hyperparameters'] = 'n_estimators=500'
experiment_5_results[1]['Hyperparameters'] = 'n_estimators=250'
experiment_5_results[2]['Hyperparameters'] = 'n_estimators=500'

# Experiment 6: Previous hyperparams 2x Ada, XGBoost

In [14]:
classifiers_6: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "AdaBoost": (pipeline_ordinal, AdaBoostClassifier(
        n_estimators=500, random_state=RANDOM_STATE,
    )),
    "XGBoost": (pipeline_ordinal, XGBClassifier(
        n_estimators=1000,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

experiment_6_results = do_experiment(classifiers_6)
print(pl.DataFrame(experiment_6_results))

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9463478713710278, 'AUC (Test)': 0.9419614583512357, 'Time taken': 1471.7912652710002}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.8585022217875349, 'AUC (Test)': 0.852475397525574, 'Time taken': 336.87712608999936}

shape: (2, 4)
┌────────────┬─────────────┬────────────┬─────────────┐
│ Classifier ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---        ┆ ---         ┆ ---        ┆ ---         │
│ str        ┆ f64         ┆ f64        ┆ f64         │
╞════════════╪═════════════╪════════════╪═════════════╡
│ AdaBoost   ┆ 0.946348    ┆ 0.941961   ┆ 1471.791265 │
│ XGBoost    ┆ 0.858502    ┆ 0.852475   ┆ 336.877126  │
└────────────┴─────────────┴────────────┴─────────────┘


In [35]:
experiment_6_results[0]['Hyperparameters'] = 'n_estimators=500'
experiment_6_results[1]['Hyperparameters'] = 'n_estimators=1000'

In [None]:
for i, _experiment in enumerate((experiment_1_results, experiment_2_results, experiment_3_results, experiment_4_results, experiment_5_results, experiment_6_results), start=1):
    for a in _experiment:
        a['Experiment'] = i

# Save experiments

In [50]:
experiments = pl.concat((
    pl.DataFrame(experiment_1_results).select('Experiment', pl.all().exclude('Experiment')), 
    pl.DataFrame(experiment_2_results).select('Experiment', pl.all().exclude('Experiment')), 
    pl.DataFrame(experiment_3_results).select('Experiment', pl.all().exclude('Experiment')), 
    pl.DataFrame(experiment_4_results).select('Experiment', pl.all().exclude('Experiment')), 
    pl.DataFrame(experiment_5_results).select('Experiment', pl.all().exclude('Experiment')), 
    pl.DataFrame(experiment_6_results).select('Experiment', pl.all().exclude('Experiment')),
))
experiments = experiments.with_columns(
    pl.all(), used_columns=pl.lit(", ".join(used_columns)), preprocessing=pl.lit("min_frequency=0.000008, unknown_value=-1, handle_unknown='use_encoded_value', dtype=np.int16")
)
experiments

Experiment,Classifier,AUC (Train),AUC (Test),Time taken,Hyperparameters,used_columns,preprocessing
i64,str,f64,f64,f64,str,str,str
1,"""DecisionTree""",0.977999,0.935007,32.466857,"""default""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
1,"""RandomForest""",0.977928,0.947734,436.913577,"""n_estimators=100""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
1,"""AdaBoost""",0.926971,0.921495,149.033759,"""n_estimators=50""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
1,"""XGBoost""",0.687445,0.692584,32.050366,"""n_estimators=100""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
2,"""RandomForest""",0.977939,0.947962,860.019298,"""n_estimators=200""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
…,…,…,…,…,…,…,…
5,"""RandomForest""",0.977944,0.949111,2122.916024,"""n_estimators=500""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
5,"""AdaBoost""",0.942221,0.937203,705.609768,"""n_estimators=250""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
5,"""XGBoost""",0.834405,0.829153,157.693654,"""n_estimators=500""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"
6,"""AdaBoost""",0.946348,0.941961,1471.791265,"""n_estimators=500""","""app, device, os, channel""","""min_frequency=0.000008, unknow…"


In [51]:
from pathlib import Path

experiments_fp = Path("experiments/03_base_experiments.csv")
experiments_fp.parent.mkdir(parents=True, exist_ok=True)

experiments.write_csv(experiments_fp)