# Extended Columns Experiments

In [1]:
import os

# The notebook imports from the `src` package and uses paths relative to the
# project root, so the working directory must be the notebook's parent.
# Only step up when `src` is not already visible: an unconditional
# os.chdir("..") climbs one more level every time this cell is re-run.
if not os.path.isdir("src"):
    os.chdir("..")

In [2]:
# Enable IPython's autoreload extension (its mode is set in the next cell).
%load_ext autoreload

In [3]:
# Mode 2: reload imported modules before each cell runs, so edits to the
# project's own modules are picked up without restarting the kernel.
%autoreload 2

In [4]:
import polars as pl

from src.constants import RANDOM_STATE, TRAIN_SAMPLE_SIZE
from src.datatypes import BaseSchema, BaseSchemaN, TrainSchema, ExtendedSchema, filepaths

In [5]:
# Short handles for the schema classes used throughout the notebook.
S, E = TrainSchema, ExtendedSchema

# Schema handed to the parquet scan: BaseSchema's annotations with
# TrainSchema's merged on top (TrainSchema wins on duplicate keys).
train_schema = {**BaseSchema.__annotations__, **TrainSchema.__annotations__}

In [29]:
# Build the lazy query in one chain: scan the parquet file with the training
# schema (upcasting integer columns where needed), keep the first
# TRAIN_SAMPLE_SIZE rows, cast click_time to millisecond datetimes and sort
# by click time. Nothing is read until `.collect()` is called.
data = (
    pl.scan_parquet(
        filepaths.train_unique,
        schema=train_schema,
        cast_options=pl.ScanCastOptions(integer_cast='upcast'),
    )
    .head(TRAIN_SAMPLE_SIZE)
    .cast({S.click_time: pl.Datetime('ms')})
    .sort(S.click_time, maintain_order=True)
)

In [30]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for evaluation.
TEST_SIZE = 0.10

train: pl.DataFrame
test: pl.DataFrame
# shuffle=False keeps the click_time ordering of `data`, so the test split is
# the last TEST_SIZE share of the rows rather than a random sample.
train, test = train_test_split(data.collect(), test_size=TEST_SIZE, shuffle=False)

# Within each split, order rows by ip and then by click time.
sort_keys = [BaseSchemaN.ip, BaseSchemaN.click_time]
train = train.sort(sort_keys, maintain_order=True)
test = test.sort(sort_keys, maintain_order=True)

# Features: everything except the attribution timestamp and the label.
non_feature_columns = (S.attributed_time, S.label())
X_train: pl.DataFrame = train.drop(*non_feature_columns)
X_test: pl.DataFrame = test.drop(*non_feature_columns)

# Targets are kept as single-column DataFrames.
# NOTE(review): the scikit-learn "column_or_1d" warnings seen while training
# come from this 2-D shape; switching to a Series would silence them, but
# confirm what do_experiment expects first.
label_column = S.label()
y_train: pl.DataFrame = train.select(label_column)
y_test: pl.DataFrame = test.select(label_column)

In [31]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# FunctionTransformer is part of sklearn.preprocessing's public API; importing
# it from sklearn.pipeline only worked because that module happens to import
# it internally.
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

from src.feature_engineering import make_derived_columns


# Categorical ID columns that are ordinal-encoded.
ordinal_columns = [S.app, S.device, S.os, S.channel]
# Columns forwarded unchanged: the raw ip plus every column annotated on
# ExtendedSchema (presumably the ones make_derived_columns adds; verify there).
no_op_columns = [S.ip] + list(ExtendedSchema.__annotations__.keys())

# Step 1: run make_derived_columns on the incoming frame.
column_extender = FunctionTransformer(func=make_derived_columns)

# Step 2: pass the no-op columns through and ordinal-encode the categorical
# ones. Columns not listed here are dropped (ColumnTransformer's default).
ct_ordinal = ColumnTransformer(
    [
        # FunctionTransformer() without a func is the identity transform.
        ("raw", FunctionTransformer(), no_op_columns),
        (
            "ordinal_encoder",
            OrdinalEncoder(
                # Categories rarer than this fraction of rows are grouped
                # together as "infrequent".
                min_frequency=0.000008,
                # Categories unseen during fit are encoded as -1.
                unknown_value=-1,
                handle_unknown='use_encoded_value',
                dtype=np.int16,
            ),
            ordinal_columns,
        ),
    ]
)

pipeline = Pipeline(
    [
        ("column_extender", column_extender),
        ("column_transformer", ct_ordinal),
    ]
)
# Make every step return polars DataFrames; the call returns the pipeline, so
# its repr is shown as the cell output.
pipeline.set_output(transform='polars')

0,1,2
,steps,"[('column_extender', ...), ('column_transformer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function mak...x751507286160>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('raw', ...), ('ordinal_encoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int16'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,8e-06
,max_categories,


In [32]:
# Fit the two-step preprocessing pipeline on the training split.
# fit() returns the pipeline itself, so its repr is displayed as the cell output.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('column_extender', ...), ('column_transformer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function mak...x751507286160>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('raw', ...), ('ordinal_encoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int16'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,8e-06
,max_categories,


# Experiment 1: Default hyperparameters

In [34]:
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from src.utils import do_experiment

# Baseline run: every model shares the same preprocessing pipeline object.
# Each entry maps a display name to (preprocessing pipeline, estimator).
classifiers_1: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "DecisionTree": (pipeline, DecisionTreeClassifier(random_state=RANDOM_STATE)),
    "RandomForest": (pipeline, RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline, AdaBoostClassifier(
        n_estimators=50, random_state=RANDOM_STATE,
    )),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and emits a 'Parameters: { "use_label_encoder" } are not
    # used.' warning on every fit.
    "XGBoost": (pipeline, XGBClassifier(
        n_estimators=100,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

# do_experiment returns one result dict per classifier (name, train/test AUC,
# time taken); print them as a table.
experiment_1_results = do_experiment(classifiers_1, X_train, X_test, y_train, y_test)
print(pl.DataFrame(experiment_1_results))

Training DecisionTree...
{'Classifier': 'DecisionTree', 'AUC (Train)': 0.9999999994494709, 'AUC (Test)': 0.6996169746337058, 'Time taken': 223.7307301760011}

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.9999999892487771, 'AUC (Test)': 0.9349589833237387, 'Time taken': 710.2273963589978}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9488684262442997, 'AUC (Test)': 0.9270630727163193, 'Time taken': 458.96459006497753}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.7608811919035203, 'AUC (Test)': 0.6234198970938565, 'Time taken': 48.431491732015274}

shape: (4, 4)
┌──────────────┬─────────────┬────────────┬────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken │
│ ---          ┆ ---         ┆ ---        ┆ ---        │
│ str          ┆ f64         ┆ f64        ┆ f64        │
╞══════════════╪═════════════╪════════════╪════════════╡
│ DecisionTree ┆ 1.0         ┆ 0.699617   ┆ 223.73073  │
│ RandomForest ┆ 1.0         ┆ 0.934959   ┆ 710.227396 │
│ AdaBoost     ┆ 0.948868    ┆ 0.927063   ┆ 458.96459  │
│ XGBoost      ┆ 0.760881    ┆ 0.62342    ┆ 48.431492  │
└──────────────┴─────────────┴────────────┴────────────┘


In [35]:
# Label each result with the setting it was trained with. Results are in
# training order: DecisionTree, RandomForest, AdaBoost, XGBoost.
for position, setting in enumerate(
    ('default', 'n_estimators=100', 'n_estimators=50', 'n_estimators=100')
):
    experiment_1_results[position]['Hyperparameters'] = setting

# Experiment 2: 2x the default `n_estimators`

In [37]:
# Same setup as the baseline with n_estimators doubled for each ensemble.
# Each entry maps a display name to (preprocessing pipeline, estimator).
classifiers_2: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline, RandomForestClassifier(
        n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline, AdaBoostClassifier(
        n_estimators=100, random_state=RANDOM_STATE,
    )),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and emits a 'Parameters: { "use_label_encoder" } are not
    # used.' warning on every fit.
    "XGBoost": (pipeline, XGBClassifier(
        n_estimators=200,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

# One result dict per classifier; print them as a table.
experiment_2_results = do_experiment(classifiers_2, X_train, X_test, y_train, y_test)
print(pl.DataFrame(experiment_2_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.9999999985930922, 'AUC (Test)': 0.9399800509787243, 'Time taken': 1372.8703191719833}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9565080270805633, 'AUC (Test)': 0.9355253931480355, 'Time taken': 892.3324085449858}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.7778857046051666, 'AUC (Test)': 0.6436165084586147, 'Time taken': 68.84127705200808}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬─────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---          ┆ ---         ┆ ---        ┆ ---         │
│ str          ┆ f64         ┆ f64        ┆ f64         │
╞══════════════╪═════════════╪════════════╪═════════════╡
│ RandomForest ┆ 1.0         ┆ 0.93998    ┆ 1372.870319 │
│ AdaBoost     ┆ 0.956508    ┆ 0.935525   ┆ 892.332409  │
│ XGBoost      ┆ 0.777886    ┆ 0.643617   ┆ 68.841277   │
└──────────────┴─────────────┴────────────┴─────────────┘


In [38]:
# Label each result with the setting it was trained with. Results are in
# training order: RandomForest, AdaBoost, XGBoost.
for position, setting in enumerate(
    ('n_estimators=200', 'n_estimators=100', 'n_estimators=200')
):
    experiment_2_results[position]['Hyperparameters'] = setting

# Experiment 3: 3x the default `n_estimators`

In [39]:
# Same setup as the baseline with n_estimators tripled for each ensemble.
# Each entry maps a display name to (preprocessing pipeline, estimator).
classifiers_3: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline, RandomForestClassifier(
        n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline, AdaBoostClassifier(
        n_estimators=150, random_state=RANDOM_STATE,
    )),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and emits a 'Parameters: { "use_label_encoder" } are not
    # used.' warning on every fit.
    "XGBoost": (pipeline, XGBClassifier(
        n_estimators=300,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

# One result dict per classifier; print them as a table.
experiment_3_results = do_experiment(classifiers_3, X_train, X_test, y_train, y_test)
print(pl.DataFrame(experiment_3_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.9999999993075567, 'AUC (Test)': 0.9409636945899534, 'Time taken': 2014.3922920830082}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9585582027216163, 'AUC (Test)': 0.9390343086304953, 'Time taken': 1323.5134536120167}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.7948593476585194, 'AUC (Test)': 0.643752862629213, 'Time taken': 86.87030696801958}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬─────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---          ┆ ---         ┆ ---        ┆ ---         │
│ str          ┆ f64         ┆ f64        ┆ f64         │
╞══════════════╪═════════════╪════════════╪═════════════╡
│ RandomForest ┆ 1.0         ┆ 0.940964   ┆ 2014.392292 │
│ AdaBoost     ┆ 0.958558    ┆ 0.939034   ┆ 1323.513454 │
│ XGBoost      ┆ 0.794859    ┆ 0.643753   ┆ 86.870307   │
└──────────────┴─────────────┴────────────┴─────────────┘


In [40]:
# Label each result with the setting it was trained with. Results are in
# training order: RandomForest, AdaBoost, XGBoost.
for position, setting in enumerate(
    ('n_estimators=300', 'n_estimators=150', 'n_estimators=300')
):
    experiment_3_results[position]['Hyperparameters'] = setting

# Experiment 4: 4x the default `n_estimators`

In [41]:
# Same setup as the baseline with n_estimators quadrupled for each ensemble.
# Each entry maps a display name to (preprocessing pipeline, estimator).
classifiers_4: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline, RandomForestClassifier(
        n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline, AdaBoostClassifier(
        n_estimators=200, random_state=RANDOM_STATE,
    )),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and emits a 'Parameters: { "use_label_encoder" } are not
    # used.' warning on every fit.
    "XGBoost": (pipeline, XGBClassifier(
        n_estimators=400,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

# One result dict per classifier; print them as a table.
experiment_4_results = do_experiment(classifiers_4, X_train, X_test, y_train, y_test)
print(pl.DataFrame(experiment_4_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.9999999994103219, 'AUC (Test)': 0.9401421187825566, 'Time taken': 2643.886932716996}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9601530209828827, 'AUC (Test)': 0.9419085093280223, 'Time taken': 1754.381344435009}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.8065564728797541, 'AUC (Test)': 0.649300027922546, 'Time taken': 101.1779289249971}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬─────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---          ┆ ---         ┆ ---        ┆ ---         │
│ str          ┆ f64         ┆ f64        ┆ f64         │
╞══════════════╪═════════════╪════════════╪═════════════╡
│ RandomForest ┆ 1.0         ┆ 0.940142   ┆ 2643.886933 │
│ AdaBoost     ┆ 0.960153    ┆ 0.941909   ┆ 1754.381344 │
│ XGBoost      ┆ 0.806556    ┆ 0.6493     ┆ 101.177929  │
└──────────────┴─────────────┴────────────┴─────────────┘


In [42]:
# Label each result with the setting it was trained with. Results are in
# training order: RandomForest, AdaBoost, XGBoost.
for position, setting in enumerate(
    ('n_estimators=400', 'n_estimators=200', 'n_estimators=400')
):
    experiment_4_results[position]['Hyperparameters'] = setting

# Experiment 5: 5x the default `n_estimators`

In [43]:
# Same setup as the baseline with n_estimators at five times the baseline value.
# Each entry maps a display name to (preprocessing pipeline, estimator).
classifiers_5: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "RandomForest": (pipeline, RandomForestClassifier(
        n_estimators=500, random_state=RANDOM_STATE, n_jobs=-1,
    )),
    "AdaBoost": (pipeline, AdaBoostClassifier(
        n_estimators=250, random_state=RANDOM_STATE,
    )),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and emits a 'Parameters: { "use_label_encoder" } are not
    # used.' warning on every fit.
    "XGBoost": (pipeline, XGBClassifier(
        n_estimators=500,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

# One result dict per classifier; print them as a table.
experiment_5_results = do_experiment(classifiers_5, X_train, X_test, y_train, y_test)
print(pl.DataFrame(experiment_5_results))

Training RandomForest...


  return fit_method(estimator, *args, **kwargs)


{'Classifier': 'RandomForest', 'AUC (Train)': 0.9999999994152157, 'AUC (Test)': 0.9409342624981722, 'Time taken': 3293.1732050589926}

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9612093387987344, 'AUC (Test)': 0.9437411647907689, 'Time taken': 2183.7952361149946}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.8148056813699949, 'AUC (Test)': 0.6542195440572713, 'Time taken': 115.42064757301705}

shape: (3, 4)
┌──────────────┬─────────────┬────────────┬─────────────┐
│ Classifier   ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---          ┆ ---         ┆ ---        ┆ ---         │
│ str          ┆ f64         ┆ f64        ┆ f64         │
╞══════════════╪═════════════╪════════════╪═════════════╡
│ RandomForest ┆ 1.0         ┆ 0.940934   ┆ 3293.173205 │
│ AdaBoost     ┆ 0.961209    ┆ 0.943741   ┆ 2183.795236 │
│ XGBoost      ┆ 0.814806    ┆ 0.65422    ┆ 115.420648  │
└──────────────┴─────────────┴────────────┴─────────────┘


In [44]:
# Label each result with the setting it was trained with. Results are in
# training order: RandomForest, AdaBoost, XGBoost.
for position, setting in enumerate(
    ('n_estimators=500', 'n_estimators=250', 'n_estimators=500')
):
    experiment_5_results[position]['Hyperparameters'] = setting

# Experiment 6: 2x the previous `n_estimators` (AdaBoost and XGBoost only)

In [45]:
# AdaBoost and XGBoost only, with n_estimators doubled relative to experiment 5.
# Each entry maps a display name to (preprocessing pipeline, estimator).
classifiers_6: dict[str, tuple[Pipeline, BaseEstimator]] = {
    "AdaBoost": (pipeline, AdaBoostClassifier(
        n_estimators=500, random_state=RANDOM_STATE,
    )),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and emits a 'Parameters: { "use_label_encoder" } are not
    # used.' warning on every fit.
    "XGBoost": (pipeline, XGBClassifier(
        n_estimators=1000,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )),
}

# One result dict per classifier; print them as a table.
experiment_6_results = do_experiment(classifiers_6, X_train, X_test, y_train, y_test)
print(pl.DataFrame(experiment_6_results))

Training AdaBoost...


  y = column_or_1d(y, warn=True)


{'Classifier': 'AdaBoost', 'AUC (Train)': 0.9650768878802378, 'AUC (Test)': 0.9485021046117654, 'Time taken': 4375.039142928988}

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Classifier': 'XGBoost', 'AUC (Train)': 0.8535195136763571, 'AUC (Test)': 0.6667997286983698, 'Time taken': 226.85871689300984}

shape: (2, 4)
┌────────────┬─────────────┬────────────┬─────────────┐
│ Classifier ┆ AUC (Train) ┆ AUC (Test) ┆ Time taken  │
│ ---        ┆ ---         ┆ ---        ┆ ---         │
│ str        ┆ f64         ┆ f64        ┆ f64         │
╞════════════╪═════════════╪════════════╪═════════════╡
│ AdaBoost   ┆ 0.965077    ┆ 0.948502   ┆ 4375.039143 │
│ XGBoost    ┆ 0.85352     ┆ 0.6668     ┆ 226.858717  │
└────────────┴─────────────┴────────────┴─────────────┘


In [46]:
# Label each result with the setting it was trained with. Results are in
# training order: AdaBoost, XGBoost.
for position, setting in enumerate(('n_estimators=500', 'n_estimators=1000')):
    experiment_6_results[position]['Hyperparameters'] = setting

In [254]:
# Stamp every result dict with the 1-based number of the experiment it came from.
all_experiment_results = (
    experiment_1_results,
    experiment_2_results,
    experiment_3_results,
    experiment_4_results,
    experiment_5_results,
    experiment_6_results,
)
for experiment_number, experiment_results in enumerate(all_experiment_results, start=1):
    for result in experiment_results:
        result['Experiment'] = experiment_number

# Save experiments

In [257]:
# Every column fed to the models: the ordinal-encoded ones first, then the
# pass-through ones.
used_columns = [*ordinal_columns, *no_op_columns]

In [258]:
# Stack the six result tables, moving the 'Experiment' column to the front of each.
experiments = pl.concat([
    pl.DataFrame(results).select('Experiment', pl.all().exclude('Experiment'))
    for results in (
        experiment_1_results,
        experiment_2_results,
        experiment_3_results,
        experiment_4_results,
        experiment_5_results,
        experiment_6_results,
    )
])
# Record, as constant columns, which input columns and which encoder settings
# produced these results.
experiments = experiments.with_columns(
    pl.all(),
    used_columns=pl.lit(", ".join(used_columns)),
    preprocessing=pl.lit("min_frequency=0.000008, unknown_value=-1, handle_unknown='use_encoded_value', dtype=np.int16"),
)
experiments

Experiment,Classifier,AUC (Train),AUC (Test),Time taken,Hyperparameters,used_columns,preprocessing
i64,str,f64,f64,f64,str,str,str
1,"""DecisionTree""",1.0,0.699617,223.73073,"""default""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
1,"""RandomForest""",1.0,0.934959,710.227396,"""n_estimators=100""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
1,"""AdaBoost""",0.948868,0.927063,458.96459,"""n_estimators=50""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
1,"""XGBoost""",0.760881,0.62342,48.431492,"""n_estimators=100""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
2,"""RandomForest""",1.0,0.93998,1372.870319,"""n_estimators=200""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
…,…,…,…,…,…,…,…
5,"""RandomForest""",1.0,0.940934,3293.173205,"""n_estimators=500""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
5,"""AdaBoost""",0.961209,0.943741,2183.795236,"""n_estimators=250""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
5,"""XGBoost""",0.814806,0.65422,115.420648,"""n_estimators=500""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"
6,"""AdaBoost""",0.965077,0.948502,4375.039143,"""n_estimators=500""","""app, device, os, channel, ip, …","""min_frequency=0.000008, unknow…"


In [259]:
from pathlib import Path

# Destination CSV, relative to the current working directory.
experiments_fp = Path("experiments/05_extended_columns.csv")

# Create the output folder if it is missing, then write the combined table.
output_dir = experiments_fp.parent
output_dir.mkdir(parents=True, exist_ok=True)
experiments.write_csv(experiments_fp)

# Feature importance for AdaBoost250, AdaBoost500, RandomForest400 & RandomForest500

In [None]:
def _importance_frame(estimator) -> pl.DataFrame:
    """Pair an estimator's ``feature_importances_`` with its ``feature_names_in_``."""
    return pl.DataFrame({
        "importance": estimator.feature_importances_,
        "feature": estimator.feature_names_in_,
    })


# The estimator is the second element of each (pipeline, estimator) tuple.
# NOTE(review): assumes do_experiment fitted these estimator objects in place,
# so the fitted attributes exist; confirm against src.utils.
ada250_df = _importance_frame(classifiers_5['AdaBoost'][1])
ada500_df = _importance_frame(classifiers_6['AdaBoost'][1])
rf400_df = _importance_frame(classifiers_4['RandomForest'][1])
rf500_df = _importance_frame(classifiers_5['RandomForest'][1])

# Tag each table with its model label and stack them for a grouped bar chart.
labelled_frames = (
    ('RandomForest500', rf500_df),
    ('RandomForest400', rf400_df),
    ('AdaBoost500', ada500_df),
    ('AdaBoost250', ada250_df),
)
combined_last_two_experiments_df = pl.concat([
    frame.with_columns(pl.lit(label).alias('classifier'))
    for label, frame in labelled_frames
])
# One bar per (feature, classifier); yOffset places the classifiers side by side.
combined_last_two_experiments_df.plot.bar(x="importance:Q", y="feature:N", color='classifier:N', yOffset='classifier:N').properties(width=700, height=500)