In [1]:
# Switch the working directory to the parent folder. Later cells use paths
# relative to it ("notebooks/artifacts/...") and import `notebooks.config`,
# so this presumably puts the project root on the current path.
# NOTE(review): this cell is not idempotent. Each re-run moves up one more
# level, so restart the kernel instead of executing it twice.
%cd -q '../'

# %pwd

In [2]:
import pickle
# reload(hp)
from importlib import reload

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# reload(src.config);
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (MinMaxScaler, OneHotEncoder, OrdinalEncoder,
                                   RobustScaler, StandardScaler)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

from notebooks.config import *

# %matplotlib inline

## Loading intermediate files from artifacts

In [3]:
# File-name prefix shared by the three pickled artifacts loaded below.
stage_name = "4_PostFEAnalysis"


def _load_artifact(stage, artifact):
    """Unpickle one object from the ``notebooks/artifacts`` directory.

    Parameters
    ----------
    stage : str
        File-name prefix of the artifact.
    artifact : str
        File-name suffix identifying the object, e.g. ``"df_train"``.

    Returns
    -------
    object
        The object stored in ``notebooks/artifacts/{stage}_{artifact}.pkl``
        (path is relative to the current working directory).
    """
    # NOTE: pickle.load can execute arbitrary code while deserialising; only
    # load artifact files that come from a trusted source.
    with open(
        file=f"notebooks/artifacts/{stage}_{artifact}.pkl", mode="rb"
    ) as f_read:
        return pickle.load(f_read)


features_info = _load_artifact(stage_name, "features_info")
df_train = _load_artifact(stage_name, "df_train")
df_test = _load_artifact(stage_name, "df_test")

# The prefix is only needed while loading; keep it out of the namespace.
del stage_name

In [19]:
# Separate the feature columns from the target column named by LABEL.
X_train = df_train.drop(LABEL, axis=1)
X_test = df_test.drop(LABEL, axis=1)

# Every model is fit on log1p(target), so the score printed below is an RMSE
# measured in log space, not in the target's original units.
y_train = np.log1p(df_train[LABEL])
y_test = np.log1p(df_test[LABEL])

# Column-wise preprocessing driven by the feature groups in `features_info`:
# numerical columns pass through unchanged; binary, ordinal and nominal
# columns are ordinal-encoded, with categories unseen during fit mapped to -1.
ct = ColumnTransformer(
    [
        ("numerical", "passthrough", features_info["numerical"]),
        (
            "binary",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            features_info["binary"],
        ),
        (
            "ordinal",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            features_info["ordinal"],
        ),
        (
            "nominal",
            OrdinalEncoder(
                handle_unknown="use_encoded_value",
                unknown_value=-1,
                dtype=np.int16,
            ),
            features_info["nominal"],
        ),
    ],
    remainder="drop",
    verbose_feature_names_out=False,  # False because prefixes are added manually
).set_output(transform="pandas")

# Candidate regressors, all with default hyper-parameters. Stochastic models
# are seeded with RANDOM_SEED. The forest is no longer given a hand-assigned
# `estimator_`: fit() rebuilds that attribute from the constructor's
# `estimator`, so the assignment had no effect on the fitted model.
regressors = {
    "dummy_mean": DummyRegressor(strategy="mean"),
    "dummy_median": DummyRegressor(strategy="median"),
    "ridge": Ridge(random_state=RANDOM_SEED),
    "svr": SVR(),
    "knn": KNeighborsRegressor(n_jobs=-1),
    "dt": DecisionTreeRegressor(random_state=RANDOM_SEED),
    "ada": AdaBoostRegressor(random_state=RANDOM_SEED),
    "rf": RandomForestRegressor(n_jobs=-1, random_state=RANDOM_SEED),
    "xgb": xgb.XGBRegressor(random_state=RANDOM_SEED),
}

# Test-set RMSE (log space) per model name, kept so results can be compared
# or tabulated after the loop instead of only being printed.
rmse_by_model = {}

for model_name, regressor in regressors.items():
    # The final step keeps its historical name "classifier" so any lookup via
    # pipe.named_steps["classifier"] continues to work.
    pipe = Pipeline([
        ("ct", ct),
        ("classifier", regressor),
    ])

    pipe.fit(X_train, y_train)

    # Predictions stay 1-D, matching y_test; no reshape is needed for the metric.
    y_pred = pipe.predict(X_test)
    rmse_by_model[model_name] = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"{model_name}: {rmse_by_model[model_name]}")

logreg: 0.1596607105381587
svr: 0.1930583234773488
knn: 0.2046587647626333
dt: 0.20878217440215482
ada: 0.16506949571123777
rf: 0.1532684484106143
xgb: 0.15700231955766428
