# Model training

## Settings

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import helpers.settings as sts
sts.print_settings(sts)

[1m[91mBEST_ESTIMATOR_FILENAME : best_estimator_0.0.1.pkl
[1m[91mDATASET_TRAIN_FILENAME : dataset_train.parquet
[1m[91mDATASET_VALIDATION_FILENAME : dataset_validation.parquet
[1m[91mETL_VERSION : 0.0.1
[1m[91mMODEL_FILENAME : model.pkl
[1m[91mMODEL_VERSION : 0.0.1
[1m[91mPREPROCESSOR_FILENAME : preprocessor_0.0.1.pkl
[1m[91mTRAINED_BEST_ESTIMATOR_FILENAME : trained_best_estimator_0.0.1.pkl
[1m[91mcolor : <class 'helpers.settings.color'>
[1m[91mprint_settings : <function print_settings at 0x7f891b1b1c10>
[0m


## Imports

In [4]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import (
    classification_report,
    plot_precision_recall_curve,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint
import shap

## Load train dataset

In [5]:
df_train = pd.read_parquet(f"data/{sts.DATASET_TRAIN_FILENAME}")

In [6]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

## Best estimator param grid

In [7]:
estimators = [
    {
        "clf": [RandomForestClassifier()],
        "clf__bootstrap": [True, False],
        "clf__max_depth": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        "clf__max_features": ["auto", "sqrt"],
        "clf__min_samples_leaf": [1, 2, 4],
        "clf__min_samples_split": [2, 5, 10],
        "clf__n_estimators": [100, 200, 400, 500]
    },
    {
        "clf": [XGBClassifier()],
        "clf__objective": ["binary:logistic"],
        "clf__use_label_encoder": [False],
        "clf__eval_metric": ["logloss"],
        "clf__learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        "clf__min_child_weight": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "clf__gamma": [0.5, 1, 1.5, 2, 5],
        "clf__subsample": np.random.uniform(1, .7, 1),
        "clf__colsample_bytree": [ 0.3, 0.4, 0.5 , 0.7 ],
        "clf__max_depth": [3, 4, 5, 6, 7, 8],
        "clf__n_estimators": np.arange(100, 500, 10),
        
    },
    {
        "clf": [LGBMClassifier()],
        "clf__num_leaves": sp_randint(6, 50), 
        "clf__min_child_samples": sp_randint(100, 500), 
        "clf__min_child_weight": [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        "clf__subsample": np.random.uniform(0.2, 0.8, 1), 
        "clf__colsample_bytree": np.random.uniform(0.4, 0.6, 1),
        "clf__reg_alpha": [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        "clf__reg_lambda": [0, 1e-1, 1, 5, 10, 20, 50, 100],
    },  
]

## Load best estimator

In [8]:
with open(f"artifacts/{sts.BEST_ESTIMATOR_FILENAME}", "rb") as file:
    best_estimator = pickle.load(file)

In [9]:
rs = RandomizedSearchCV(
    best_estimator, 
    estimators, 
    cv=5,
    scoring="average_precision",
    n_jobs=1, 
    verbose=0, 
    n_iter=100, 
    random_state=42,
)

In [10]:
rs.fit(X_train, y_train)

2021-09-11 16:14:21.678973 INFO: Hard mode for the ModifiedColumnTransformer set to True: The initial features are going to be enforced during transformation and fit steps
2021-09-11 16:14:21.691374 INFO: Hard mode for the ModifiedColumnTransformer set to True: The initial features are going to be enforced during transformation and fit steps
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
2021-09-11 16:14:21.902516 INFO: Hard mode for the ModifiedColumnTransformer set to True: The initial features are going to be enforced during transformation and fit steps
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
2021-09-11 16:14:22.156804 INFO: Hard mode for the ModifiedColumnTransformer set to True: The initial features are going to be enforced during transformation and fit steps
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ModifiedColumnTransformer(categorical_features=['CODE_GENDER',
                                                                                              'FLAG_OWN_CAR',
                                                                                              'FLAG_OWN_REALTY',
                                                                                              'NAME_INCOME_TYPE',
                                                                                              'NAME_EDUCATION_TYPE',
                                                                                              'NAME_FAMILY_STATUS',
                                                                                              'NAME_HOUSING_TYPE',
                                                                                              'FLAG_MOBIL',
           

In [17]:
cv_results_df = pd.DataFrame(rs.cv_results_)
cv_results_df["model"] = cv_results_df["param_clf"].apply(lambda x: str(x)[:8])
cv_results_df.iloc[cv_results_df.groupby("model")["rank_test_score"].idxmin()][["param_clf","mean_fit_time","mean_test_score","rank_test_score"]].round(2)

Unnamed: 0,param_clf,mean_fit_time,mean_test_score,rank_test_score
89,LGBMClassifier(),0.44,0.82,33
43,"RandomForestClassifier(bootstrap=False, max_de...",8.01,0.9,1
90,"XGBClassifier(base_score=None, booster=None, c...",3.48,0.88,17


In [12]:
pd.DataFrame(rs.cv_results_).query("rank_test_score == 1")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__colsample_bytree,param_clf__min_child_samples,param_clf__min_child_weight,param_clf__num_leaves,param_clf__reg_alpha,...,param_clf__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
43,8.010047,0.587472,0.291033,0.01188,"RandomForestClassifier(bootstrap=False, max_de...",,,,,,...,5,{'clf': RandomForestClassifier(bootstrap=False...,0.899542,0.92702,0.848971,0.890454,0.921771,0.897552,0.027819,1


## Dump trained best estimator

In [13]:
with open(f"artifacts/{sts.TRAINED_BEST_ESTIMATOR_FILENAME}","wb") as file:
    pickle.dump(rs.best_estimator_, file)