In [1]:
# ! pip install shap

# ETL

## Settings

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import helpers.settings as sts
sts.print_settings(sts)

[1m[91mDATASET_TRAIN_FILENAME : dataset_train.parquet
[1m[91mDATASET_VALIDATION_FILENAME : dataset_validation.parquet
[1m[91mETL_VERSION : 0.0.1
[1m[91mMODEL_FILENAME : model_0.0.1.pkl
[1m[91mMODEL_VERSION : 0.0.1
[1m[91mPREPROCESSOR_FILENAME : preprocessor_0.0.1.pkl
[1m[91mcolor : <class 'helpers.settings.color'>
[1m[91mprint_settings : <function print_settings at 0x7febd61b1ca0>
[0m


## Imports

In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    plot_precision_recall_curve,
    precision_recall_curve,
    average_precision_score,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import shap
import warnings
warnings.filterwarnings("ignore")

##  Load dataset

In [6]:
df_train = pd.read_parquet(f"data/{sts.DATASET_TRAIN_FILENAME}")

In [7]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

## Load preprocessor

In [8]:
with open(f"data/{sts.PREPROCESSOR_FILENAME}","rb") as file:
    preprocessor = pickle.load(file)

In [9]:
estimators = [
    {
        "clf": [XGBClassifier()],
    },
    {
        "clf": [LGBMClassifier()],
    },
    {
        "clf": [SVC()],
    },
    {
        "clf": [RandomForestClassifier()],
    },
    {
        "clf": [AdaBoostClassifier()],
    },
    {
        "clf": [MLPClassifier()],
    },
]

In [10]:
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", XGBClassifier()),
    ],
)

In [11]:
gs = GridSearchCV(
    clf,
    estimators,
    scoring="average_precision",
    cv=5,
    n_jobs=1, 
    verbose=0,
)

In [12]:
gs.fit(X_train, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ModifiedColumnTransformer(categorical_features=['CODE_GENDER',
                                                                                        'FLAG_OWN_CAR',
                                                                                        'FLAG_OWN_REALTY',
                                                                                        'NAME_INCOME_TYPE',
                                                                                        'NAME_EDUCATION_TYPE',
                                                                                        'NAME_FAMILY_STATUS',
                                                                                        'NAME_HOUSING_TYPE',
                                                                                        'FLAG_MOBIL',
                                                                       

In [13]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.993318,0.009956,0.047042,0.00036,"XGBClassifier(base_score=None, booster=None, c...","{'clf': XGBClassifier(base_score=None, booster...",0.880113,0.87814,0.821364,0.869512,0.891586,0.868143,0.024425,2
1,0.473597,0.030684,0.051169,0.002884,LGBMClassifier(),{'clf': LGBMClassifier()},0.871112,0.875338,0.830932,0.889061,0.865114,0.866311,0.019366,3
2,0.911109,0.063444,0.143738,0.004942,SVC(),{'clf': SVC()},0.015911,0.022607,0.015487,0.012837,0.011918,0.015752,0.003749,6
3,1.569232,0.032929,0.089043,0.003647,RandomForestClassifier(),{'clf': RandomForestClassifier()},0.895618,0.915581,0.854639,0.883772,0.915247,0.892971,0.022665,1
4,0.831478,0.041934,0.060107,0.002801,AdaBoostClassifier(),{'clf': AdaBoostClassifier()},0.042438,0.047477,0.045694,0.075366,0.046556,0.051506,0.01205,5
5,16.489649,2.089038,0.04866,0.002642,MLPClassifier(),{'clf': MLPClassifier()},0.326658,0.319375,0.177722,0.37729,0.391507,0.31851,0.075726,4


In [14]:
y_proba = gs.predict_proba(X_train)[:,1]
y_pred = np.where(y_proba >= 0.5, 1, 0)

In [15]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30584
           1       0.96      0.96      0.96       404

    accuracy                           1.00     30988
   macro avg       0.98      0.98      0.98     30988
weighted avg       1.00      1.00      1.00     30988

