In [1]:
# One-time setup: uncomment to install the extra dependencies into the kernel's environment.
# %pip install shap keras tensorflow

# Model definition

## Settings

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
# Project settings module: holds the dataset/artifact filenames and version tags used below.
import helpers.settings as sts
# Echo the module's attributes so the configuration used for this run is visible in the cell output.
sts.print_settings(sts)

[1m[91mBEST_ESTIMATOR_FILENAME : best_estimator_0.0.1.pkl
[1m[91mDATASET_TRAIN_FILENAME : dataset_train.parquet
[1m[91mDATASET_VALIDATION_FILENAME : dataset_validation.parquet
[1m[91mETL_VERSION : 0.0.1
[1m[91mMODEL_FILENAME : model.pkl
[1m[91mMODEL_VERSION : 0.0.1
[1m[91mPREPROCESSOR_FILENAME : preprocessor_0.0.1.pkl
[1m[91mTRAINED_BEST_ESTIMATOR_FILENAME : trained_best_estimator_0.0.1.pkl
[1m[91mcolor : <class 'helpers.settings.color'>
[1m[91mprint_settings : <function print_settings at 0x7ffdbb9b9ee0>
[0m


## Imports

In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    plot_precision_recall_curve,
    precision_recall_curve,
    average_precision_score,
    plot_confusion_matrix,
)
import re
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import shap
import matplotlib.pyplot as plt  
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

## Load dataset

In [6]:
# Read the training split from the data directory; the filename comes from the settings module.
df_train = pd.read_parquet(
    path=f"data/{sts.DATASET_TRAIN_FILENAME}",
)

In [7]:
# Positional split: every column except the last is a feature, the last column is the target.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

## Load preprocessor

In [8]:
# Restore the serialized preprocessing transformer from the artifacts directory.
# NOTE(review): unpickling can execute arbitrary code — only load artifacts produced by this project.
with open(f"artifacts/{sts.PREPROCESSOR_FILENAME}", mode="rb") as fh:
    preprocessor = pickle.load(fh)

## Model testing

### Deep Learning

In [16]:
def create_model(input_dim=16, hidden_units=12):
    """Build and compile a one-hidden-layer binary classifier.

    The defaults reproduce the original fixed architecture, so the function
    can still be passed as ``build_fn`` and called with no arguments.

    Parameters
    ----------
    input_dim : int, default 16
        Number of features fed to the first layer. It must match the width of
        the matrix that reaches the network.
    hidden_units : int, default 12
        Number of units in the hidden ReLU layer.

    Returns
    -------
    Sequential
        Compiled model with a single sigmoid output, binary cross-entropy
        loss, the Adam optimizer and PR-curve AUC as the tracked metric.
    """
    model = Sequential(
        [
            Dense(hidden_units, input_dim=input_dim, activation='relu'),
            # A single sigmoid unit outputs the positive-class probability.
            Dense(1, activation='sigmoid'),
        ]
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[tf.keras.metrics.AUC(curve='PR')],
    )
    return model

In [17]:
# scikit-learn compatible wrapper around the Keras network so it can be the final pipeline step.
model = KerasClassifier(
    build_fn=create_model,
    epochs=6,
    batch_size=20,
    verbose=1,
)

In [18]:
# End-to-end pipeline for the neural network: preprocess -> impute -> standardize -> classify.
clf_deep_learning = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # SimpleImputer only honours fill_value when strategy="constant"; with the
        # default "mean" strategy the requested 0 was silently ignored.
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", StandardScaler()),
        ("clf", model),
    ]
)

In [19]:
# Fit every pipeline step, ending with the Keras classifier, on the training split.
clf_deep_learning.fit(X=X_train, y=y_train)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


Pipeline(steps=[('preprocessor',
                 ModifiedColumnTransformer(categorical_features=['CODE_GENDER',
                                                                 'FLAG_OWN_CAR',
                                                                 'FLAG_OWN_REALTY',
                                                                 'NAME_INCOME_TYPE',
                                                                 'NAME_EDUCATION_TYPE',
                                                                 'NAME_FAMILY_STATUS',
                                                                 'NAME_HOUSING_TYPE',
                                                                 'FLAG_MOBIL',
                                                                 'FLAG_WORK_PHONE',
                                                                 'FLAG_PHONE',
                                                                 'FLAG_EMAIL'],
                                           numeric_fea

In [20]:
# Positive-class probability (second column) for the same rows the pipeline was fitted on.
y_proba = clf_deep_learning.predict_proba(X_train)[:, 1]
# Hard labels at a 0.5 decision threshold.
y_pred = (y_proba >= 0.5).astype(int)

In [21]:
# Per-class precision / recall / F1 of the thresholded predictions against the training labels.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     30584
           1       1.00      0.05      0.09       404

    accuracy                           0.99     30988
   macro avg       0.99      0.52      0.54     30988
weighted avg       0.99      0.99      0.98     30988



### Others

In [None]:
# Seed shared by every candidate so repeated runs of the search are comparable.
RANDOM_STATE = 42

# One single-candidate grid per model family: the grid search swaps each
# classifier into the pipeline's "clf" step and cross-validates it.
estimators = [
    {"clf": [XGBClassifier(random_state=RANDOM_STATE)]},
    {"clf": [LGBMClassifier(random_state=RANDOM_STATE)]},
    # probability=True is required for SVC to expose predict_proba, which the
    # evaluation cell calls on the best estimator; without it a winning SVC
    # would raise there. It makes SVC fitting slower (internal cross-validation).
    {"clf": [SVC(probability=True, random_state=RANDOM_STATE)]},
    {"clf": [RandomForestClassifier(random_state=RANDOM_STATE)]},
    {"clf": [AdaBoostClassifier(random_state=RANDOM_STATE)]},
]

In [None]:
# Template pipeline for the grid search; the XGBClassifier in the "clf" slot is only a
# placeholder that the search replaces with each candidate.
# NOTE(review): unlike the deep-learning pipeline there is no imputer/scaler step here —
# confirm the preprocessor output is NaN-free and suitably scaled for SVC.
clf = Pipeline(steps=[("preprocessor", preprocessor), ("clf", XGBClassifier())])

In [None]:
# Compare the candidate classifiers with 5-fold cross-validation, ranked by average precision.
gs = GridSearchCV(
    estimator=clf, param_grid=estimators,
    scoring="average_precision", cv=5,
    n_jobs=1, verbose=0,
)

In [None]:
# Run the cross-validated comparison on the training split.
gs.fit(X=X_train, y=y_train)

In [None]:
# Tabulate the search results.
cv_results = pd.DataFrame(gs.cv_results_)
# Index each row by the first 7 characters of its estimator's string form.
cv_results = cv_results.set_index(
    cv_results["param_clf"].map(lambda est: str(est)[:7])
)
cv_results

In [None]:
# Keep only the per-fold score columns and relabel each one with the digits in its name.
cv_results_splits = (
    cv_results
    .filter(like="split")
    .rename(columns=lambda col: re.sub("[^0-9]", "", col))
)
# Transpose so that folds run along the x-axis and each estimator is one line.
cv_results_splits.transpose().plot()

In [None]:
# Positive-class probability from the best estimator, scored on the same rows used for the search.
y_proba = gs.best_estimator_.predict_proba(X_train)[:, 1]
# Hard labels at a 0.5 decision threshold.
y_pred = (y_proba >= 0.5).astype(int)

In [None]:
# Per-class precision / recall / F1 of the thresholded predictions against the training labels.
print(classification_report(y_true=y_train, y_pred=y_pred))

In [None]:
# Confusion matrix of the best estimator's predictions on the training split.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn releases — confirm the pinned version.
plot_confusion_matrix(estimator=gs.best_estimator_, X=X_train, y_true=y_train)

## Dump best estimator

In [None]:
# Serialize the best estimator found by the grid search into the artifacts directory.
with open(f"artifacts/{sts.BEST_ESTIMATOR_FILENAME}", mode="wb") as fh:
    pickle.dump(gs.best_estimator_, fh)