# Libraries

In [2]:
import os
import sys
from statistics import mean
import glob
import chardet
from IPython.core import display as ICD
from tqdm.notebook import tqdm

# visualisation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['savefig.facecolor'] = "0.8"
plt.rcParams.update({'figure.figsize': (15, 5), 'figure.dpi': 120})
plt.style.use('fivethirtyeight')

# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report

# from pytorch_tabular.utils import get_balanced_sampler, get_class_weighted_cross_entropy

# models
# from lazypredict.Supervised import LazyClassifier
# import tensorflow as tf
# from tensorflow import keras
# from pytorch_tabular import TabularModel
# from pytorch_tabular.models import CategoryEmbeddingModelConfig
# from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

# from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

# pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline

# performance
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_auc_score, plot_roc_curve, roc_curve
from scikitplot.estimators import plot_learning_curve
from sklearn.model_selection import learning_curve

# autoML
# from autosklearn.classification import AutoSklearnClassifier

# explainability
# from shap import summary_plot, TreeExplainer

# fine-tuning
from sklearn.model_selection import GridSearchCV

# model zxport
import joblib

# ignore warnings
# from sklearn.utils.testing import ignore_warnings
import warnings
warnings.filterwarnings('ignore')

# Data fetching

In [28]:
def data_fetch_parse(path: str, all_files=False):
    # Get CSV files list from a folder
    csv_files = glob.glob(path + "/*.csv")
    csv_files_name = [file for file in os.listdir(
        path) if file.endswith(".csv")]

    # Read each CSV file into DataFrame
    # This creates a list of dataframes
    df_list = {}
    for file, file_name in zip(csv_files, csv_files_name):
        # Look at the first ten thousand bytes to guess the character encoding with confidence interval
        with open(file, 'rb') as rawdata:
            encod_type = chardet.detect(rawdata.read(10000))["encoding"]
            # parse data with given file path
            df = pd.read_csv(file, sep=None, delim_whitespace=None,
                             encoding=encod_type, engine="python")
            if set(["SK_ID_CURR"]).issubset(df.columns):
                df = df.drop("SK_ID_CURR", axis=1)
                df_list[file_name] = df
            else:
                df_list[file_name] = df
                
    # # define train, test parsed named csv files
    TRAIN, TEST = df_list["application_train.csv"], df_list["application_test.csv"]

    return TRAIN, TEST
    # return df_list



In [30]:
local_path = "../data/csv/train_test"
TRAIN, TEST = data_fetch_parse(local_path)

# Model building

## Preprocessing

In [31]:
def preprocessing(data: pd.DataFrame, n_corr_fts: int):
    """
    Function for data preprocessing

    Parameters
    ----------
    data: pd.DataFrame
        Pandas dataframe containing data variables

    Returns
    -------
    X_train: np.array
    y_train: np.array
    X_test: np.array
    y_test: np.array
    """
    df = data.copy()  # define dataset copy 

    corr_fts = data.corr()[["TARGET"]].sort_values("TARGET", ascending=False).head(n_corr_fts).index  # retrieve nine most target correlated features
    df = df[corr_fts]

    df = df.drop_duplicates()  # drop duplicates
    df = df.dropna()  # drop null values
    # df.drop(["FLAG_DOCUMENT_3", "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_LIVE_CITY"], axis=1, inplace=True)  # drop co-correlated features 

    """Select features with "days" named headers to convert them into years"""
    for col in df.columns:
        if "days" in col.lower():
            df[col] = df[col].abs()
            df[col] = df[col].div(365)
            df[col] = df[col].round(decimals=0)
            df[col] = pd.to_numeric(df[col], downcast="integer")

    X = df.drop("TARGET", axis=1)
    y = df["TARGET"]

    cat_col_names = list(["REGION_RATING_CLIENT", "REG_CITY_NOT_WORK_CITY", "FLAG_EMP_PHONE"])  # define categorical features name list
    num_col_names = list(X.drop(cat_col_names, axis=1).columns)  # define discrete numerical features name list

    X = StandardScaler().fit_transform(X[num_col_names])  # standard scale numerical features 

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)  # train / test split
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)  # Over-sampling on minority class because of unbalancing 

    return X_train, y_train, X_test, y_test
    

In [33]:
X_train, y_train, X_test, y_test = preprocessing(TRAIN, n_corr_fts=10)

## ML Models | Sklearn

In [34]:
def classification_models(models_list: list, X_train, y_train, X_test, y_test, only_models=False):
    """
    Function to instantiate a model or a series of models in a given list, 

    Parameters
    ----------
    models_list: list
        List containing a model or series of models to instantiate
    X_train
    y_train
    X_test
    y_test

    Returns
    Plot with models metrics, and a dictionary containing trained models.
    """
    classifiers = models_list
    # names = ["RBF SVM", "Random Forest", "Extremely Randomized Trees", "XGBoost"]
    names = [clf.__class__.__name__ for clf in models_list]

    monit_ls = []
    models_trained = {}

    for name, clf in zip(names, classifiers):
        model = clf.fit(X_train, y_train)
        models_trained[name] = model
        y_predict = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_predict)
        recall = recall_score(y_test, y_predict)
        monit_ls.append([name, accuracy, recall])

    if not only_models:
        monit_df = pd.DataFrame(monit_ls, columns=["Model", "Accuracy", "Recall"])
        monit_df.style.background_gradient(cmap='Greens')

        ICD.display(monit_df)

    else:
        return models_trained


In [23]:
models = [RandomForestClassifier(), XGBClassifier(), ExtraTreesClassifier(), LogisticRegression()]

classification_models(models_list=models, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

Unnamed: 0,Model,Accuracy,Recall
0,RandomForestClassifier,0.931202,0.001943
1,XGBClassifier,0.932937,0.001295
2,ExtraTreesClassifier,0.924478,0.009715
3,LogisticRegression,0.591854,0.579016


In [35]:
models_trained_dict = classification_models(models_list=models, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, only_models=True)

In [13]:
class performance_plots():

    # Learning curves
    def learning_curves(model, features, target):
        train_sizes, train_scores, test_scores = learning_curve(estimator=model,
                                                                X=features,
                                                                y=target,
                                                                cv=None,
                                                                scoring='recall',
                                                                shuffle=True)

        # Take the mean of cross-validated train scores and validation scores
        train_scores_mean = np.mean(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)

        # Plot the learning curves!
        plt.plot(train_sizes, train_scores_mean, label='Training score')
        plt.plot(train_sizes, test_scores_mean, label='Test score')
        plt.ylabel('Recall', fontsize=14)
        plt.xlabel('Training set size', fontsize=14)
        plt.title('Learning curves', fontsize=18, y=1.03)
        plt.legend()
        
        plt.savefig("../reports/model/logistic_learning_curve.png")
        plt.show()

    # ROC_AUC curves
    def roc_auc(model, X_tst, y_tst):
        plot_roc_curve(model, X_tst, y_tst)
        plt.title('ROC Curve')
        plt.plot([0, 1], [0, 1], 'r--')
        
        plt.savefig("../reports/model/logistic_roc_auc.png")
        plt.show()

    # Confusion matrix
    def plot_confusion_matrix(y_tst, y_hat):
        rec = round(recall_score(y_tst, y_hat), 2)
        cm = confusion_matrix(y_tst, y_hat)
        sns.heatmap(cm, annot=True, fmt=".0f")
        plt.xlabel('y_pred')
        plt.ylabel('y')
        plt.title('Recall Score: {0}'.format(rec), size=20)

        plt.savefig("../reports/model/logistic_confusion_matrix.png")
        plt.show()


In [36]:
param_grid = {"penalty": ["l1", "l2", "elasticnet", "none"], "dual": [False, True], "fit_intercept": [False, True], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"], "warm_start": [False, True]}

In [37]:
clf = GridSearchCV(models_trained_dict["LogisticRegression"], param_grid, scoring="recall", verbose=4, n_jobs=-1, refit=True, return_train_score=True)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'dual': [False, True], 'fit_intercept': [False, True],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga'],
                         'warm_start': [False, True]},
             return_train_score=True, scoring='recall', verbose=4)

In [None]:
# setting pipeline
corr_fts = TRAIN.corr()[["TARGET"]].sort_values(
    "TARGET", ascending=False).head(10).index

TRAIN_SET = TRAIN[corr_fts].drop(
    ["FLAG_DOCUMENT_3", "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_LIVE_CITY"], axis=1)
TRAIN_SET.dropna(inplace=True)

X = TRAIN_SET.drop("TARGET", axis=1)
y = TRAIN_SET["TARGET"]

categorical_features = list(
    ["REGION_RATING_CLIENT", "REG_CITY_NOT_WORK_CITY", "FLAG_EMP_PHONE"])
numerical_features = list(X.drop(cat_col_names, axis=1).columns)

numeric_transformer = Pipeline(
    steps=[("standard_scaler", StandardScaler())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
    ],
    remainder="passthrough"
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# clf_pipe = imbpipeline(
#     steps=[("preprocessor", preprocessor), ("SMOTE", SMOTE()), ("classifier", GridSearchCV(LogisticRegression(), param_grid= {"penalty": ["l1", "l2", "elasticnet", "none"], "dual": [False, True], "fit_intercept": [False, True], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"], "warm_start": [False, True]}, scoring="recall", n_jobs=-1, refit=True))]
# )
clf_pipe = imbpipeline(
    steps=[("preprocessor", preprocessor), ("SMOTE", SMOTE()), ("classifier", LogisticRegression(C=1.0,
                                                                                                 class_weight=None,
                                                                                                 dual=False,
                                                                                                 fit_intercept=False,
                                                                                                 intercept_scaling=1,
                                                                                                 l1_ratio=None,
                                                                                                 max_iter=100,
                                                                                                 multi_class="auto",
                                                                                                 n_jobs=None,
                                                                                                 penalty="l1",
                                                                                                 random_state=None,
                                                                                                 solver="liblinear",
                                                                                                 tol=0.0001,
                                                                                                 verbose=0,
                                                                                                 warm_start=False))]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

clf_pipe.fit(X_train, y_train)
y_pred = clf_pipe.predict(X_test)
print(classification_report(y_test, y_pred))


In [38]:
model_clf = joblib.dump(value=clf, filename="../models/logistic_model.pkl")

## Auto-sklearn

In [6]:
corr_fts = TRAIN.corr()[["TARGET"]].sort_values("TARGET", ascending=False).head(10).index

In [7]:
TRAIN_SET = TRAIN[corr_fts].dropna()

X = TRAIN_SET.drop("TARGET", axis=1)
y = TRAIN_SET["TARGET"]

# cat_col_names = list(["REGION_RATING_CLIENT_W_CITY", "REGION_RATING_CLIENT", "FLAG_DOCUMENT_3", "REG_CITY_NOT_WORK_CITY", "FLAG_EMP_PHONE", "REG_CITY_NOT_LIVE_CITY"])
cat_col_names = list(["REGION_RATING_CLIENT_W_CITY", "REGION_RATING_CLIENT", "REG_CITY_NOT_WORK_CITY", "FLAG_EMP_PHONE", "REG_CITY_NOT_LIVE_CITY"])
num_col_names = list(X.drop(cat_col_names, axis=1).columns)

X = StandardScaler().fit_transform(X[num_col_names])
# y = StandardScaler().fit(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

In [12]:
classifier = AutoSklearnClassifier(
    time_left_for_this_task=300,
    memory_limit=None,
    include = {"feature_preprocessor": ["no_preprocessing"]
    },
    n_jobs=-1)

In [13]:
classifier.fit(X_train, y_train)



AutoSklearnClassifier(include={'feature_preprocessor': ['no_preprocessing']},
                      memory_limit=None, n_jobs=-1, per_run_time_limit=240,
                      time_left_for_this_task=300)

In [19]:
print(classifier.leaderboard())

          rank  ensemble_weight               type      cost    duration
model_id                                                                
14           1             0.44  gradient_boosting  0.214664  125.690800
15           2             0.02  gradient_boosting  0.289001   55.379998
20           3             0.54                qda  0.436373    2.840280


In [20]:
predictions = classifier.predict(X_test)
print("Recall score:", recall_score(y_test, predictions))

Accuracy score: 0.3028225163450805


## PyTorch tabular

In [10]:
corr_fts = TRAIN.corr()[["TARGET"]].sort_values("TARGET", ascending=False).head(10).index

In [13]:
cat_col_names = list(["REGION_RATING_CLIENT_W_CITY", "REGION_RATING_CLIENT", "REG_CITY_NOT_WORK_CITY", "FLAG_EMP_PHONE", "REG_CITY_NOT_LIVE_CITY", "FLAG_DOCUMENT_3"])
num_col_names = list(TRAIN[corr_fts].drop(cat_col_names, axis=1).drop("TARGET", axis=1).columns)

In [11]:
TRAIN_PYTORCH = TRAIN[corr_fts].dropna()
TRAIN_PYTORCH.isnull().sum()

TARGET                         0
DAYS_BIRTH                     0
REGION_RATING_CLIENT_W_CITY    0
REGION_RATING_CLIENT           0
DAYS_LAST_PHONE_CHANGE         0
DAYS_ID_PUBLISH                0
REG_CITY_NOT_WORK_CITY         0
FLAG_EMP_PHONE                 0
REG_CITY_NOT_LIVE_CITY         0
FLAG_DOCUMENT_3                0
dtype: int64

In [33]:
X_train, X_test = train_test_split(TRAIN_PYTORCH, test_size=.3, shuffle=True, stratify=y)
X_train = SMOTE().fit_resample(X_train, X_train["TARGET"])  

(        TARGET  DAYS_BIRTH  REGION_RATING_CLIENT_W_CITY  REGION_RATING_CLIENT  \
 0            0       -9300                            2                     2   
 1            0      -12663                            3                     3   
 2            0      -10043                            2                     2   
 3            0      -11283                            2                     2   
 4            0      -22933                            2                     2   
 ...        ...         ...                          ...                   ...   
 395755       1      -19051                            2                     2   
 395756       1       -9591                            2                     2   
 395757       1      -20055                            2                     2   
 395758       1      -10981                            2                     2   
 395759       1      -13721                            2                     2   
 
         DAYS_

In [22]:
data_config = DataConfig(
    # target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    target=["TARGET"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=None,  # index of the GPU to use, -1 means all available GPUs, None, means CPU
)
optimizer_config = OptimizerConfig()

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # number of nodes in each layer
    activation="LeakyReLU",  # activation between each layers
    learning_rate=1e-3,
    metrics=["recall", "f1"],
    metrics_params=[{"num_classes": 2}, {}]
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)


In [None]:
# weighted_loss = get_class_weighted_cross_entropy(X_train["TARGET"].values.ravel(), mu=0.1)  # class imbalance handling with smoothed class weights and weighted loss initialization 
# sampler = get_balanced_sampler(X_train["TARGET"].values.ravel())  # WeightedRandomSampler implementation using inverse frequency sampling to combat imbalance
tabular_model.fit(train=X_train)

In [46]:
result = tabular_model.predict(X_test.drop("TARGET", axis=1))

Generating Predictions...: 100%|██████████| 91/91 [00:02<00:00, 35.48it/s]


In [47]:
result

Unnamed: 0,DAYS_BIRTH,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE,DAYS_ID_PUBLISH,REG_CITY_NOT_WORK_CITY,FLAG_EMP_PHONE,REG_CITY_NOT_LIVE_CITY,FLAG_DOCUMENT_3,0_probability,1_probability,prediction
280794,-12053,2,2,-1939.0,-3569,1,1,0,1,0.727038,0.272962,0
263702,-10916,2,2,-3.0,-3534,0,1,0,1,0.703117,0.296883,0
36464,-10066,1,1,-1059.0,-2511,1,1,1,0,0.948764,0.051236,0
241741,-15064,1,1,-239.0,-4112,0,1,0,1,0.849572,0.150428,0
133587,-22026,2,2,-1237.0,-4646,0,1,0,1,0.827132,0.172868,0
...,...,...,...,...,...,...,...,...,...,...,...,...
293086,-9975,3,3,-1653.0,-2327,0,1,0,1,0.671546,0.328454,0
46232,-11631,1,1,-2040.0,-3984,1,1,0,1,0.905132,0.094868,0
109168,-15733,2,2,-1741.0,-4150,0,1,0,1,0.756485,0.243515,0
210126,-20035,2,2,-2154.0,-3571,0,1,0,0,0.811613,0.188388,0


In [48]:
conf_matrix = confusion_matrix(y_true=X_test["TARGET"], y_pred=result["prediction"])

In [49]:
conf_matrix

array([[84725,     0],
       [ 7528,     0]], dtype=int64)

## Tensorflow

Work in progress!!