In [3]:
# Libraries import
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
pd.set_option('display.max_columns', None)
import re
import time
import gc
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn import decomposition
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, f1_score, fbeta_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import linear_model, neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV , train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

## **Getting data from Kernel output and first preprocess**

We use "Home_credit_risk_data_modeling.csv", the output of the chosen kernel (https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features), to get X_train_sample and X_test_sample.<br/>
We will only use X_train_sample for our project.

In [4]:
# Load the kernel output; the first CSV column is used as the index
df_data_mod = pd.read_csv('../input/home-risk-preprocess/Home_credit_risk_data_modeling.csv', index_col=0)
df_data_mod.info()
# Names of the columns that do not contain any missing value
columns_no_nan = [col for col in df_data_mod.columns if not df_data_mod[col].isnull().any()]
# Rows with a known TARGET form the train sample, rows without one the test sample
df_train_sample = df_data_mod.loc[df_data_mod['TARGET'].notnull()]
df_test_sample = df_data_mod.loc[~df_data_mod['TARGET'].notnull()]
# Drop the full frame and ask the garbage collector to reclaim its memory
del df_data_mod
gc.collect()

In [5]:
# Build the feature matrix X and the target vector y from the train sample
X = df_train_sample.drop(['TARGET', 'SK_ID_CURR', 'index'], axis=1).copy()
y = df_train_sample['TARGET'].copy()
# Keep SK_ID_CURR aside in case it is needed later
index_global_train_SK_ID_CURR = df_train_sample['SK_ID_CURR']
# Count the infinite values present in X
np.isinf(X).to_numpy().sum()

In [6]:
# Replace +inf / -inf by the largest / smallest finite value of their own column.
# X.where(np.isfinite(X)) blanks every non-finite entry (inf and NaN), so the
# column-wise max / min below are computed on finite values only.
finite_only = X.where(np.isfinite(X))
col_min_max = {np.inf: finite_only.max(),    # column-wise max
              -np.inf: finite_only.min()}    # column-wise min

# Clip every column to its own finite range (axis=1 aligns the bounds on the
# column names): +inf becomes the column max, -inf the column min. Finite values
# already lie inside the range and are left untouched; NaN stay NaN.
X = X.clip(lower=col_min_max[-np.inf], upper=col_min_max[np.inf], axis=1)
# Check: no infinite value should remain
np.isinf(X).values.sum()

In [7]:
# Number of missing values in each feature
X.isnull().sum()

In [9]:
# Function for features detection before imputation, numerical continue or numerical discrete
def list_features_att(df):
    '''Sort the columns of a dataframe into two lists according to their
    number of distinct values (as counted by Series.nunique()).
    --> num_list: columns with 10 distinct values or more (treated as continuous)
    --> num_dis_list: columns with fewer than 10 distinct values (treated as discrete)
    Returns the tuple (num_list, num_dis_list); column order is preserved.'''

    num_list, num_dis_list = [], []
    for name in df.columns:
        # pick the destination list from the cardinality of the column
        bucket = num_dis_list if df[name].nunique() < 10 else num_list
        bucket.append(name)
    return num_list, num_dis_list

In [10]:
# Impute missing values with a strategy that depends on the kind of numeric feature
numerical_cont_list, numerical_disc_list = list_features_att(X)
time_init = time.time()
# NOTE(review): both imputers are fitted on the whole X, i.e. before any
# train/test split is made further down in the notebook.
# continuous numeric columns --> filled with the 'mean' strategy
mean_imputer = SimpleImputer(strategy='mean')
X[numerical_cont_list] = mean_imputer.fit_transform(X[numerical_cont_list])
# discrete numeric columns --> filled with the 'most_frequent' strategy
mode_imputer = SimpleImputer(strategy='most_frequent')
X[numerical_disc_list] = mode_imputer.fit_transform(X[numerical_disc_list])
print (f'time for imputation: {time.time() - time_init:.2f}')

## **Sample our data to make ModelSearch easier**

 We have 795 features and 307507 rows! Let's reduce our data with a stratified train_test_split, keeping only 5% of the rows.<br/>
 This sample will only be used to search for a potential best model.

In [8]:
# Stratified 5% draw; only the "train" side of the split is kept
X1, _, y1, _ = train_test_split(X, y, train_size=0.05, stratify=y, random_state=4)
# Pie chart to verify that the sample is still imbalanced
figure1 = plt.figure(figsize=(10,7), facecolor='w')
plt.title('Répartion des crédits payés à temps et avec difficultés dans notre sample', fontweight='bold', pad=20)
plt.axis('off')
x = y1.value_counts()
# a slice is pulled out of the pie (0.8) when it weighs 2% of the sample or less
expl = [0.8 if count/x.sum() <= 0.02 else 0 for count in x]
plt.pie(x, labels=list(x.index), autopct="%.1f%%", pctdistance=0.8,
        explode=expl, textprops={'fontsize': 10, 'fontweight': 'bold'})
plt.legend(['Remboursés', 'Difficultés de paiement'])
plt.show()
print(f'sample features shape:{X1.shape}, sample target shape: {y1.shape}')

Finally, we have 15375 rows with the same class imbalance as our complete data.

## **Applying splitting and standardScaler to the data reduced before searching for best model**

In [9]:
time_init = time.time()
# Stratified split of the reduced sample: 80% train / 20% test
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, train_size=0.8, stratify=y1, random_state=4)
# Standardise the inputs; the scaler is fitted on the train part only
std_scaler = StandardScaler().fit(X1_train)
X1_train, X1_test = std_scaler.transform(X1_train), std_scaler.transform(X1_test)
print (f'Time for Spliting and Normalize: {time.time() - time_init:.2f}')

 ## **Best model research: tests with SMOTE, RUS for balancing our problem**

### **Train dummy classifier for having a baseline on our test data**

In [10]:
# Baseline: a classifier that always predicts the most frequent class of y1_train
dummy_const_mod = DummyClassifier(strategy='most_frequent', random_state=4)
time_init = time.time()
dummy_const_mod.fit(X1_train, y1_train)
time_1 = time.time()
y_pred_dumm_const_1 = dummy_const_mod.predict(X1_test)
time_2 = time.time()
y_pred_pro_dumm_const_1 = dummy_const_mod.predict_proba(X1_test)[:,1]
# create df for global model results
# The row is labelled after the strategy really used ('most_frequent').
# zero_division=0: a constant predictor may never predict the positive class, which
# leaves precision / F-scores undefined; report them as 0 without raising a warning.
summary_result = [{'model':'Dummy_most_frequent', 'Score AUC':roc_auc_score(y1_test, y_pred_pro_dumm_const_1),
                   'F1_score': f1_score(y1_test, y_pred_dumm_const_1, zero_division=0),
                   'F2_score': fbeta_score(y1_test, y_pred_dumm_const_1, beta=2, zero_division=0),
                   'Precision_score': precision_score(y1_test, y_pred_dumm_const_1, zero_division=0),
                   'Recall_score': recall_score(y1_test, y_pred_dumm_const_1),
                   'Train Time(s)':time_1 - time_init, 'Predict Time(s)': time_2 - time_1,'Comment': 'test data score'}]
df_summary_result1 = pd.DataFrame(summary_result)
df_summary_result1

### **Model selection with RandomSearchCV and reduced data**

#### RandomSearchCV with Logistic Regression

In [11]:
# Samplers used to rebalance the classes
smote = SMOTE(random_state=4)
rus = RandomUnderSampler(random_state=4)

# Build the imblearn pipeline and the search space.
# Be careful: the Pipeline must be the imblearn one so that the sampler is accepted
# as a step; it will not resample the data the results are evaluated on.
time_init = time.time()
scores_CV5_1 = list()
# same search space for both samplers
param_grid1 = {'classifier__C':np.logspace(1,3,100)}
for sampler in (smote, rus):
    pipe = Pipeline(steps=[('Sampling', sampler),
                           ('classifier', linear_model.LogisticRegression(random_state=4, max_iter=10000))])
    # randomized search: 10 candidates, 5-fold CV, scored on ROC AUC
    gride_mod = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid1, n_iter=10,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=4)
    gride_mod.fit(X1_train, y1_train)
    scores_CV5_1.append({'model':'Logistic_Regression',
                         'best_score AUC':gride_mod.best_score_,
                         'best parameters':gride_mod.best_params_,
                         'Best model':gride_mod.best_estimator_,
                         'Refit Time(s)':gride_mod.refit_time_,
                         'Comment':f'{sampler} train data'})

# results table for this model family
print(f'RandomSearchCV execution time(s): {time.time() - time_init}')
df_scores_CV5_1 = pd.DataFrame(scores_CV5_1)
df_scores_CV5_1

#### RandomSearchCV with KNeighborsClassifier

In [12]:
# Defining the samplers
smote = SMOTE(random_state=4)
rus = RandomUnderSampler(random_state=4)

# Create the imblearn pipeline and the search space.
# Be careful: the Pipeline must be the imblearn one so that the sampler is accepted
# as a step; it will not resample the data the results are evaluated on.
time_init = time.time()
scores_CV5_1 = list()
for sampler in [smote, rus]:
    pipe = Pipeline([('Sampling', sampler), ('classifier', neighbors.KNeighborsClassifier())])
    param_grid1 = {'classifier__n_neighbors':range(2,300,1)}
    # performing the randomized search CV (10 candidates, 5 folds, ROC AUC)
    gride_mod = RandomizedSearchCV(pipe, param_distributions = param_grid1, cv = 5, scoring='roc_auc', n_iter=10, n_jobs=-1, random_state=4)
    gride_mod.fit(X1_train, y1_train)
    scores_CV5_1.append({'model':'Kneighbors classifier', 'best_score AUC':gride_mod.best_score_, 'best parameters':gride_mod.best_params_,
                       'Best model':gride_mod.best_estimator_,'Refit Time(s)':gride_mod.refit_time_, 'Comment':f'{sampler} train data'})

# Add the new rows to the global results table.
# pd.concat is used because DataFrame.append is deprecated (removed in pandas 2.0).
print(f'RandomSearchCV execution time(s): {time.time() - time_init}')
df_scores_CV5_1 = pd.concat([df_scores_CV5_1, pd.DataFrame(scores_CV5_1)], ignore_index=True)
df_scores_CV5_1

#### RandomSearchCV with RandomForest Classifier

In [13]:
# Create the imblearn pipeline and the search space.
# Be careful: the Pipeline must be the imblearn one so that the sampler is accepted
# as a step; it will not resample the data the results are evaluated on.
time_init = time.time()
scores_CV5_1 = list()
for sampler in [smote, rus]:
    pipe = Pipeline([('Sampling', sampler), ('classifier', RandomForestClassifier(random_state=4))])
    param_grid1 = {'classifier__n_estimators':range(5,70,5), 'classifier__max_depth': [5, 7, 10]}
    # performing the randomized search CV (10 candidates, 5 folds, ROC AUC)
    gride_mod = RandomizedSearchCV(pipe, param_distributions = param_grid1, cv = 5, scoring='roc_auc', n_iter=10, n_jobs=-1, random_state=4)
    gride_mod.fit(X1_train, y1_train)
    scores_CV5_1.append({'model':'RandomForest classifier', 'best_score AUC':gride_mod.best_score_, 'best parameters':gride_mod.best_params_,
                       'Best model':gride_mod.best_estimator_,'Refit Time(s)':gride_mod.refit_time_, 'Comment':f'{sampler} train data'})

# Add the new rows to the global results table.
# pd.concat is used because DataFrame.append is deprecated (removed in pandas 2.0).
print(f'RandomSearchCV execution time(s): {time.time() - time_init}')
df_scores_CV5_1 = pd.concat([df_scores_CV5_1, pd.DataFrame(scores_CV5_1)], ignore_index=True)
df_scores_CV5_1

#### RandomSearchCV with XGBoost classifier

In [14]:
# Create the imblearn pipeline and the search space.
# Be careful: the Pipeline must be the imblearn one so that the sampler is accepted
# as a step; it will not resample the data the results are evaluated on.
time_init = time.time()
scores_CV5_1 = list()
for sampler in [smote, rus]:
    # eval_metric='logloss': the target is binary, 'mlogloss' is the multiclass metric
    pipe = Pipeline([('Sampling', sampler), ('classifier', XGBClassifier(use_label_encoder=False, random_state=4, eval_metric='logloss'))])
    param_grid1 = {'classifier__n_estimators':range(5,70,5), 'classifier__max_depth': [5, 7, 10], 'classifier__learning_rate':[0.1, 0.5]}
    # performing the randomized search CV (10 candidates, 5 folds, ROC AUC)
    gride_mod = RandomizedSearchCV(pipe, param_distributions = param_grid1, cv = 5, scoring='roc_auc', n_iter=10, n_jobs=-1, random_state=4)
    gride_mod.fit(X1_train, y1_train)
    scores_CV5_1.append({'model':'XGBoost classifier', 'best_score AUC':gride_mod.best_score_, 'best parameters':gride_mod.best_params_,
                       'Best model':gride_mod.best_estimator_,'Refit Time(s)':gride_mod.refit_time_, 'Comment':f'{sampler} train data'})

# Add the new rows to the global results table.
# pd.concat is used because DataFrame.append is deprecated (removed in pandas 2.0).
print(f'RandomSearchCV execution time(s): {time.time() - time_init}')
df_scores_CV5_1 = pd.concat([df_scores_CV5_1, pd.DataFrame(scores_CV5_1)], ignore_index=True)
df_scores_CV5_1

#### RandomSearchCV with LGBM classifier

In [15]:
# Create the imblearn pipeline and the search space.
# Be careful: the Pipeline must be the imblearn one so that the sampler is accepted
# as a step; it will not resample the data the results are evaluated on.
time_init = time.time()
scores_CV5_1 = list()
for sampler in [smote, rus]:
    pipe = Pipeline([('Sampling', sampler), ('classifier', LGBMClassifier(random_state=4))])
    param_grid1 = {'classifier__n_estimators':range(5,70,5), 'classifier__max_depth': [5, 7, 10], 'classifier__learning_rate':[0.1, 0.5]}
    # performing the randomized search CV (10 candidates, 5 folds, ROC AUC)
    gride_mod = RandomizedSearchCV(pipe, param_distributions = param_grid1, cv = 5, scoring='roc_auc', n_iter=10, n_jobs=-1, random_state=4)
    gride_mod.fit(X1_train, y1_train)
    scores_CV5_1.append({'model':'LGBM classifier', 'best_score AUC':gride_mod.best_score_, 'best parameters':gride_mod.best_params_,
                       'Best model':gride_mod.best_estimator_,'Refit Time(s)':gride_mod.refit_time_, 'Comment':f'{sampler} train data'})

# Add the new rows to the global results table.
# pd.concat is used because DataFrame.append is deprecated (removed in pandas 2.0).
print(f'RandomSearchCV execution time(s): {time.time() - time_init}')
df_scores_CV5_1 = pd.concat([df_scores_CV5_1, pd.DataFrame(scores_CV5_1)], ignore_index=True)
df_scores_CV5_1

 #### Validation scores over reduced data

In [17]:
# Probabilities predicted on the held-out split, one entry per model
y_pred_prob_list = list()
# the dummy classifier comes first: it is our baseline
y_pred_prob_list.append(y_pred_pro_dumm_const_1)
# Score each best estimator found by the searches on the held-out data.
# Rows are collected in a list and added to the table in a single pd.concat
# (DataFrame.append is deprecated and copies the whole frame at every call).
summary_result = list()
for i in range(len(df_scores_CV5_1)):
    # columns are read by name rather than by position
    model = df_scores_CV5_1['Best model'].iloc[i]
    time_1 = time.time()
    y_pred_model = model.predict(X1_test)
    time_2 = time.time()
    y_pred_prob_model = model.predict_proba(X1_test)[:,1]
    # one result row per model; the train time is the refit time measured by the search
    summary_result.append({'model': df_scores_CV5_1['model'].iloc[i], 'Score AUC':roc_auc_score(y1_test, y_pred_prob_model),
                       'F1_score': f1_score(y1_test, y_pred_model), 'F2_score': fbeta_score(y1_test, y_pred_model, beta=2),
                        'Precision_score': precision_score(y1_test,y_pred_model), 'Recall_score': recall_score(y1_test, y_pred_model),
                       'Train Time(s)':df_scores_CV5_1['Refit Time(s)'].iloc[i], 'Predict Time(s)': time_2 - time_1 ,
                        'Comment': f"test data score with {df_scores_CV5_1['Comment'].iloc[i]}"})
    y_pred_prob_list.append(y_pred_prob_model)
# NOTE(review): this cell extends df_summary_result1, so running it twice duplicates the rows
df_summary_result1 = pd.concat([df_summary_result1, pd.DataFrame(summary_result)], ignore_index=True)
df_summary_result1

##### ROC and Precision/Recall curves data reduced

In [33]:
# Function to plot ROC and Precision-Recall curves
def ROC_PR_curves(test_labels, list_pred_proba, models_name):
    '''Plot the ROC and Precision/Recall curves of several models side by side.
    test_labels --> true labels to predict, 1-D (y_test for example)
    list_pred_proba --> list of 1-D predicted probabilities, one entry per model
    models_name --> model names (list, Series, ...) in the same order as list_pred_proba
    Returns nothing: the figure is displayed with plt.show().'''
    # names are taken by position, whatever the index of the object received
    names = list(models_name)
    # figure init
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,7))
    fig.suptitle('ROC / PR curves for our models (validation data)', fontweight='bold')
    # ROC reference: the diagonal of a random classifier
    ax1.plot([0,1], [0,1], 'k--')
    # PR reference: a no-skill classifier has a precision equal to the share of
    # positives (assumes the positive class is labelled 1, the default pos_label
    # of the sklearn curve functions when labels are {0, 1})
    prevalence = np.mean(np.asarray(test_labels) == 1)
    ax2.plot([0,1], [prevalence, prevalence], 'k--')
    ax1.set_title('ROC')
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax2.set_title('Precision / Recall')
    ax2.set_xlabel('Recall')
    ax2.set_ylabel('Precision')
    for position, pred_prob in enumerate(list_pred_proba):
        # raises IndexError when fewer names than probability vectors are given
        name = names[position]
        # fpr and tpr computation
        fpr, tpr, _ = roc_curve(test_labels, pred_prob)
        # precision and recall computation
        pr, rec, _ = precision_recall_curve(test_labels, pred_prob)
        # Plot ROC curve, with the AUC in the legend
        ax1.plot(fpr, tpr, label=f'{name} / AUC={roc_auc_score(test_labels, pred_prob):.3f}')
        # Plot PR curve
        ax2.plot(rec, pr, label=name)
    ax1.legend()
    # ax2.legend() optional
    plt.show()

In [32]:
# Curves of every model scored on the reduced hold-out split (y1_test); the first
# entry of y_pred_prob_list is the dummy baseline
ROC_PR_curves(y1_test, y_pred_prob_list, df_summary_result1['model'])

--> Our models perform better than the baseline (dummy) model, which always predicts the most frequent class (so it never rejects a credit here) <br/>
--> The best scores are obtained with RUS, and two models (RandomForest line 6 / LGBM line 10) seem to compete in terms of: <br>
* AUC scoring (it is our scoring method)
* Train time
* F2 score (interesting because it favours recall over precision, which is what we want for our problem)

 #### Validation scores for our two best models over all our data with RUS

In [34]:
# Release the reduced sample and its training split to lower memory usage
del X1
del y1
del X1_train
del y1_train
gc.collect()

In [35]:
# Split and scale the overall data this time
time_init = time.time()
# Stratified split: 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=4)
# keep the row index of each part, taken before they are replaced by the scaler output
X_train_index, X_test_index = X_train.index, X_test.index
# Standardise the inputs; the scaler is fitted on the train part only
std_scaler = StandardScaler().fit(X_train)
X_train, X_test = std_scaler.transform(X_train), std_scaler.transform(X_test)
print (f'Time for Spliting and Normalize: {time.time() - time_init:.2f}')

In [37]:
# Refit our two best RUS models (rows 5 and 9 of the first results table
# df_scores_CV5_1) on the overall data and score them on X_test
# defining new lists for the result rows and the probabilities
summary_result1 = list()
y_pred_prob_list1 = list()
for i in [5, 9]:
    # columns are read by name rather than by position
    model = df_scores_CV5_1['Best model'].iloc[i]
    time_init = time.time()
    # NOTE(review): this refits, in place, the estimator object stored in df_scores_CV5_1
    model.fit(X_train, y_train)
    time_1 = time.time()
    y_pred_model = model.predict(X_test)
    time_2 = time.time()
    y_pred_prob_model = model.predict_proba(X_test)[:,1]
    # filling the list with the results of each model; the train time is the
    # duration of the fit on the overall data measured just above (not the
    # refit time of the search made on the reduced sample)
    summary_result1.append({'model': df_scores_CV5_1['model'].iloc[i], 'Score AUC':roc_auc_score(y_test, y_pred_prob_model),
                       'F1_score': f1_score(y_test, y_pred_model), 'F2_score': fbeta_score(y_test, y_pred_model, beta=2),
                        'Precision_score': precision_score(y_test,y_pred_model), 'Recall_score': recall_score(y_test, y_pred_model),
                       'Train Time(s)':time_1 - time_init, 'Predict Time(s)': time_2 - time_1 , 'Comment': df_scores_CV5_1['Comment'].iloc[i]})
    y_pred_prob_list1.append(y_pred_prob_model)
df_summary_result2 = pd.DataFrame(summary_result1)
df_summary_result2

##### ROC and Precision/Recall curves

In [38]:
# Curves of the two models refitted on the overall data, scored on y_test
ROC_PR_curves(y_test, y_pred_prob_list1, df_summary_result2['model'])

--> We select LGBM for the next steps, with parameters:<br>
* n_estimators = 60
* max_depth = 7
* learning_rate = 0.1
* random_state=4 <br>

--> We will try a GridSearchCV around these parameters, with a custom metric, on this model