# Import data and treat missing values

As observed in first iteration, the data is rather clean except some missing values

The fact that only 12 features included in the data means that we may skip the process of feature selection but we could consider feature engineering if needed

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from collections import Counter
from helper_func import *

In [2]:
train_path = 'G:/Github/GiveMeSomeCredits/data/cs-training.csv'
test_path = 'G:/Github/GiveMeSomeCredits/data/cs-test.csv'

In [3]:
df_train = pd.read_csv(train_path,index_col = 0).reset_index()
df_test = pd.read_csv(test_path,index_col = 0).reset_index()

In [30]:
def data_import_and_preprocess(resample = 'ROS',scale = True):
    #import
    train_path = 'G:/Github/GiveMeSomeCredits/data/cs-training.csv'
    test_path = 'G:/Github/GiveMeSomeCredits/data/cs-test.csv'
    df_train = pd.read_csv(train_path,index_col = 0).reset_index()
    df_test = pd.read_csv(test_path,index_col = 0).reset_index()
    #fill in NA
    df_train = df_train.fillna(0)
    df_test = df_test.fillna(0)
    
    y = df_train.SeriousDlqin2yrs
    X = df_train.drop(columns=['SeriousDlqin2yrs','index'])
    X_test = df_test.drop(columns=['SeriousDlqin2yrs','index'])
    
    assert resample
    if resample == 'ROS':
        X_resampled, y_resampled = RandomOverSampler(random_state=0).fit_sample(X,y)
    elif resample == "SMOTE":
        X_resampled, y_resampled = SMOTE(random_state=0).fit_sample(X,y)
    
    if scale:
        scaler = preprocessing.StandardScaler().fit(X_resampled)
        X_train = scaler.transform(X_resampled)
        X_test = scaler.transform(X_test)
    else:
        X_train = X_resampled
    
    return X_train,y_resampled,X_test

# Model training

To improve: consider over sampling with imbalance class


In [15]:
from sklearn.metrics import accuracy_score, f1_score,roc_auc_score,accuracy_score, confusion_matrix, roc_curve
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegressionCV

In [31]:
X_train,y_train,X_test = data_import_and_preprocess()

In [32]:
model = LogisticRegressionCV(Cs = [0.9,1,2,3],cv = 5,penalty =  'l2',max_iter = 1000,random_state = 0)

In [33]:
fitted_model,results = model_fit_train_score_skf(model,X_train,y_train)

In [34]:
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.714 
F1 score: 0.695 
AUC score: 0.790


Not bad, 10% improvement on AUC from the first attempt

In [36]:
df_test['Probability'] = fitted_model.predict_proba(X_test)[:,1]
columns_output = ['index','Probability']
output_df = pd.DataFrame(df_test[columns_output])
output_df = output_df.rename(columns={'index':"Id"})
save_path = 'G:/Github/GiveMeSomeCredits/output/2nd_sub.csv'
output_df.to_csv(save_path,index=False)

New models to consider:

* XGB classifier


In [40]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [51]:
clf2 = XGBClassifier(learning_rate=0.1, 
                    min_child_weight=3, 
                    colsample_bytree=0.9,
                    objective='binary:logistic')

In [52]:
fitted_model,results = model_fit_train_score_skf(clf2,X_train,y_train)

In [53]:
print(f"Accuracy: {results['Accuracy_mean']:.3f} \nF1 score: {results['F1_mean']:.3f} \nAUC score: {results['AUC_mean']:.3f}")

Accuracy: 0.789 
F1 score: 0.787 
AUC score: 0.869


In [54]:
df_test['Probability'] = fitted_model.predict_proba(X_test)[:,1]
columns_output = ['index','Probability']

output_df = pd.DataFrame(df_test[columns_output])
output_df = output_df.rename(columns={'index':"Id"})

save_path = 'G:/Github/GiveMeSomeCredits/output/3rd_sub.csv'
output_df.to_csv(save_path,index=False)

* Kaggle score: 0.8670

Still can be improved?

In [41]:
param_test1 = {'leraning_rate':[0.0001,0.001,0.01,0.1,0.2,0.3],
              'subsample':[0.2,0.5,0.9],
              'colsample_bytree':[0.5,0.7,0.9]
              }

gsearch1 = GridSearchCV(estimator=XGBClassifier(objective='binary:logistic'),
                      param_grid=param_test1,
                       scoring='roc_auc',
                       n_jobs=-1, cv=5)

gsearch1.fit(X_train, y_train)
print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

{'mean_fit_time': array([10.55895991, 13.97043834, 14.32429152, 11.28601456, 13.7222333 ,
       11.8626718 , 10.17578464, 12.28973074, 10.86274648, 10.24898829,
       11.95981212, 10.56357474,  9.76727581, 12.29592152, 11.20144038,
       11.47191801, 11.60176969, 10.57571445, 11.38794241, 13.41056046,
       12.55143075, 11.98953295, 13.76200576, 12.4142096 , 11.3346848 ,
       13.71950636, 12.78620257, 11.87124829, 13.77755036, 12.63460784,
       11.85401688, 13.92527099, 12.62583046, 12.30570478, 13.4839416 ,
       12.3617383 , 13.09707713, 15.8858139 , 14.60435519, 13.76060128,
       16.40244002, 15.2918283 , 14.69350095, 16.46596107, 15.07458487,
       13.58537321, 15.69901118, 14.83492279, 13.84560614, 15.95732155,
       14.60433898, 13.53580637, 15.58463888, 13.68929391]), 'std_fit_time': array([0.23292051, 1.77732804, 0.6902639 , 0.79537967, 0.14033206,
       1.05774693, 0.2046242 , 0.11647685, 0.06183558, 0.15647441,
       0.10987383, 0.16322649, 0.13413567, 0.211535

In [65]:
df_test['Probability'] = gsearch1.predict_proba(X_test)[:,1]
columns_output = ['index','Probability']

output_df = pd.DataFrame(df_test[columns_output])
output_df = output_df.rename(columns={'index':"Id"})

save_path = 'G:/Github/GiveMeSomeCredits/output/4th_sub.csv'
output_df.to_csv(save_path,index=False)

* Kaggle private score: 0.8671

In [None]:
param_test2 = {'leraning_rate':[0.001,0.01,0.1,0.2,0.3],
              'subsample':[0.5,0.9,1],
              'colsample_bytree':[0.5,0.7,0.9,1],
               'min_child_weight':[1,2,3,4],
               'reg_lambda':[0.5,1,2],
               'reg_alpha':[0,0.5,1]
              }

gsearch2 = GridSearchCV(estimator=XGBClassifier(objective='binary:logistic'),
                      param_grid=param_test2,
                       scoring='roc_auc',
                       n_jobs=-1, cv=5)

gsearch2.fit(X_train, y_train)
print(gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_)

In [None]:
df_test['Probability'] = gsearch2.predict_proba(X_test)[:,1]
columns_output = ['index','Probability']

output_df = pd.DataFrame(df_test[columns_output])
output_df = output_df.rename(columns={'index':"Id"})

save_path = 'G:/Github/GiveMeSomeCredits/output/5th_sub.csv'
output_df.to_csv(save_path,index=False)