## Importing Libraries

In [31]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import os
import pickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import log_loss
plt.rc("font", size=14)
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

import warnings
warnings.simplefilter(action='ignore')
pd.set_option('display.max_columns', None)

In [6]:
# Reading data

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/submission.csv')

train.head()

Unnamed: 0,ID,Loan Amount,Funded Amount,Funded Amount Investor,Term,Batch Enrolled,Interest Rate,Grade,Sub Grade,Employment Duration,Home Ownership,Verification Status,Payment Plan,Loan Title,Debit to Income,Delinquency - two years,Inquires - six months,Open Account,Public Record,Revolving Balance,Revolving Utilities,Total Accounts,Initial List Status,Total Received Interest,Total Received Late Fee,Recoveries,Collection Recovery Fee,Collection 12 months Medical,Application Type,Last week Pay,Accounts Delinquent,Total Collection Amount,Total Current Balance,Total Revolving Credit Limit,Loan Status
0,65087372,10000,32236,12329.36286,59,BAT2522922,11.135007,B,C4,MORTGAGE,176346.6267,Not Verified,n,Debt Consolidation,16.284758,1,0,13,0,24246,74.932551,7,w,2929.646315,0.102055,2.498291,0.793724,0,INDIVIDUAL,49,0,31,311301,6619,0
1,1450153,3609,11940,12191.99692,59,BAT1586599,12.237563,C,D3,RENT,39833.921,Source Verified,n,Debt consolidation,15.412409,0,0,12,0,812,78.297186,13,f,772.769385,0.036181,2.377215,0.974821,0,INDIVIDUAL,109,0,53,182610,20885,0
2,1969101,28276,9311,21603.22455,59,BAT2136391,12.545884,F,D4,MORTGAGE,91506.69105,Source Verified,n,Debt Consolidation,28.137619,0,0,14,0,1843,2.07304,20,w,863.324396,18.77866,4.316277,1.020075,0,INDIVIDUAL,66,0,34,89801,26155,0
3,6651430,11170,6954,17877.15585,59,BAT2428731,16.731201,C,C3,MORTGAGE,108286.5759,Source Verified,n,Debt consolidation,18.04373,1,0,7,0,13819,67.467951,12,w,288.173196,0.044131,0.10702,0.749971,0,INDIVIDUAL,39,0,40,9189,60214,0
4,14354669,16890,13226,13539.92667,59,BAT5341619,15.0083,C,D4,MORTGAGE,44234.82545,Source Verified,n,Credit card refinancing,17.209886,1,3,13,1,1544,85.250761,22,w,129.239553,19.306646,1294.818751,0.368953,0,INDIVIDUAL,18,0,430,126029,22579,0


In [28]:
# Utility Functions

# Save predictions as csv file for submission
def save_submission(predictions, filename=None):
    submission['Loan Status'] = predictions
    if not filename:
        n = len(os.listdir('submissions'))
        filename = f"submissions/submission{n}.csv"
        
    submission.to_csv(filename, index=False)

# Save model as pickle file
def save_model(model):
    n = len(os.listdir('models'))
    file_name = f"models/model{n}.pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
        
# K fold cross validation
def run_kfold(model, X, y, test, k=5):
    oof_preds = np.zeros(len(train))
    test_preds = np.zeros(len(test))
    folds = StratifiedKFold(n_splits=k)
    # ------------------------------------------#
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print(f'\n------------- Fold {fold_ + 1} -------------')
        # Training Set
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        # Validation Set
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        # Test Set
        X_test = test
        # Train
        model.fit(X_trn, y_trn)
        # Predict
        preds_val = model.predict_proba(X_val)[:, 1]
        preds_test = model.predict_proba(X_test)[:, 1]
        # Evaluate
        fold_score = log_loss(y_val, preds_val)
        print(f'\nLog loss for validation set is {fold_score}')
        # predictions
        oof_preds[val_idx] = preds_val
        test_preds += preds_test/k
    # OOF score
    oofs_score = log_loss(y, oof_preds)
    print(f'\n\nLog loss for oof_preds is {oofs_score}')
    return oof_preds, test_preds

## Data Preprocessing

In [8]:
# feature/column names

train.columns = train.columns.str.lower().str.replace(' ','_').str.replace('-','_')
test.columns = test.columns.str.lower().str.replace(' ','_').str.replace('-','_')

In [9]:
# Drop Arbitrary columns
train.drop(['accounts_delinquent', 'payment_plan'], axis=1, inplace=True)

In [10]:
#------- Variable Separation ---------#

cat_features = [feat for feat in train.columns if feat not in ['id', 'loan_status'] and train[feat].dtype=='O']
num_features = [feat for feat in train.columns if feat not in ['id', 'loan_status'] and train[feat].dtype!='O']

features = cat_features+num_features
target = 'loan_status'

discrete_features = ['term', 'delinquency___two_years', 'inquires___six_months', 'open_account', 'public_record', 'total_accounts', 'collection_12_months_medical', 'last_week_pay']
continuous_features = [feat for feat in num_features if feat not in discrete_features]

## Feature Engineering

In [11]:
# Feature Scaling

scaler = StandardScaler()

train[continuous_features] = scaler.fit_transform(train[continuous_features])
test[continuous_features] = scaler.transform(test[continuous_features])

scaler2 = MinMaxScaler()

train[discrete_features] = scaler2.fit_transform(train[discrete_features])
test[discrete_features] = scaler2.transform(test[discrete_features])

In [12]:
# Categorical encoding

grade_dict = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7}
train.grade = train.grade.replace(grade_dict)
test.grade = test.grade.replace(grade_dict)


sub_grade_dict ={'A1': 1.1, 'A2': 1.2, 'A3': 1.3, 'A4': 1.4, 'A5': 1.5,
                 'B1': 2.1, 'B2': 2.2, 'B3': 2.3, 'B4': 2.4, 'B5': 2.5, 
                 'C1': 3.1, 'C2': 3.2, 'C3': 3.3, 'C4': 3.4, 'C5': 3.5, 
                 'D1': 4.1, 'D2': 4.2, 'D3': 4.3, 'D4': 4.4, 'D5': 4.5,
                 'E1': 5.1, 'E2': 5.2, 'E3': 5.3, 'E4': 5.4, 'E5': 5.5,
                 'F1': 6.1, 'F2': 6.2, 'F3': 6.3, 'F4': 6.4, 'F5': 6.5,
                 'G1': 7.1, 'G2': 7.2, 'G3': 7.3, 'G4': 7.4, 'G5': 7.5}
train.sub_grade = train.sub_grade.replace(sub_grade_dict)
test.sub_grade = test.sub_grade.replace(sub_grade_dict)


home_dict = {'OWN': 1, 'MORTGAGE': 2, 'RENT': 3}
train.employment_duration = train.employment_duration.replace(home_dict)
test.employment_duration = test.employment_duration.replace(home_dict)


status_dict = {'Not Verified':0, 'Source Verified':1, 'Verified':1}
train.verification_status = train.verification_status.replace(status_dict)
test.verification_status = test.verification_status.replace(status_dict)


train.loan_title = train.loan_title.str.lower()
test.loan_title = test.loan_title.str.lower()
title_dict = {'credit card loan': 'credit card',
              'credit card pay off': 'credit card',
              'credit card refi': 'credit card',
              'credit card refinance loan': 'credit card',
              'credit payoff': 'credit card',
              'credit consolidation': 'credit card',
              'credit card consolidation': 'credit card',
              'credit card payoff': 'credit card',
              'credit card debt': 'credit card',
              'credit card': 'credit card',
              'credit card refinancing': 'credit card',
              'credit cards': 'credit card',
              'credit card refinance': 'credit card',
              'credit': 'credit card',
              'credit card paydown': 'credit card',
              'credit pay off': 'credit card',
              'credit loan': 'credit card',
              'loan consolidation': 'consolidation',
              'dept consolidation': 'consolidation',
              'cc consolidation': 'consolidation',
              'consolidation loan': 'consolidation',
              'debt consolidation': 'consolidation',
              'consolidation': 'consolidation',
              'debt consolidation loan': 'consolidation',
              'consolidate': 'consolidation',
              'bill consolidation': 'consolidation',
              'card consolidation': 'consolidation',
              'debt consolidation 2013': 'consolidation',
              'consolidated': 'consolidation',
              "home improvement": "home",
              "home buying": "home",
              "major purchase": "purchase",
              "medical expenses": "medical",
              "moving and relocation": "relocation",
              "car financing": "car",
              "personal loan": "personal",
              "debt loan": "debt",
              "my loan": "personal",
              "cards": "credit card",
              }
train.loan_title = train.loan_title.replace(title_dict)
test.loan_title = test.loan_title.replace(title_dict)


train.initial_list_status = train.initial_list_status.replace({'w':0, 'f':1})
test.initial_list_status = test.initial_list_status.replace({'w':0, 'f':1})


train.application_type = train.application_type.replace({'INDIVIDUAL':1, 'JOINT':0})
test.application_type = test.application_type.replace({'INDIVIDUAL':1, 'JOINT':0})

In [13]:
# One hot encoding of cat features

enc = OneHotEncoder()

X = enc.fit_transform(train[features])
y = train[target]
test = enc.transform(test[features])

X.head()

Unnamed: 0,grade,sub_grade,employment_duration,verification_status,initial_list_status,application_type,loan_amount,funded_amount,funded_amount_investor,term,interest_rate,home_ownership,debit_to_income,delinquency___two_years,inquires___six_months,open_account,public_record,revolving_balance,revolving_utilities,total_accounts,total_received_interest,total_received_late_fee,recoveries,collection_recovery_fee,collection_12_months_medical,last_week_pay,total_collection_amount,total_current_balance,total_revolving_credit_limit,batch_enrolled_BAT2522922,batch_enrolled_BAT1586599,batch_enrolled_BAT2136391,batch_enrolled_BAT2428731,batch_enrolled_BAT5341619,batch_enrolled_BAT4694572,batch_enrolled_BAT4808022,batch_enrolled_BAT2558388,batch_enrolled_BAT2078974,batch_enrolled_BAT2252229,batch_enrolled_BAT2333412,batch_enrolled_BAT5849876,batch_enrolled_BAT2833642,batch_enrolled_BAT2803411,batch_enrolled_BAT5525466,batch_enrolled_BAT5714674,batch_enrolled_BAT2003848,batch_enrolled_BAT4722912,batch_enrolled_BAT3873588,batch_enrolled_BAT1780517,batch_enrolled_BAT4271519,batch_enrolled_BAT5811547,batch_enrolled_BAT1184694,batch_enrolled_BAT4136152,batch_enrolled_BAT3193689,batch_enrolled_BAT1467036,batch_enrolled_BAT2575549,batch_enrolled_BAT4351734,batch_enrolled_BAT1104812,batch_enrolled_BAT5924421,batch_enrolled_BAT1930365,batch_enrolled_BAT1766061,batch_enrolled_BAT5489674,batch_enrolled_BAT3865626,batch_enrolled_BAT5629144,batch_enrolled_BAT5547201,batch_enrolled_BAT224923,batch_enrolled_BAT3726927,batch_enrolled_BAT3461431,batch_enrolled_BAT1761981,batch_enrolled_BAT1135695,loan_title_consolidation,loan_title_credit card,loan_title_home,loan_title_green loan,loan_title_other,loan_title_relocation,loan_title_medical,loan_title_refinance,loan_title_lending club,loan_title_purchase,loan_title_vacation,loan_title_business,loan_title_personal,loan_title_cc refi,loan_title_loan 1,loan_title_car,loan_title_debt,loan_title_freedom,loan_title_get out of debt,loan_title_bathroom,loan_title_refi,loan_title_house,loan_title_cc refinance,loan_title_payoff,loan_title_get debt free,loan_title_myloan,loan_title_loan,loan_title_bill payoff,loan_title_cc-refinance,loan_title_debt reduction,loan_title_medical loan,loan_title_wedding loan,loan_title_pay off bills,loan_title_refinance loan,loan_title_debt payoff,loan_title_car loan,loan_title_pay off,loan_title_pool,loan_title_cc loan,loan_title_debt free,loan_title_conso,loan_title_home improvement loan,loan_title_lending loan,loan_title_relief,loan_title_cc,loan_title_loan1,loan_title_getting ahead,loan_title_home loan,loan_title_bills
0,2,3.4,2,0,0,1,-0.818483,2.020064,-0.337854,1.0,-0.191268,2.127642,-0.829943,0.125,0.0,0.314286,0.0,2.111596,0.977986,0.044118,0.38735,-0.198674,-0.160195,-0.094966,0.0,0.304348,-0.15512,1.091309,-0.789041,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3,4.3,3,1,1,1,-1.582243,-0.469958,-0.358098,1.0,0.105229,-0.904035,-0.933158,0.0,0.0,0.285714,0.0,-0.878926,1.127265,0.132353,-0.583384,-0.211235,-0.160534,-0.043073,0.0,0.677019,-0.125565,0.165689,-0.106997,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,6,4.4,2,1,0,1,1.365603,-0.792498,1.028905,1.0,0.188142,0.243515,0.57247,0.0,0.0,0.342857,0.0,-0.747355,-2.25457,0.235294,-0.542629,3.362623,-0.155103,-0.030106,0.0,0.409938,-0.15109,-0.501847,0.144957,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,3.3,2,1,0,1,-0.678661,-1.081668,0.479766,1.0,1.313651,0.616163,-0.621824,0.125,0.0,0.142857,0.0,0.780958,0.646804,0.117647,-0.801484,-0.209719,-0.166892,-0.107503,0.0,0.242236,-0.14303,-1.081655,1.773285,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,3,4.4,2,1,0,1,0.004911,-0.312185,-0.159444,1.0,0.850331,-0.806299,-0.720484,0.125,0.6,0.314286,0.25,-0.785512,1.435774,0.264706,-0.873015,3.463301,3.45951,-0.216682,0.0,0.111801,0.380899,-0.241275,-0.026008,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Modeling

In [14]:
# train test split
X_trn, X_cv, y_trn, y_cv = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

In [19]:
# Logistic regression
clf = LogisticRegression(random_state = 1, max_iter=500)
clf.fit(X_trn, y_trn)
preds_cv = clf.predict_proba(X_cv)[:,1]
print(log_loss(y_cv, preds_cv))

predictions = clf.predict_proba(test)[:,1]
save_submission(predictions, filename="logistic.csv")

0.308886004217454


In [25]:
# Decision tree
clf = DecisionTreeClassifier(random_state = 1)
clf.fit(X_trn, y_trn)
preds_cv = clf.predict_proba(X_cv)[:,1]
print(log_loss(y_cv, preds_cv))

predictions = clf.predict_proba(test)[:,1]
save_submission(predictions, filename="dtree.csv")

6.071814918450047


In [27]:
# Hyper Parameter Tuning Decision tree

hyperparam_combs = {
    'max_depth': [4, 6, 8, 10, 12, 14, 16],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 20, 30, 40],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1],
    'max_leaf_nodes': [8, 16, 32, 64, 128,256],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 4}, {0: 1, 1: 5}]
}

clf = RandomizedSearchCV(DecisionTreeClassifier(random_state=1),
                         hyperparam_combs,
                         scoring='neg_log_loss',
                         random_state=1,
                         n_iter=20)

search = clf.fit(X, y)

best_clf = search.best_estimator_

preds_cv = best_clf.predict_proba(X_cv)[:, 1]
print(log_loss(y_cv, preds_cv))

preds_test = best_clf.predict_proba(test)[:, 1]
save_submission(preds_test, filename='dtree_tuned.csv')

0.3081660991275356


In [32]:
# K fold dtree
params = search.best_params_
model = DecisionTreeClassifier(**params)

oof_preds, test_preds = run_kfold(model, X, y, test, k=5)
save_submission(test_preds, filename='dtree_tuned_5fold.csv')


------------- Fold 1 -------------

Log loss for validation set is 0.30830096754328684

------------- Fold 2 -------------

Log loss for validation set is 0.308309545469806

------------- Fold 3 -------------

Log loss for validation set is 0.3089780129266427

------------- Fold 4 -------------

Log loss for validation set is 0.3131217368017705

------------- Fold 5 -------------

Log loss for validation set is 0.31085339791275085


Log loss for oof_preds is 0.30991267062053535


In [34]:
# LightGBM classifier

clf = LGBMClassifier(random_state=1)
clf.fit(X_trn, y_trn)

preds_cv = clf.predict_proba(X_cv)[:,1]
print(log_loss(y_cv, preds_cv))

predictions = clf.predict_proba(test)[:,1]
save_submission(test_preds, filename='lgbm.csv')

0.31070368180002655


In [35]:
# K fold Lgbm classifier

clf = LGBMClassifier(random_state=1)
oof_preds, test_preds = run_kfold(clf, X, y, test, k=5)
save_submission(test_preds, filename='lgbm_5fold.csv')


------------- Fold 1 -------------

Log loss for validation set is 0.3124372107852419

------------- Fold 2 -------------

Log loss for validation set is 0.31176528397243125

------------- Fold 3 -------------

Log loss for validation set is 0.31100044934380144

------------- Fold 4 -------------

Log loss for validation set is 0.3102109433523473

------------- Fold 5 -------------

Log loss for validation set is 0.31178826947744553


Log loss for oof_preds is 0.31144044445489766


In [37]:
# Random Forest Classifier

clf = RandomForestClassifier(random_state = 1)
clf.fit(X_trn, y_trn)
preds_cv = clf.predict_proba(X_cv)[:,1]
print(log_loss(y_cv, preds_cv))

predictions = clf.predict_proba(test)[:,1]
save_submission(predictions, filename='rf2.csv')

0.31631523948710755


In [39]:
# Stacking Classifier

models = [('log', LogisticRegression(random_state=1)),
          ('dtree', DecisionTreeClassifier(random_state=1)),
          ('rf', RandomForestClassifier(random_state=1)),
          ('lgbm', LGBMClassifier(random_state=1))]

clf = StackingClassifier(estimators=models, 
                         final_estimator=LogisticRegression(random_state=101), 
                         cv=5, 
                         stack_method='predict_proba')

clf.fit(X_trn, y_trn)
preds_cv = clf.predict_proba(X_cv)[:,1]
print(log_loss(y_cv, preds_cv))

predictions = clf.predict_proba(test)[:,1]
save_submission(predictions, filename="stack_base.csv")

0.30802571774503285


In [42]:
# 5 fold Stacking Classifier

model = StackingClassifier(estimators=models,
                           final_estimator=LogisticRegression(random_state=1),
                           cv=3, 
                           stack_method='predict_proba')

oof_preds, test_preds = run_kfold(model, X, y, test, k=5)
save_submission(test_preds, filename='stack_5fold.csv')


------------- Fold 1 -------------

Log loss for validation set is 0.30835684757676224

------------- Fold 2 -------------

Log loss for validation set is 0.30812142156935146

------------- Fold 3 -------------

Log loss for validation set is 0.30813100333640425

------------- Fold 4 -------------

Log loss for validation set is 0.30793821781675573

------------- Fold 5 -------------

Log loss for validation set is 0.3079841177059743


Log loss for oof_preds is 0.30810632590426296
