In this data challenge, I will use Stacking Ensemble (Logistric Regression, Decision Tree Classifier, XGBoost) to predict whether a client will default depending their characteristics in the data. This will help a bank on deciding to give the loan or not.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import seaborn as sns

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/f2021-stat441-d1/441D1sam.csv
/kaggle/input/f2021-stat441-d1/441D1train.csv
/kaggle/input/f2021-stat441-d1/441D1test.csv


# Import and Overview of the Training Dataset

Load data

In [2]:
bank_df = pd.read_csv("/kaggle/input/f2021-stat441-d1/441D1train.csv")

In [3]:
bank_df.shape

(80000, 10)

In [4]:
bank_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,80000.0,39999.5,23094.16,0.0,19999.75,39999.5,59999.25,79999.0
Income,80000.0,5082973.0,2698214.0,50.927627,3079955.0,4981044.0,6920796.0,17102240.0
Age,80000.0,49.96355,7.091645,24.0,45.0,50.0,55.0,83.0
Experience,80000.0,10.09044,3.178079,0.0,8.0,10.0,12.0,28.0
Married,80000.0,0.1013125,0.301744,0.0,0.0,0.0,0.0,1.0
House_Ownership,80000.0,0.05185,0.2217255,0.0,0.0,0.0,0.0,1.0
Car_Ownership,80000.0,0.2989875,0.4578172,0.0,0.0,0.0,1.0,1.0
CURRENT_JOB_YRS,80000.0,6.340075,2.515254,0.0,5.0,6.0,8.0,19.0
CURRENT_HOUSE_YRS,80000.0,11.99527,3.455854,1.0,10.0,12.0,14.0,29.0
Default,80000.0,0.499225,0.5000025,0.0,0.0,0.0,1.0,1.0


* Id - Client id
* Income - Annual Income
* Age - Client age
* Experience - Years of experience
* Married - 1 for married, 0 for not
* House_Ownership - 1 for own a house, 0 for not
* Car_Ownership - 1 for own a car, 0 for not
* CURRENT_JOB_YRS - Number of years at current job
* CURRENT_HOUSE_YEARS - Number of years at current residence (Owned or rented)
* Default- 1 for default on the loan, 0 for not

# Splitting data 

We now split the data into train and testing sets. 

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

features = list(bank_df.columns)

X = bank_df[features[1:-1]].values
y = bank_df[features[-1]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 16)

In [6]:
print(features)

['Id', 'Income', 'Age', 'Experience', 'Married', 'House_Ownership', 'Car_Ownership', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Default']


In [7]:
X_train.shape
y_train.shape

(64000,)

# Train Base Learners

In [8]:
# Base Learners
cls_1 = LogisticRegression()
cls_2 = DecisionTreeClassifier(random_state=16)

# fit cls_1 on X_train
cls_1.fit(X_train, y_train)
# fit cls_2 on X_train
cls_2.fit(X_train, y_train);

# Test Base Learners

In [9]:
X_test_pred_1 = cls_1.predict(X_test)
print("Classification report for base learner 1\n")
print(classification_report(y_test, X_test_pred_1,
                            digits = 4,
                            target_names=["No Default",
                                          "Default"]))

X_test_pred_2 = cls_2.predict(X_test)
print("\n\nClassification report for base learner 2\n")
print(classification_report(y_test, X_test_pred_2,
                            digits = 4,
                            target_names=["No Default",
                                          "Default"]))

Classification report for base learner 1

              precision    recall  f1-score   support

  No Default     0.4949    1.0000    0.6622      7919
     Default     0.0000    0.0000    0.0000      8081

    accuracy                         0.4949     16000
   macro avg     0.2475    0.5000    0.3311     16000
weighted avg     0.2450    0.4949    0.3277     16000



Classification report for base learner 2

              precision    recall  f1-score   support

  No Default     0.9707    0.9689    0.9698      7919
     Default     0.9696    0.9713    0.9705      8081

    accuracy                         0.9701     16000
   macro avg     0.9701    0.9701    0.9701     16000
weighted avg     0.9701    0.9701    0.9701     16000



  _warn_prf(average, modifier, msg_start, len(result))


# Implementing XGBClassifier

In [10]:
xgbc = xgb.XGBClassifier()

In [11]:
xgbc.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [12]:
score = xgbc.score(X_train, y_train)
print("Training score: ", score)

Training score:  0.99946875


In [13]:
from sklearn.model_selection import cross_val_score, KFold

cv_score = cross_val_score(xgbc, X_train, y_train, cv=5)
print("CV mean score: ", cv_score.mean())





CV mean score:  0.9897031249999999


In [14]:
y_pred = xgbc.predict(X_test)
y_predictions = [float(value) for value in y_pred]

In [15]:
cr = classification_report(y_test,y_predictions, digits = 4)
print(cr)

              precision    recall  f1-score   support

         0.0     0.9925    0.9875    0.9900      7919
         1.0     0.9878    0.9927    0.9902      8081

    accuracy                         0.9901     16000
   macro avg     0.9902    0.9901    0.9901     16000
weighted avg     0.9901    0.9901    0.9901     16000



# Make Predictions for Test Data

In [16]:
test_data = pd.read_csv('/kaggle/input/f2021-stat441-d1/441D1test.csv')

In [17]:
features = list(test_data.columns)
X = test_data[features[1:]].values

In [18]:
X

array([[4.42171440e+06, 4.20000000e+01, 1.00000000e+01, ...,
        0.00000000e+00, 6.00000000e+00, 1.20000000e+01],
       [6.30514344e+05, 5.70000000e+01, 6.00000000e+00, ...,
        0.00000000e+00, 9.00000000e+00, 8.00000000e+00],
       [1.79154564e+06, 4.40000000e+01, 1.10000000e+01, ...,
        1.00000000e+00, 5.00000000e+00, 1.00000000e+01],
       ...,
       [8.22111810e+06, 5.20000000e+01, 1.40000000e+01, ...,
        1.00000000e+00, 7.00000000e+00, 1.10000000e+01],
       [3.31848415e+06, 5.10000000e+01, 1.20000000e+01, ...,
        0.00000000e+00, 8.00000000e+00, 1.70000000e+01],
       [5.41745394e+06, 5.90000000e+01, 1.00000000e+01, ...,
        0.00000000e+00, 6.00000000e+00, 8.00000000e+00]])

In [19]:
y_pred = xgbc.predict(X)
y_predictions = [float(value) for value in y_pred]

In [20]:
test_data = pd.DataFrame({'Id' : np.arange(0,X.shape[0]), 'Default' : y_predictions})

In [21]:
print(test_data)

          Id  Default
0          0      1.0
1          1      1.0
2          2      1.0
3          3      0.0
4          4      1.0
...      ...      ...
19995  19995      0.0
19996  19996      0.0
19997  19997      0.0
19998  19998      0.0
19999  19999      0.0

[20000 rows x 2 columns]


In [22]:
test_data.to_csv('submission1.csv', index = False)