In [1]:
# Import required modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Import original train set and Principal Components (PCs) obtained from PCA done in other notebook
df = pd.read_csv('train.csv')
pca_train = pd.read_csv('pca_train.csv')
pca_train.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6
0,0.532196,-0.372043,0.305666,-0.54659,0.135447,0.00973
1,-0.389258,-0.045656,-0.5736,-0.017898,-0.057655,-0.232852
2,-0.293952,-0.467042,0.134255,-0.177374,-0.261998,0.795718
3,-0.378499,0.394433,0.677732,-0.340884,-0.22023,-0.150057
4,0.532253,-0.371999,0.305718,-0.546631,0.135432,0.009699


In [3]:
# Convert the Categorical Y/N target variable 'Loan_Status' for binary 1/0 classification
df['Loan_Status'] = df['Loan_Status'].map(lambda x: 1 if x == 'Y' else 0)

In [4]:
# Set X and y for ML model training do train-test split using sklearn module

X = pca_train.values
y = df['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [5]:
y_test.shape

(93,)

In [6]:
X_train.shape

(521, 6)

In [7]:
# Initiate a new Adaptive Classifier, an ensemble boosting algorithm
ada = AdaBoostClassifier()

# Create a dictionary of all values we want to test for selected model parameters of the respective algorithm
params_ada = {'n_estimators': np.arange(1, 10)}

# Use GridSearchCV to test all values for selected model parameters
ada_gs = GridSearchCV(ada, params_ada, cv=10, verbose=1, n_jobs=-1, pre_dispatch='128*n_jobs')

# Fit model to training data
ada_gs.fit(X_train, y_train)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  90 | elapsed:    3.6s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    4.4s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [8]:
# Save the best model
ada_best = ada_gs.best_estimator_

# Check the value of the best selected model parameter(s)
print(ada_gs.best_params_)

{'n_estimators': 1}


In [9]:
# Print the accuracy score on the test data using best model
print('ada: {}'.format(ada_best.score(X_test, y_test)))

ada: 0.8064516129032258


In [10]:
# Initiate a new Gradient Boosting Classifier, an ensemble boosting algorithm
gbc = GradientBoostingClassifier(learning_rate=0.005,warm_start=True)

# Create a dictionary of all values we want to test for selected model parameters of the respective algorithm
params_gbc = {'n_estimators': np.arange(1, 200)}

# Use GridSearchCV to test all values for selected model parameters
gbc_gs = GridSearchCV(gbc, params_gbc, cv=10, verbose=1, n_jobs=-1, pre_dispatch='128*n_jobs')

# Fit model to training data
gbc_gs.fit(X_train, y_train)

Fitting 10 folds for each of 199 candidates, totalling 1990 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 738 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 1288 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 1990 out of 1990 | elapsed:  2.2min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.005, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sam...       subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=True),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': array([  1,   2, ..., 198, 199])},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [11]:
# Save the best model
gbc_best = gbc_gs.best_estimator_

# Check the value of the best selected model parameter(s)
print(gbc_gs.best_params_)

{'n_estimators': 176}


In [12]:
# Print the accuracy score on the test data using best model
print('gbc: {}'.format(gbc_best.score(X_test, y_test)))

gbc: 0.8064516129032258


In [13]:
# Initiate a new Bagging Classifier, an ensemble bagging algorithm 
bcdt = BaggingClassifier(DecisionTreeClassifier(random_state=1))

# Create a dictionary of all values we want to test for selected model parameters of the respective algorithm
params_bcdt = {'n_estimators': np.arange(1, 100)}

# Use GridSearchCV to test all values for selected model parameters
bcdt_gs = GridSearchCV(bcdt, params_bcdt, cv=10, verbose=1, n_jobs=-1, pre_dispatch='128*n_jobs')

# Fit model to training data
bcdt_gs.fit(X_train, y_train)

Fitting 10 folds for each of 99 candidates, totalling 990 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 990 out of 990 | elapsed:  2.2min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...stimators=10, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': array([ 1,  2, ..., 98, 99])},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [14]:
# Save the best model
bcdt_best = bcdt_gs.best_estimator_

# Check the value of the best selected model parameter(s)
print(bcdt_gs.best_params_)

{'n_estimators': 15}


In [15]:
# Print the accuracy score on the test data using best model
print('bcdt: {}'.format(bcdt_best.score(X_test, y_test)))

bcdt: 0.7741935483870968


In [16]:
# Initiate a new Decision Tree Classifier and follow the similar process as mentioned in comments above
dt = DecisionTreeClassifier(random_state=1)
params_dt = {}
dt_gs = GridSearchCV(dt, params_dt, cv=10, verbose=1, n_jobs=-1, pre_dispatch='128*n_jobs')
dt_gs.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1, param_grid={},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [17]:
# Save the best model and check best model parameters
dt_best = dt_gs.best_estimator_
print(dt_gs.best_params_)

{}


In [18]:
# Print the accuracy score on the test data using best model
print('dt: {}'.format(dt_best.score(X_test, y_test)))

dt: 0.7311827956989247


In [19]:
# Initiate a new Support Vector Classifier and follow the similar process as mentioned in comments above
svc = LinearSVC(random_state=1)
params_svc = {}
svc_gs = GridSearchCV(svc, params_svc, cv=10,verbose=1,n_jobs=-1,pre_dispatch='128*n_jobs')
svc_gs.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
     verbose=0),
       fit_params=None, iid='warn', n_jobs=-1, param_grid={},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [20]:
# Save the best model and check best model parameters
svc_best = svc_gs.best_estimator_
print(svc_gs.best_params_)

{}


In [21]:
# Print the accuracy score on the test data using best model
print('svc: {}'.format(svc_best.score(X_test, y_test)))

svc: 0.7956989247311828


In [22]:
# Initiate a new XG Boost Classifier, an ensemble boosting algorithm and follow the similar process as mentioned in comments above
xg = xgb.XGBClassifier(random_state=1,learning_rate=0.005)
params_xg = {'max_depth': np.arange(2,5), 'n_estimators': np.arange(1, 100)}
xg_gs = GridSearchCV(xg, params_xg, cv=10, verbose=1, n_jobs=-1, pre_dispatch='128*n_jobs')
xg_gs.fit(X_train, y_train)

Fitting 10 folds for each of 297 candidates, totalling 2970 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 738 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 1288 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 1938 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done 2970 out of 2970 | elapsed:  1.3min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       learning_rate=0.005, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=1,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': array([2, 3, 4]), 'n_estimators': array([ 1,  2, ..., 98, 99])},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [23]:
# Save the best model and check best model parameters
xg_best = xg_gs.best_estimator_
print(xg_gs.best_params_)

# Print the accuracy score on the test data using best model
print('xg: {}'.format(xg_best.score(X_test, y_test)))

{'max_depth': 2, 'n_estimators': 2}
xg: 0.8064516129032258


In [24]:
# Initiate a new Light Gradient Boosted Machine, an ensemble boosting algorithm

# Set the train data and initiate ML training
train_data = lgb.Dataset(X_train,label=y_train)
params = {'learning_rate':0.01}
lgbm = lgb.train(params, train_data, 100) 

y_pred = lgbm.predict(X_test)

for i in range(0,y_test.shape[0]):
    if y_pred[i]>=0.5:
        y_pred[i]=1
    else:
        y_pred[i]=0

In [25]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,y_pred))

0.8064516129032258


In [26]:
# Initiate a new Cat Boost Classifier, an ensemble boosting algorithm and fit on train data
cbc = CatBoostClassifier(random_state=1, iterations=100)
cbc.fit(X_train, y_train)

Learning rate set to 0.06442
0:	learn: 0.6730757	total: 57.5ms	remaining: 5.69s
1:	learn: 0.6557088	total: 61.4ms	remaining: 3.01s
2:	learn: 0.6397677	total: 65.3ms	remaining: 2.11s
3:	learn: 0.6248063	total: 69.1ms	remaining: 1.66s
4:	learn: 0.6109242	total: 73ms	remaining: 1.39s
5:	learn: 0.5993601	total: 76.9ms	remaining: 1.21s
6:	learn: 0.5879616	total: 80.8ms	remaining: 1.07s
7:	learn: 0.5767795	total: 84.6ms	remaining: 973ms
8:	learn: 0.5673600	total: 88.4ms	remaining: 894ms
9:	learn: 0.5582118	total: 92.3ms	remaining: 831ms
10:	learn: 0.5509164	total: 96ms	remaining: 777ms
11:	learn: 0.5433785	total: 99.6ms	remaining: 731ms
12:	learn: 0.5365546	total: 103ms	remaining: 692ms
13:	learn: 0.5301993	total: 107ms	remaining: 658ms
14:	learn: 0.5234929	total: 111ms	remaining: 629ms
15:	learn: 0.5179924	total: 115ms	remaining: 602ms
16:	learn: 0.5125370	total: 118ms	remaining: 578ms
17:	learn: 0.5078143	total: 122ms	remaining: 557ms
18:	learn: 0.5035376	total: 126ms	remaining: 537ms
19:	

<catboost.core.CatBoostClassifier at 0x297002966c8>

In [27]:
# Print the overall accuracy
print('cbc: {}'.format(cbc.score(X_test, y_test)))

cbc: 0.8064516129032258


In [28]:
# Initiate a new KNeighbors Classifier and follow the similar process as mentioned in previous comments
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv=10, verbose=1, n_jobs=-1, pre_dispatch='128*n_jobs')
knn_gs.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 211 out of 240 | elapsed:    1.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    1.6s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [29]:
# Save the best model and check best model parameters
knn_best = knn_gs.best_estimator_
print(knn_gs.best_params_)

# Print the overall accuracy
print('knn: {}'.format(knn_best.score(X_test, y_test)))

{'n_neighbors': 15}
knn: 0.7956989247311828


In [30]:
# Initiate a new Random Forest Classifier, an ensemble bagging algorithm and follow the similar process as mentioned in previous comments
rf = RandomForestClassifier()
params_rf = {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500]}
rf_gs = GridSearchCV(rf, params_rf, cv=10, verbose=1, n_jobs=-1, pre_dispatch='128*n_jobs')
rf_gs.fit(X_train, y_train)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  90 | elapsed:   10.0s remaining:   16.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   45.7s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500]},
       pre_dispatch='128*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [31]:
# Save the best model and check best model parameters
rf_best = rf_gs.best_estimator_
print(rf_gs.best_params_)

# Print the overall accuracy
print('rf: {}'.format(rf_best.score(X_test, y_test)))

{'n_estimators': 100}
rf: 0.7526881720430108


In [32]:
# Create a new Logistic Regression model and fit on train data
log_reg = LogisticRegression(solver='lbfgs')
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
# Print the overall accuracy
print('log_reg: {}'.format(log_reg.score(X_test, y_test)))

log_reg: 0.7956989247311828


In [34]:
# Print the overall accuracy score for all the 11 best classification models trained earlier
print('Overall Accuracy of best selected models on X_test dataset\n')
print('knn: {}'.format(knn_best.score(X_test, y_test)))
print('rf: {}'.format(rf_best.score(X_test, y_test)))
print('log_reg: {}'.format(log_reg.score(X_test, y_test)))
print('ada: {}'.format(ada_best.score(X_test, y_test)))
print('gbc: {}'.format(gbc_best.score(X_test, y_test)))
print('bcdt: {}'.format(bcdt_best.score(X_test, y_test)))
print('dt: {}'.format(dt_best.score(X_test, y_test)))
print('svc: {}'.format(svc_best.score(X_test, y_test)))
print('xg: {}'.format(xg_best.score(X_test, y_test)))
print('lgbm: {}'.format(metrics.accuracy_score(y_test,y_pred)))
print('cbc: {}'.format(cbc.score(X_test, y_test)))

Overall Accuracy of best selected models on X_test dataset

knn: 0.7956989247311828
rf: 0.7526881720430108
log_reg: 0.7956989247311828
ada: 0.8064516129032258
gbc: 0.8064516129032258
bcdt: 0.7741935483870968
dt: 0.7311827956989247
svc: 0.7956989247311828
xg: 0.8064516129032258
lgbm: 0.8064516129032258
cbc: 0.8064516129032258


In [35]:
# Create a dictionary of our models
estimators=[('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg), ('ada', ada_best), ('gbc', gbc_best), ('bcdt', bcdt_best), ('dt', dt_best), ('xg', xg_best), ('cbc', cbc)]

# Create a voting classifier, input the dictionary of our models as estimators for the ensemble
ensemble = VotingClassifier(estimators, voting='soft', n_jobs=-1, flatten_transform=True, weights=[1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9,1/9])

In [36]:
# Fit the Final Ensemble Model on train data
ensemble.fit(X_train, y_train)

# Test our final model on the test data and print our final accuracy score for the Ensemble made using Bagging and Boosting techniques
ensemble.score(X_test, y_test)

0.7956989247311828

In [37]:
# Import the PCs of test data for final predictions
dft = pd.read_csv('pca_test.csv')
dft.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6
0,-0.279674,-0.482089,0.281351,-0.216091,-0.031365,-0.142948
1,-0.387792,-0.465299,0.315794,-0.105767,-0.045882,-0.094015
2,-0.499361,-0.453305,0.346596,0.00804,-0.060596,-0.039688
3,-0.492062,-0.414947,0.365367,0.070814,0.079475,-0.018593
4,0.437441,0.509004,0.70613,-0.674169,-0.05447,-0.001153


In [38]:
# Assign the PCs dft to test_X
test_X = dft.values
print(len(test_X))

367


In [39]:
# Make final predictions on the test data
test_predictions = ensemble.predict(test_X)
test_predictions

array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,

In [40]:
# Import original test file for Loan_IDs and assign the test_predictions to a new column 'Loan_Status'
dft2 = pd.read_csv('test.csv')
dft2['Loan_Status'] = test_predictions

In [41]:
# Drop unnecessary columns
dft2 = dft2.drop(['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area'],axis=1)
dft2.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,0


In [42]:
# Convert binary 1/0 targets back to Categorical Y/N alphabets
dft2['Loan_Status'] = dft2['Loan_Status'].map(lambda x: 'Y' if x == 1 else 'N')
dft2.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,N


In [43]:
# Save the predictions from the final Ensemble on local disk
dft2.to_csv('Ensemble.csv', index=False)