In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [86]:
train = pd.read_csv('train_final.csv')
test = pd.read_csv('test_final.csv')

#### Unique ID

Making a copy of the columns for result submission and then dropping it.

In [87]:
uniqueID = test['UniqueID']
train.drop(['UniqueID'], axis = 1, inplace = True)
test.drop(['UniqueID'], axis = 1, inplace = True)

# Modelling

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

In [5]:
X = train.drop(['loan_default'], axis = 1)
y = train['loan_default']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

## Round 1 - Including the 14 Unexplored Features

### Random Forest

In [34]:
clf = RandomForestClassifier(n_estimators = 20, verbose = 1)

In [35]:
clf = clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   16.5s finished


In [36]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.4s finished


In [37]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.7702815723445776
Precision: 0.3409350057012543
Recall: 0.058893047075044315
F1-Score: 0.10043668122270741


In [38]:
confusion_matrix(y_test, y_pred)

array([[35321,  1156],
       [ 9556,   598]], dtype=int64)

So the model is doing..decently. It has trouble predicting the actual truth values though. Most of it's predictions are wrong.

In [39]:
prediction = clf.predict(test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.9s finished


In [40]:
output = pd.DataFrame({'UniqueID':uniqueID,'loan_default':prediction})

In [41]:
output.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0
1,723482,0
2,758529,0
3,763449,0
4,708663,0


In [42]:
output.to_csv('submissions/submission1.csv', index=False)

### AdaBoost

In [15]:
from sklearn.ensemble import AdaBoostClassifier

In [16]:
clf = AdaBoostClassifier()

In [17]:
clf = clf.fit(X_train, y_train)

In [18]:
y_pred = clf.predict(X_test)

In [19]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.7818189616349639
Precision: 0.4358974358974359
Recall: 0.006696868229269254
F1-Score: 0.013191076624636276


In [20]:
confusion_matrix(y_test, y_pred)

array([[36389,    88],
       [10086,    68]], dtype=int64)

Gonna stop here as this already seems pretty bad. It's more or less just predicting almost everything as a default 0 with a couple of actual 1s. And even that has only a ~50% success rate.

### Naive-Bayes

Doubt this will actually do well since not all these variables are independent, but just an attempt.

In [21]:
from sklearn.naive_bayes import GaussianNB

In [22]:
clf = GaussianNB()

In [23]:
clf = clf.fit(X_train, y_train)

In [24]:
y_pred = clf.predict(X_test)

In [25]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.7710750359203106
Precision: 0.09233176838810642
Recall: 0.005810518022454205
F1-Score: 0.010933012137496525


In [26]:
confusion_matrix(y_test, y_pred)

array([[35897,   580],
       [10095,    59]], dtype=int64)

Pretty much as expected. Moving on.

### Multi Layer Perceptron

In [43]:
from sklearn.neural_network import MLPClassifier

In [71]:
clf = MLPClassifier()

In [72]:
clf = clf.fit(X_train, y_train)

In [73]:
y_pred = clf.predict(X_test)

In [74]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.57386716990843
Precision: 0.22961767488452334
Recall: 0.40634232814654325
F1-Score: 0.29342531024428403


In [75]:
confusion_matrix(y_test, y_pred)

array([[22634, 13843],
       [ 6028,  4126]], dtype=int64)

I mean even with the huge number of bad predictions, this seems to have learned better than the other approaches. As suggested by the F-Score.

In [76]:
prediction = clf.predict(test)

In [77]:
output = pd.DataFrame({'UniqueID':uniqueID,'loan_default':prediction})

In [78]:
output.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0
1,723482,1
2,758529,0
3,763449,1
4,708663,0


In [79]:
output.to_csv('submissions/submission2.csv', index=False)

## Round 2 - No Unexplored Features

In [88]:
train.drop(['PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 
            'PRI.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 
            'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'SEC.INSTAL.AMT'], 
           inplace = True, axis = 1)
test.drop(['PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 
            'PRI.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 
            'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'SEC.INSTAL.AMT'],
           inplace = True, axis = 1)

In [89]:
X = train.drop(['loan_default'], axis = 1)
y = train['loan_default']

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

## Round 1 - Including the 14 Unexplored Features

### Random Forest

In [91]:
clf = RandomForestClassifier(n_estimators = 20, verbose = 1)

In [92]:
clf = clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   10.2s finished


In [93]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.3s finished


In [94]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.7694023289228196
Precision: 0.34077618288144607
Recall: 0.06312783139649399
F1-Score: 0.10652264229331117


In [95]:
confusion_matrix(y_test, y_pred)

array([[35237,  1240],
       [ 9513,   641]], dtype=int64)

Not really much of a difference to approach 1.

### AdaBoost

In [97]:
from sklearn.ensemble import AdaBoostClassifier

In [98]:
clf = AdaBoostClassifier()

In [99]:
clf = clf.fit(X_train, y_train)

In [100]:
y_pred = clf.predict(X_test)

In [101]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.7818404065964701
Precision: 0.24324324324324326
Recall: 0.0008863502068150482
F1-Score: 0.001766264350897851


In [102]:
confusion_matrix(y_test, y_pred)

array([[36449,    28],
       [10145,     9]], dtype=int64)

This is just terrible.

### Naive-Bayes

Doubt this will actually do well since not all these variables are independent, but just an attempt.

In [103]:
from sklearn.naive_bayes import GaussianNB

In [104]:
clf = GaussianNB()

In [105]:
clf = clf.fit(X_train, y_train)

In [106]:
y_pred = clf.predict(X_test)

In [107]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.7393579378525016
Precision: 0.3061264055835595
Recall: 0.15550521961788458
F1-Score: 0.20624346917450367


In [108]:
confusion_matrix(y_test, y_pred)

array([[32898,  3579],
       [ 8575,  1579]], dtype=int64)

Not too surprised, those variables might have been correlated and all that.

In [109]:
prediction = clf.predict(test)

In [110]:
output = pd.DataFrame({'UniqueID':uniqueID,'loan_default':prediction})

In [111]:
output.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0
1,723482,0
2,758529,0
3,763449,0
4,708663,0


In [112]:
output.to_csv('submissions/submission3.csv', index=False)

### Multi Layer Perceptron

In [113]:
from sklearn.neural_network import MLPClassifier

In [114]:
clf = MLPClassifier()

In [115]:
clf = clf.fit(X_train, y_train)

In [116]:
y_pred = clf.predict(X_test)

In [117]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Accuracy: 0.7822478608650898
Precision: 0.0
Recall: 0.0
F1-Score: 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [118]:
confusion_matrix(y_test, y_pred)

array([[36477,     0],
       [10154,     0]], dtype=int64)

Well that backfired lol