# Loan default prediction

The dataset comes from [this kaggle competition](https://www.kaggle.com/c/credit-default-prediction-ai-big-data/overview)

Our approach:
* Standard imports
* Read in data and look at summary statistics
* EDA (distributions, relationships)
* Model selection, model building (may try a few)
* Answer the 'so what'

Let's define an approach:
* Read the pickled dataframe
* We'll start by just chucking the data through a naive bayes classifier 
* Preprocessing
    * Scaling
    * Shuffling
    * Splitting
* Modelling approaches 
    * GridCV search
    * Imbalanced data (SMOTE)
    * Optimizing for different metrics - probably F1
* Models
    * Naive Bayes (just as benchmark)
    * Logistic regression
    * Random forest

In [1]:
import pandas as pd
import numpy as np

# general
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

# scoring
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             f1_score,
                             roc_curve,
                             roc_auc_score)

# we'll also initialise the random state we're using here - no fancy config files
random_state = 99
test_size = 0.30

In [2]:
df = pd.read_pickle("./data_cleaned.pkl")

In [3]:
### PREPROCESSING ###

# Split features and target
X = df.drop("Credit Default", axis=1)
y = df["Credit Default"]

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=test_size,
                                                    random_state=random_state)

In [4]:
# functions

def print_scores(model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    
    print(f"""accuracy: {accuracy}
    precision: {precision}
    f1 score: {f1}
    """)
    
    pass

## Naive Bayes benchmarking - vanilla as

In [5]:
nb_model = make_pipeline(StandardScaler(),
                         GaussianNB())
nb_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gaussiannb', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)

In [6]:
# print_scores(nb_model, X_test, y_test)

Pretty terrible at a first swing!

### Logistic regression

In [7]:
lg_model = make_pipeline(StandardScaler(),
                                   LogisticRegression(random_state=random_state,
                                                      verbose=2))
lg_model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=99,
                                    solver='lbfgs', tol=0.0001, verbose=2,
                                    warm_start=False))],
         verbose=False)

In [8]:
# print_scores(lg_model, X_test, y_test)

### Logistic regression CV

In [9]:
lg_cv_model = make_pipeline(StandardScaler(),
                                   LogisticRegressionCV(random_state=random_state,
                                                        verbose=3))
lg_cv_model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s finished


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='auto',
                                      n_jobs=None, penalty='l2',
                                      random_state=99, refit=True, scoring=None,
                                      solver='lbfgs', tol=0.0001, verbose=3))],
         verbose=False)

In [10]:
# print_scores(lg_cv_model, X_test, y_test)

### Random forest

In [11]:
rf_model = make_pipeline(StandardScaler(),
                         RandomForestClassifier(random_state=random_state,
                                                verbose=3)) # for now not specifying any parameters
rf_model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=99,
                                        verbose=3, warm_start=False))],
         verbose=Fal

In [12]:
# print_scores(rf_model, X_test, y_test)