In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import math
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import uniform, randint

# Path of the raw loan data; relative, so it is resolved against the
# notebook's working directory.
LENDING_CSV = "lending.csv"
df = pd.read_csv(LENDING_CSV)


In [None]:
# Work on a copy so the raw frame in `df` is never mutated by the cleaning
# below; re-running this cell then always starts from the same input.
small_df = df.copy()

# `int_rate` is read as text containing a "%" sign (presumably values like
# "13.5%" -- confirm against the CSV); strip the sign and rescale to a
# fraction. The dtype guard keeps the cell idempotent: if the column is
# already numeric the `.str` accessor would raise, so conversion is skipped.
if not pd.api.types.is_numeric_dtype(small_df['int_rate']):
    small_df['int_rate'] = (
        small_df['int_rate'].str.replace("%", "", regex=False).astype(float) / 100
    )

In [None]:

# Numeric features that are only standardized and median-imputed
# (no polynomial expansion, no binning).
standardize_and_impute_only = [
    'loan_amnt',
    'annual_inc',
    'delinq_2yrs',
    'inq_last_6mths',
    'open_acc',
]

# Features that additionally get a degree-2 polynomial expansion.
polynomial_features = [
    'installment',
    'dti',
]

# Features that are median-imputed and then binned into ordinal buckets.
discretization = [
    'fico_range_low',
    'fico_range_high',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'pub_rec',
    'revol_bal',
    'total_acc',
]

# Transformer instances used by the pipelines below.
standard_scaler = StandardScaler()
missing_indicator = MissingIndicator(features="all")
simple_imputer = SimpleImputer(strategy='median')
polynomial_featurizer = PolynomialFeatures(2)
discretizer = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='uniform')

# Each pipeline below owns its *own* scaler/imputer instances. Sharing one
# estimator object between several pipelines means that fitting any single
# pipeline directly would overwrite the fitted state seen by the others.
standardize_and_impute_pipeline_steps = [
    ('standardization', standard_scaler),
    ('imputer', simple_imputer),
]
standardize_and_impute_pipeline = Pipeline(standardize_and_impute_pipeline_steps)

# Standardize -> impute -> polynomial expansion.
polynomial_pipeline_steps = [
    ('standardization', StandardScaler()),
    ('imputer', SimpleImputer(strategy='median')),
    ('polynomial', polynomial_featurizer),
]
polynomial_pipeline = Pipeline(polynomial_pipeline_steps)

# Impute first: the discretizer is fed the imputed values.
discretize_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('discretize', discretizer),
]
discretize_pipeline = Pipeline(discretize_steps)

# The interest rate gets the same standardize -> impute treatment, with
# its own step list and estimator instances.
interest_rate_steps = [
    ('standardization', StandardScaler()),
    ('imputer', SimpleImputer(strategy='median')),
]
interest_rate_pipeline = Pipeline(interest_rate_steps)

# Boolean "was this value missing?" flags.
missing_flag_steps = [('missing_flag', missing_indicator)]
missing_flag_pipeline = Pipeline(missing_flag_steps)

# Route each column group through its pipeline. Columns not listed in any
# group are not passed to a transformer here.
transform_pipeline = ColumnTransformer([
    ('standardize_and_impute_pipeline', standardize_and_impute_pipeline, standardize_and_impute_only),
    ('polynomial_pipeline', polynomial_pipeline, polynomial_features),
    ('discretize_pipeline', discretize_pipeline, discretization),
    ('interest_rate_pipeline', interest_rate_pipeline, ['int_rate']),
    ('missing_flag_pipeline', missing_flag_pipeline,
     standardize_and_impute_only + polynomial_features + discretization),
])

# Model inputs: everything except the target. `y` is kept as a one-column
# DataFrame; callers flatten it with `y.values.ravel()`.
X = small_df.drop(['loan_status'], axis=1)
y = small_df[['loan_status']]

In [None]:

# Fixed seed: the forest's bootstrap samples and feature sub-sampling are
# random, so without it the grid-search scores (and possibly the selected
# parameters) change from run to run.
rf = ensemble.RandomForestClassifier(random_state=42)

# Preprocessing lives inside the pipeline so it is re-fitted on each CV
# training fold rather than on the full data.
pipeline_rf = Pipeline([
    ('transform', transform_pipeline),
    ('rf', rf),
])

# Hyper-parameter grid; keys use the `<step name>__<parameter>` convention.
parameters_rf = {
    'rf__max_depth': [1, 3, 8],
    'rf__min_samples_leaf': [5, 50, 1000],
}

# 5-fold cross-validated search scored by ROC AUC.
rf_pipeline = GridSearchCV(pipeline_rf, parameters_rf, cv=5, scoring='roc_auc', verbose=3, n_jobs=6)

# `y` is a one-column DataFrame; flatten it to the 1-D array fit expects.
rf_pipeline.fit(X, y.values.ravel())

print(rf_pipeline.best_params_)


In [None]:


# NOTE: X is the same data the search was fitted on, so this ROC curve is
# in-sample and will look better than the cross-validated score.
rf_probs = rf_pipeline.predict_proba(X)
rf_preds = rf_probs[:, 1]  # second predict_proba column (class index 1)
rf_fpr, rf_tpr, rf_threshold = metrics.roc_curve(y, rf_preds)
rf_roc_auc = metrics.auc(rf_fpr, rf_tpr)

# Draw on an explicit Axes instead of the implicit pyplot state.
rf_fig = plt.figure(figsize=(6, 2))
rf_ax = rf_fig.add_subplot(111)

rf_ax.set_title('Receiver Operating Characteristic')
rf_ax.plot(rf_fpr, rf_tpr, 'b', label='AUC = %0.2f' % rf_roc_auc)
rf_ax.plot([0, 1], [0, 1], 'r--')  # chance diagonal (unlabelled, so not in the legend)
rf_ax.legend(loc='lower right')
rf_ax.set_xlim([0, 1])
rf_ax.set_ylim([0, 1])
rf_ax.set_ylabel('True Positive Rate')
rf_ax.set_xlabel('False Positive Rate')



In [None]:
# Elastic-net logistic regression. The `saga` solver shuffles the data
# randomly, so a fixed seed is needed for repeatable fits and CV scores.
logistic_reg = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    C=0.5,
    max_iter=1000,
    random_state=42,
)

# Preprocessing lives inside the pipeline so it is re-fitted on each CV
# training fold rather than on the full data.
pipeline_log = Pipeline([
    ('transform', transform_pipeline),
    ('logistic_reg', logistic_reg),
])

# Only the L1/L2 mixing ratio is searched; C stays at the value set above.
parameters_log = {'logistic_reg__l1_ratio': [.1, .3, .8]}

# 5-fold cross-validated search scored by ROC AUC.
logistic_pipeline = GridSearchCV(pipeline_log, parameters_log, cv=5, scoring='roc_auc', verbose=3, n_jobs=6)

# `y` is a one-column DataFrame; flatten it to the 1-D array fit expects.
logistic_pipeline.fit(X, y.values.ravel())

print(logistic_pipeline.best_params_)



In [None]:

# NOTE: X is the same data the search was fitted on, so this ROC curve is
# in-sample and will look better than the cross-validated score.
log_probs = logistic_pipeline.predict_proba(X)
log_preds = log_probs[:, 1]  # second predict_proba column (class index 1)
log_fpr, log_tpr, log_threshold = metrics.roc_curve(y, log_preds)
log_roc_auc = metrics.auc(log_fpr, log_tpr)

# Draw on an explicit Axes instead of the implicit pyplot state.
log_fig = plt.figure(figsize=(6, 2))
log_ax = log_fig.add_subplot(111)

log_ax.set_title('Receiver Operating Characteristic')
log_ax.plot(log_fpr, log_tpr, 'b', label='AUC = %0.2f' % log_roc_auc)
log_ax.plot([0, 1], [0, 1], 'r--')  # chance diagonal (unlabelled, so not in the legend)
log_ax.legend(loc='lower right')
log_ax.set_xlim([0, 1])
log_ax.set_ylim([0, 1])
log_ax.set_ylabel('True Positive Rate')
log_ax.set_xlabel('False Positive Rate')



In [None]:

# `probability=True` enables predict_proba (used when plotting the ROC
# curve). The probability calibration shuffles the data randomly, so a
# fixed seed is set to make the probabilities repeatable.
svc_model = SVC(gamma='auto', probability=True, random_state=42)

# Preprocessing lives inside the pipeline so it is re-fitted on each CV
# training fold rather than on the full data.
pipeline_svc = Pipeline([
    ('transform', transform_pipeline),
    ('svc_model', svc_model),
])

# Only the regularization strength C is searched.
parameters_svc = {"svc_model__C": [.1, .3]}

# 5-fold cross-validated search scored by ROC AUC.
svc_pipeline = GridSearchCV(pipeline_svc, parameters_svc, cv=5, scoring='roc_auc', verbose=3, n_jobs=6)

# `y` is a one-column DataFrame; flatten it to the 1-D array fit expects.
svc_pipeline.fit(X, y.values.ravel())

print(svc_pipeline.best_params_)



In [None]:

# NOTE: X is the same data the search was fitted on, so this ROC curve is
# in-sample and will look better than the cross-validated score.
svc_probs = svc_pipeline.predict_proba(X)
svc_preds = svc_probs[:, 1]  # second predict_proba column (class index 1)
svc_fpr, svc_tpr, svc_threshold = metrics.roc_curve(y, svc_preds)
svc_roc_auc = metrics.auc(svc_fpr, svc_tpr)

# Draw on an explicit Axes instead of the implicit pyplot state.
svc_fig = plt.figure(figsize=(6, 2))
svc_ax = svc_fig.add_subplot(111)

svc_ax.set_title('Receiver Operating Characteristic')
svc_ax.plot(svc_fpr, svc_tpr, 'b', label='AUC = %0.2f' % svc_roc_auc)
svc_ax.plot([0, 1], [0, 1], 'r--')  # chance diagonal (unlabelled, so not in the legend)
svc_ax.legend(loc='lower right')
svc_ax.set_xlim([0, 1])
svc_ax.set_ylim([0, 1])
svc_ax.set_ylabel('True Positive Rate')
svc_ax.set_xlabel('False Positive Rate')

In [None]:
# Best mean cross-validated ROC AUC found by each grid search
# (`best_score_` of a GridSearchCV configured with scoring='roc_auc').
model_roc_dict = {
    'Random\nForest': rf_pipeline.best_score_,
    'Logistic\nRegression': logistic_pipeline.best_score_,
    "Support Vector\nClassifier": svc_pipeline.best_score_,
}

names = list(model_roc_dict.keys())
values = list(model_roc_dict.values())

# Explicit Axes so the title and label attach to this figure only.
fig, ax = plt.subplots(figsize=(6, 2))

ax.bar(range(len(model_roc_dict)), values, tick_label=names)
# Label the metric so the chart can be read without the surrounding code.
ax.set_ylabel('Mean CV ROC AUC')
ax.set_title('Best cross-validated ROC AUC by model')
plt.show()
