# Technical notebook

### Lending Club Data Set

In [1]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

#SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# from loan_helper import data_cleaning
from loan_helper import data_converting
from loan_helper import column_description

warnings.filterwarnings("ignore")

In [None]:
!ls -lath LendingClub

- LendingClub data source:

https://www.lendingclub.com/info/download-data.action

In [2]:
#reading excel
description = pd.read_excel('LendingClub/LCDataDictionary.xlsx')

In [4]:
#reading Lending Club loan data from 2014
# data_lc = pd.read_csv('LendingClub/LoanStats3c_securev1.csv', low_memory=False, header=1)

In [3]:
#read in Lending Club loan data from zip file
from zipfile import ZipFile

# Context managers close both the archive and the member handle as soon as the
# frame is loaded; the original left the ZipFile open for the kernel's lifetime.
with ZipFile('LendingClub/LoanStats3c_securev1.csv.zip') as zip_file:
    with zip_file.open('LoanStats3c_securev1.csv') as csv_member:
        # header=1: column names are taken from the second line of the file.
        data_lc = pd.read_csv(csv_member, low_memory=False, header=1)

In [4]:
data_lc.loan_amnt.isna().sum()

2

In [5]:
# Keep only rows that carry a loan amount. The previous cell counted 2 rows with a
# missing loan_amnt; the original note describes them as entirely-NaN rows.
data_lc = data_lc.loc[data_lc.loan_amnt.notnull()]
data_lc.shape

(235629, 150)

## Feature selection and feature engineering

### Understanding the columns

In order to understand the columns we created a dataframe with column names, two examples, datatype, number of missing values, and the long description. The dataframe was exported to excel to make decision on columns. The result is stored in col_selection.xlsx.

In [6]:
desc = column_description(data_lc, description)

In [7]:
# Lift the column-width cap so the long Description text is not truncated in the preview.
# NOTE(review): recent pandas releases deprecate -1 for this option in favour of None —
# confirm against the installed pandas version before changing it.
pd.set_option('display.max_colwidth', -1) #this allows us to see the very long description, if exceeds 50 char
desc.head(2)

Unnamed: 0,col_name,exmp1,exmp2,dtype,Description,nan_counts
0,id,37662224.0,36804663.0,object,A unique LC assigned ID for the loan listing.,0
1,member_id,,,float64,A unique LC assigned Id for the borrower member.,235629


In [8]:
desc.to_excel('col_desc_2014.xlsx')

### First round feature selection

Originally the dataset contained 150 columns. To reduce the number of features and limit overfitting, we selected features according to the following criteria:

- Discarded columns that contained payment or collection information (47 columns)
- Discarded columns that contained information that were not available at the time of credit application
- Discarded features that require too much data processing (typically free input i.e. emp_title)
- Discarded redundant features (subgrade - grade, title - purpose)
- Discarded features that contain too many NaN values (mths_since_last_delinq, mths_since_recent_bc_dlq, mths_since_recent_revol_delinq)

In [9]:
col_selection = pd.read_excel('col_selection_2014.xlsx')

In [10]:
col_selection.head(3)

Unnamed: 0.1,Unnamed: 0,col_name,exmp1,exmp2,dtype,Description,nan_counts,Decision,Cause
0,0,id,37662224.0,36804663.0,object,A unique LC assigned ID for the loan listing.,0,remove,irrelevant
1,1,member_id,,,float64,A unique LC assigned Id for the borrower member.,235629,remove,irrelevant
2,2,loan_amnt,7650.0,23325.0,float64,"The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.",0,keep,


In [11]:
selected_col = col_selection.loc[col_selection.Decision == 'keep', 'col_name'].to_list()
len(selected_col)

59

In [12]:
dataset = data_lc.loc[:, selected_col]

In [13]:
dataset.shape

(235629, 59)

### Converting data types

- emp_length column was converted to numeric
- earliest credit line: convert date to numeric (years)
- revol_util (revolving utilization) convert to numeric
- creating regions from state
- reduce categories of loan purpose

In [14]:
# data_converting comes from loan_helper and is expected to perform the conversions
# listed in the markdown above (its implementation is not visible in this notebook).
# NOTE(review): the saved output of this cell is an AttributeError for
# 'mths_since_recent_bc_dlq', a column the first-round selection lists as discarded —
# presumably data_converting still reads it; verify in loan_helper before a fresh run.
dataset = data_converting(dataset)

AttributeError: 'DataFrame' object has no attribute 'mths_since_recent_bc_dlq'

In [15]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 235629 entries, 0 to 235628
Data columns (total 59 columns):
loan_amnt                     235629 non-null float64
term                          235629 non-null object
installment                   235629 non-null float64
grade                         235629 non-null int64
emp_length                    235629 non-null float64
home_ownership                235629 non-null object
annual_inc                    235629 non-null float64
verification_status           235629 non-null object
loan_status                   235629 non-null object
purpose                       235629 non-null object
dti                           235629 non-null float64
delinq_2yrs                   235629 non-null float64
earliest_cr_line              235629 non-null int64
fico_range_low                235629 non-null float64
inq_last_6mths                235629 non-null float64
open_acc                      235629 non-null float64
pub_rec                       2356

### Determining the target feature

In [None]:
dataset.loan_status.value_counts()

#### Meaning of the categories
<b>Fully paid:</b> Loan has been fully repaid, either at the expiration of the 3- or 5-year term or as a result of a prepayment.

<b>Current:</b> Loan is up to date on all outstanding payments. 

<b>In Grace Period:</b> Loan is past due but within the 15-day grace period. 

<b>Late (16-30):</b> Loan has not been current for 16 to 30 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.

<b>Late (31-120):</b> Loan has not been current for 31 to 120 days. Learn more about the tools LendingClub has to deal with delinquent borrowers.

<b>Default:</b> Loan has not been current for an extended period of time. Learn more about the difference between “default” and “charge off”.

<b>Charged Off:</b> Loan for which there is no longer a reasonable expectation of further payments. Upon Charge Off, the remaining principal balance of the Note is deducted from the account balance. Learn more about the difference between “default” and “charge off”.

Source: https://help.lendingclub.com/hc/en-us/articles/215488038-What-do-the-different-Note-statuses-mean-

In [None]:
pd.crosstab(columns=dataset['loan_status'], index=dataset['term'],)

We decided to ignore the 'gray' categories, where there might still be a chance of recovering the loan. The 'Current' category contains the 60-month term loans; removing them would penalize the long-term loans by increasing the default rate within this category.

In [None]:
#Select default categories:
dataset.loc[dataset.loan_status == 'Fully Paid', 'default'] = 0
dataset.loc[dataset.loan_status == 'Charged Off', 'default'] = 1
dataset.loc[dataset.loan_status == 'Current', 'default'] = 0

dataset = dataset.loc[dataset.default.notnull()]

In [None]:
#remove loan_status, default replace it
dataset = dataset.drop(columns='loan_status')

In [None]:
# Renumber the rows 0..n-1 after filtering; drop=True discards the old index
# instead of first materialising it as an 'index' column and then dropping it.
dataset = dataset.reset_index(drop=True)

In [None]:
# Share of each class, ordered by label (0 = non default, 1 = default) so the bars
# always line up with the tick labels. The original relied on value_counts()
# returning the majority class first, which would silently swap the labels if the
# classes were ever balanced differently.
class_share = dataset.default.value_counts(normalize=True).sort_index()

fig, ax = plt.subplots(figsize=(6, 5))
ax.bar(x=['non default', 'default'], height=class_share.values, width=0.6)
ax.set_title('The distribution of defaulted and non defaulted loans\n');

In [None]:
# dataset.head()

In [None]:
x_feats = dataset.columns.to_list()
x_feats.remove('default')

### Multicollinearity examination

In [None]:
# Categorical columns are left out of the multicollinearity check below.
categorical_feats = ['term', 'home_ownership', 'verification_status', 'purpose', 'region']

# Build a NEW list. The original `x_feats_cont = x_feats` only created a second name
# for the same list, so every .remove() also stripped the column from x_feats, and
# re-running the cell raised ValueError because the names were already gone.
x_feats_cont = [col for col in x_feats if col not in categorical_feats]

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Median-impute the continuous features so the VIF computation receives a matrix
# without missing values (copy / fill_value are left at their defaults).
imp_vif = SimpleImputer(strategy='median')
X = imp_vif.fit_transform(dataset[x_feats_cont])

In [None]:
# variance_inflation_factor regresses one column on the others exactly as given and
# does not add an intercept itself. Without a constant column the auxiliary
# regressions are forced through the origin and the VIFs are inflated for any
# feature with a non-zero mean, so the intercept is added explicitly here.
X_vif = sm.add_constant(X, prepend=True, has_constant='add')

# Column 0 is the intercept; features occupy columns 1..k in x_feats_cont order.
vif = [variance_inflation_factor(X_vif, i) for i in range(1, X_vif.shape[1])]
list(zip(x_feats_cont, vif))

In [None]:
# Numeric features retained for modelling.
# 'delinq_2yrs' appeared twice in the original list, which made dataset[x_feats] —
# and the dummy-encoded design matrix built from it — carry the same column twice
# (a perfectly collinear pair). It is listed once here.
x_feats = ['revol_util','revol_bal','fico_range_low','grade','installment','loan_amnt','emp_length','annual_inc','delinq_2yrs','dti','inq_last_6mths',
           'pub_rec','collections_12_mths_ex_med', 'tot_coll_amt', 'total_rev_hi_lim','acc_open_past_24mths',
           'avg_cur_bal','chargeoff_within_12_mths','delinq_amnt', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
          'mths_since_recent_bc', 'mths_since_recent_inq','num_accts_ever_120_pd','num_tl_120dpd_2m',
          'num_tl_30dpd','num_tl_90g_dpd_24m', 'num_tl_op_past_12m','percent_bc_gt_75', 'pub_rec_bankruptcies',
          'tax_liens']
# Categorical features; these are one-hot encoded by pd.get_dummies further down.
x_feats += ['term','home_ownership','verification_status','purpose','region']

In [None]:
x_feats

In [None]:
# x_feats is an explicit list (defined above) that does not contain 'level_0', so an
# unconditional .remove() raises ValueError on a fresh run. 'level_0' is presumably a
# leftover from an earlier session in which reset_index() was applied twice; the
# guard keeps the cell harmless either way.
if 'level_0' in x_feats:
    x_feats.remove('level_0')

## Preparing dataset for modeling

In [None]:
X = pd.get_dummies(dataset[x_feats], drop_first=True)

In [None]:
y = dataset.default

In [None]:
y.value_counts()

In [None]:
X.head()

In [None]:
X.shape

In [None]:
X.columns

### Train-Test Split

In [None]:
# stratify=y keeps the default / non-default ratio of the full dataset in both splits;
# random_state=10 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, stratify=y) # default test_size: 25% held out

In [None]:
y_train.mean()

In [None]:
y_test.mean()

### Scaling

In [None]:
# Standardise with statistics learned from the training split only.
# The original called scale(), which is never imported in this notebook (NameError on
# a fresh kernel), and standardised X_test with its OWN mean/std, so train and test
# ended up on different scales. StandardScaler (already imported) produces the same
# column-wise result for X_train and applies that same mapping to X_test.
norm_scaler = StandardScaler().fit(X_train)
X_train_norm = norm_scaler.transform(X_train)
X_test_norm = norm_scaler.transform(X_test)

The following scaling method assures that the variables of X_train are within a 0-1 range

In [None]:
# Learn each column's min/max on the training split only, then apply the same
# mapping to both splits.
# NOTE(review): X_train_scaled / X_test_scaled are reassigned by the StandardScaler
# and RobustScaler cells below, so on a top-to-bottom run the last scaler executed
# is the one the models actually see.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Scaling using StandardScaler

- For SMOTE

In [None]:
# X_train_imp / X_test_imp are not defined in any visible cell, so this cell raised
# NameError on a fresh kernel. Missing values are median-imputed here (same strategy
# as the VIF cell); the imputer is fitted on the training split only so no test-set
# statistics leak into the model.
imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit_transform(X_train)
X_test_imp = imputer.transform(X_test)

# Zero-mean / unit-variance scaling, again fitted on the training split only.
scaler = StandardScaler()
scaler.fit(X_train_imp)
X_train_scaled = scaler.transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

The following scaling method is robust to outliers

In [None]:
# X_train_imp / X_test_imp are not defined in any visible cell; build them here so
# the cell runs on a fresh kernel. Median imputation is fitted on the training split
# only and then applied unchanged to the test split.
imputer = SimpleImputer(strategy='median')
X_train_imp = imputer.fit_transform(X_train)
X_test_imp = imputer.transform(X_test)

# RobustScaler centres on the median and scales by the inter-quartile range.
scaler = RobustScaler()
scaler.fit(X_train_imp)
X_train_scaled = scaler.transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

## Objective: Increasing predictability of loan defaults from actual default

### Maximize the F1 score

In [None]:
def get_metric(y_train, y_probability):
    '''
    Find the probability cut-off that maximises the F1 score and report the result.

    inputs:
        y_train: true binary labels (0 = non default, 1 = default)
        y_probability: predicted probability of default for the same rows
    output:
        confusion matrix (DataFrame, rows = actual class, columns = predicted class)
        at the F1-maximising cut-off. Recall, precision, F1 score and the chosen
        cut-off are printed as a side effect.
    '''
    y_probability = np.asarray(y_probability)

    # Start from an all-zero prediction vector. The original started from the scalar 0,
    # so the metric calls below crashed whenever no cut-off achieved F1 > 0
    # (e.g. a sample without any positive label).
    best_f1 = 0
    best_cutoff = 0
    best_y_hat = np.zeros(len(y_probability), dtype=int)

    # Scan cut-offs 0.00, 0.01, ..., 1.00
    for cutoff in np.linspace(0, 1, 101):
        y_hat = (y_probability > cutoff) * 1
        f1 = f1_score(y_train, y_hat)
        # Strict comparison: on ties the lowest cut-off is kept
        if f1 > best_f1:
            best_f1 = f1
            best_cutoff = cutoff
            best_y_hat = y_hat

    print('Recall:', recall_score(y_train, best_y_hat))
    print('Precision:', precision_score(y_train, best_y_hat))
    print('F1_score:', best_f1)
    print('Cut_off:', best_cutoff)

    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
    # vectors; otherwise the DataFrame construction fails on a shape mismatch.
    conf_matrix = pd.DataFrame(confusion_matrix(y_train, best_y_hat, labels=[0, 1]),
                               index=['actual 0', 'actual 1'],
                               columns=['predicted 0', 'predicted 1'])
    return conf_matrix

## Model Selection - Logistic regression

### (I) Baseline: vanilla logistic regression w/o imbalance strategy

In [None]:
# Vanilla regression
# C is the inverse regularisation strength, so C=1e9 makes the default penalty
# negligible — this is the effectively unregularised baseline.
logreg_vanilla = LogisticRegression(C=1e9, solver='liblinear', max_iter=200)

model_vanilla = logreg_vanilla.fit(X_train_scaled, y_train)

In [None]:
y_probability = model_vanilla.predict_proba(X_train_scaled)[:,1]

In [None]:
get_metric(y_train, y_probability)

In [None]:
p, r, t = precision_recall_curve(y_train, model_vanilla.decision_function(X_train_scaled))

In [None]:
from inspect import signature

# Only pass `step` to fill_between when the installed matplotlib accepts it.
step_kwargs = ({'step': 'post'} if 'step' in signature(plt.fill_between).parameters else {})

# r (recall) is plotted on the x-axis and p (precision) on the y-axis.
plt.step(r, p, color='b', alpha=0.4, where='post')
plt.fill_between(r, p, color='b', alpha=0.4, **step_kwargs)
# The original first set xlabel to 'precision' and immediately overwrote it;
# only the correct label is kept.
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1])
plt.title('Precision-Recall curve');

### (II) Lasso regression with different C values (w/o imbalance strategy)

In [None]:
# C is the inverse regularisation strength.
C_values = [0.01, 0.1, 1, 10, 100, 1000, 10000]  # low value means high l1 penalty on coefficients

# Fit one L1-penalised model per C and print its metrics.
# The metrics are in-sample: the model is scored on the rows it was fitted on.
# get_metric's returned confusion matrix is discarded inside the loop; only the
# printed recall / precision / F1 / cut-off are shown.
for C in C_values:
    logreg_l1 = LogisticRegression(C=C, penalty='l1',
                                   solver='liblinear',
                                   max_iter=200)
    print('-'*40,f'\nLasso regression with C = {C}')
    model_l1 = logreg_l1.fit(X_train_scaled, y_train)
    y_probability = model_l1.predict_proba(X_train_scaled)[:,1]
    get_metric(y_train, y_probability)


### (III) Ridge regression with different C values (w/o imbalance strategy)

In [None]:
# C is the inverse regularisation strength.
C_values = [0.01, 0.1, 1, 10, 100, 1000, 10000]  # low value means high l2 penalty on coefficients

# Fit one L2-penalised model per C and print its metrics.
# The metrics are in-sample: the model is scored on the rows it was fitted on.
for C in C_values:
    logreg_l2 = LogisticRegression(C=C, penalty='l2',
                                   solver='newton-cg',
                                   max_iter=200)
    
    print('-'*40,f'\nRidge regression with C = {C}')
    model_l2 = logreg_l2.fit(X_train_scaled, y_train)
    y_probability = model_l2.predict_proba(X_train_scaled)[:,1]
    get_metric(y_train, y_probability)

### (IV) Cross-Validation (w/o imbalance strategy)

In [None]:
cv = StratifiedKFold(n_splits= 5, random_state=1000, shuffle=True)


#### Vanilla

In [None]:
lr_vanilla = LogisticRegression(C=1e9,
                                solver='newton-cg',
                                max_iter=200)


cv_vanilla = cross_validate(estimator=lr_vanilla,
                            X=X_train_scaled, y=y_train,
                            cv=cv,
                            n_jobs=-1,
                            return_train_score=True)

In [None]:
y_probability = cross_val_predict(lr_vanilla, X_train_scaled, y_train, cv=cv, method='predict_proba')[:,1]
get_metric(y_train, y_probability)

#### Ridge

In [None]:
l2_reg = LogisticRegression(C=1,
                            solver='newton-cg',
                            penalty='l2',
                            max_iter=200)

cv_l2 = cross_validate(estimator=l2_reg, X=X_train_scaled, y=y_train,
                       cv=cv,
                       n_jobs=-1,
                       return_estimator=True,
                       return_train_score=True)

In [None]:
y_probability = cross_val_predict(l2_reg, X_train_scaled, y_train, cv=cv, method='predict_proba')[:,1]
get_metric(y_train, y_probability)

#### Lasso

In [None]:
l1_reg = LogisticRegression(C=1,
                            solver='saga',
                            penalty='l1',
                            max_iter=200)
cv_l1 = cross_validate(estimator=l1_reg, X=X_train_scaled, y=y_train,
                       cv=cv,
                       n_jobs=-1,
                       return_estimator=True,
                       return_train_score=True)

In [None]:
y_probability = cross_val_predict(l1_reg, X_train_scaled, y_train, cv=cv, method='predict_proba')[:,1]
get_metric(y_train, y_probability)

## (V) Imbalance Strategy: Random Oversampling

## (VI) IMBALANCE STRATEGY: SMOTE

In [None]:
# Class distribution of the training split before oversampling. The original printed
# the full-dataset `y`, which is not what SMOTE is applied to.
print(y_train.value_counts())

# random_state makes the synthetic samples reproducible across runs.
# fit_resample is the current API; fit_sample was deprecated and later removed
# from imbalanced-learn.
smote = SMOTE(random_state=10)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Class distribution after oversampling.
print(pd.Series(y_train_smote).value_counts())

#### A) Baseline Vanilla

In [None]:
# Vanilla regression
logreg_vanilla = LogisticRegression(C=1e9, solver='liblinear', max_iter=200)

model_vanilla = logreg_vanilla.fit(X_train_smote, y_train_smote)

In [None]:
y_probability = model_vanilla.predict_proba(X_train_scaled)[:,1]

get_metric(y_train, y_probability)

In [None]:
# The SMOTE-trained baseline lives in `model_vanilla` (re-fitted on X_train_smote in the
# cell above); `model_vanilla_balance` is not defined in any visible cell and raised
# NameError on a fresh kernel.
y_probability = model_vanilla.predict_proba(X_test_scaled)[:,1]

# NOTE(review): get_metric picks the cut-off that maximises F1 on the data it is given,
# so here the threshold is tuned on the test set and the reported score is optimistic.
get_metric(y_test, y_probability)

- Visualization

In [None]:
# Uses `model_vanilla`, which at this point holds the model fitted on the SMOTE-resampled
# data; `model_vanilla_balance` is not defined in any visible cell (NameError on a fresh kernel).
p_balance, r_balance, t_balance = precision_recall_curve(y_train, model_vanilla.decision_function(X_train_scaled))

In [None]:
fig, axs = plt.subplots(1,2, figsize=(13,6))

step_kwargs = ({'step': 'post'} if 'step' in signature(plt.fill_between).parameters else {})


axs[0].fill_between(r, p, color='#8c8c8c', alpha=0.4, **step_kwargs)
axs[0].set(title='Imbalance Precision-Recall Curve', xlabel='Recall', ylabel='Precision', xlim=(0.0, 1), ylim=(0.0, 1.05))

axs[1].fill_between(r_balance, p_balance, color='r', alpha=0.4, **step_kwargs)
axs[1].set(title='Balanced Precision-Recall Curve', xlabel='Recall', ylabel='Precision', xlim=(0.0, 1), ylim=(0.0, 1.05))
# fig.savefig('Precision-recall curve')

plt.show()

####  B) Lasso regression with different C values

In [None]:
C_values = [0.01]  # low value means high l1 penalty on coefficients

for C in C_values:
    logreg_l1 = LogisticRegression(C=C, penalty='l1',
                                   solver='liblinear',
                                   max_iter=200)
    print('-'*40,f'\nLasso regression with C = {C}')
    model_l1 = logreg_l1.fit(X_train_smote, y_train_smote)
    y_probability = model_l1.predict_proba(X_train_scaled)[:,1]
    get_metric(y_train, y_probability)


#### C) Ridge regression with different C values 

In [None]:
# C is the inverse regularisation strength.
C_values = [0.01]  # low value means high l2 penalty on coefficients

# Train on the SMOTE-resampled data, then score on the original (un-resampled)
# training rows.
for C in C_values:
    logreg_l2 = LogisticRegression(C=C, penalty='l2',
                                   solver='newton-cg',
                                   max_iter=200)
    
    print('-'*40,f'\nRidge regression with C = {C}')
    model_l2 = logreg_l2.fit(X_train_smote, y_train_smote)
    y_probability = model_l2.predict_proba(X_train_scaled)[:,1]
    get_metric(y_train, y_probability)

#### D) Cross-Validation

In [None]:
cv = StratifiedKFold(n_splits= 5, random_state=1000, shuffle=True)


#### Vanilla

In [None]:
# Effectively unregularised logistic regression (C is the inverse penalty strength).
lr_vanilla = LogisticRegression(C=1e9,
                                solver='newton-cg',
                                max_iter=200)


# NOTE(review): X_train_smote was oversampled BEFORE being split into folds, so
# synthetic points interpolated from a validation-fold row can sit in the training
# folds. The fold scores returned here are therefore likely optimistic.
cv_vanilla = cross_validate(estimator=lr_vanilla,
                            X=X_train_smote, y=y_train_smote,
                            cv=cv,
                            n_jobs=-1,
                            return_train_score=True)

In [None]:
# The original passed the plain estimator and the un-resampled data, which simply
# repeats the section (IV) result and never applies SMOTE. Wrapping SMOTE and the
# model in an imbalanced-learn pipeline oversamples inside each training fold only;
# the held-out fold is predicted untouched, so the out-of-fold probabilities refer
# to the original training rows and line up with y_train.
smote_lr_vanilla = make_pipeline(SMOTE(random_state=10), lr_vanilla)
y_probability = cross_val_predict(smote_lr_vanilla, X_train_scaled, y_train, cv=cv, method='predict_proba')[:,1]
get_metric(y_train, y_probability)

#### Ridge

In [None]:
l2_reg = LogisticRegression(C=1,
                            solver='newton-cg',
                            penalty='l2',
                            max_iter=200)

cv_l2 = cross_validate(estimator=l2_reg, X=X_train_smote, y=y_train_smote,
                       cv=cv,
                       n_jobs=-1,
                       return_estimator=True,
                       return_train_score=True)

In [None]:
# The original passed the plain estimator and the un-resampled data, so SMOTE was
# never part of this evaluation. The pipeline oversamples inside each training fold
# only and predicts the untouched held-out fold, keeping the out-of-fold
# probabilities aligned with y_train.
smote_l2_reg = make_pipeline(SMOTE(random_state=10), l2_reg)
y_probability = cross_val_predict(smote_l2_reg, X_train_scaled, y_train, cv=cv, method='predict_proba')[:,1]
get_metric(y_train, y_probability)

#### Lasso

In [None]:
l1_reg = LogisticRegression(C=1,
                            solver='saga',
                            penalty='l1',
                            max_iter=200)
cv_l1 = cross_validate(estimator=l1_reg, X=X_train_smote, y=y_train_smote,
                       cv=cv,
                       n_jobs=-1,
                       return_estimator=True,
                       return_train_score=True)

In [None]:
# The original passed the plain estimator and the un-resampled data, so SMOTE was
# never part of this evaluation. The pipeline oversamples inside each training fold
# only and predicts the untouched held-out fold, keeping the out-of-fold
# probabilities aligned with y_train.
smote_l1_reg = make_pipeline(SMOTE(random_state=10), l1_reg)
y_probability = cross_val_predict(smote_l1_reg, X_train_scaled, y_train, cv=cv, method='predict_proba')[:,1]
get_metric(y_train, y_probability)