## Import modules

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn import preprocessing, linear_model


## Data loading

In [2]:
random_state=809
# Load data from sources
data=pd.read_csv('./Lending Club/Lending_Club_Data.csv')

## Data description


## Data preprocessing

In [17]:
# Data cleaning and processing
data = data.fillna(0) # Fill na values with 0
for i in data.columns:  # Transform non-numeric types to string
    if data[i].dtypes == 'object':
        data[i] = data[i].astype('str')
# Enumerating categorical data
encoder = preprocessing.OrdinalEncoder()
encoder.fit(data)
data_encoded = encoder.transform(data)
# Dataframe creation
df = pd.DataFrame(data_encoded,columns=data.columns)
df = df.drop(["dti", "dti_joint", "emp_title", "dti_joint.1", "loan_id", "total_pymnt", "total_rec_int", "total_rec_late_fee", "total_rec_prncp", "recoveries"], axis=1)
variables = list(df.columns)
variables.remove('loan_status')
df.head(10)

Unnamed: 0,loan_status,loan_amnt,term,int_rate,sub_grade,installment,issue_d,annual_inc,emp_length,verification_status,...,revol_util,tax_liens,tot_cur_bal,tot_hi_cred_lim,total_acc,total_bal_ex_mort,total_bc_limit,revol_bal_joint,annual_inc_joint,verification_status_joint
0,1.0,1320.0,1.0,148.0,15.0,53744.0,0.0,38792.0,3.0,2.0,...,1083.0,0.0,333044.0,352334.0,29.0,144867.0,1008.0,0.0,0.0,0.0
1,0.0,760.0,1.0,112.0,12.0,30020.0,0.0,16885.0,3.0,2.0,...,924.0,0.0,125061.0,262323.0,29.0,30574.0,875.0,0.0,0.0,0.0
2,0.0,560.0,1.0,179.0,19.0,24962.0,0.0,27275.0,1.0,0.0,...,1188.0,0.0,13020.0,3148.0,5.0,13061.0,744.0,0.0,0.0,0.0
3,0.0,408.0,0.0,90.0,10.0,23288.0,0.0,32096.0,2.0,1.0,...,713.0,0.0,89648.0,78688.0,23.0,91867.0,435.0,0.0,0.0,0.0
4,0.0,623.0,0.0,69.0,8.0,35716.0,0.0,13341.0,11.0,0.0,...,325.0,0.0,15830.0,68466.0,13.0,15871.0,4067.0,0.0,0.0,0.0
5,0.0,1160.0,1.0,174.0,18.0,50840.0,0.0,22400.0,7.0,2.0,...,455.0,0.0,9677.0,15605.0,7.0,9717.0,1424.0,0.0,0.0,0.0
6,0.0,560.0,0.0,169.0,17.0,35958.0,0.0,33120.0,0.0,0.0,...,536.0,0.0,90505.0,110099.0,21.0,92790.0,232.0,0.0,0.0,0.0
7,0.0,560.0,0.0,1.0,0.0,29280.0,0.0,29364.0,2.0,0.0,...,689.0,0.0,109098.0,113261.0,22.0,112214.0,1721.0,0.0,0.0,0.0
8,0.0,361.0,0.0,256.0,33.0,26810.0,0.0,25396.0,11.0,0.0,...,838.0,0.0,40488.0,33881.0,7.0,40537.0,477.0,0.0,0.0,0.0
9,0.0,80.0,0.0,158.0,16.0,4003.0,0.0,3618.0,4.0,2.0,...,841.0,0.0,7003.0,1999.0,11.0,7043.0,134.0,0.0,0.0,0.0


## Create training and test sets

In [18]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['loan_status'], axis=1),
                                                    df['loan_status'], test_size=0.33,
                                                    random_state=random_state)

## LASSO and OLS regression


### Data standardization

In [19]:
scaler = StandardScaler()
X_std = pd.DataFrame(scaler.fit_transform(X_train), columns =X_train.columns)


### Alpha range definition


In [20]:
# Define alpha restriction to perform model selection
n_alphas = 1000
alphas = np.logspace(4.5, -5, n_alphas) # Generates n_alphas in log scale
alphas = alphas.tolist()

### Alpha selection


In [22]:
model = LassoCV(cv=10, verbose=True)
model.fit(X_std, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

LassoCV(cv=10, verbose=True)

Optimal alpha

In [23]:
f"{model.alpha_:.20f}"



'0.00013166397443272189'

Optimal model description

In [None]:
optimal_lasso = Lasso(alpha=model.alpha_)
optimal_lasso.fit(X_train, y_train)

OLS regression benchmark

In [15]:
OLS = linear_model.LinearRegression()
OLS.fit(X_train, y_train)

LinearRegression()

In [16]:
list_tuples = list(zip(variables, np.round(optimal_lasso.coef_, 5), np.round(OLS.coef_, 5)))
lasso_df = pd.DataFrame(list_tuples, columns =['Predictor', 'Lasso COEF', 'OLS coef'])

Lasso omitted predictors

In [252]:
# Omitted
lasso_df[lasso_df['Lasso COEF'] == 0]

Unnamed: 0,Predictor,Lasso COEF,OLS coef
0,recoveries,-0.0,-0.0
1,total_pymnt,0.0,0.0
2,total_rec_int,-0.0,-0.0
4,total_rec_prncp,0.0,0.0
11,annual_inc,0.0,0.0
17,zip_code,0.0,0.0
18,avg_cur_bal,-0.0,-0.0
19,delinq_2yrs,-0.0,0.00012
21,earliest_cr_line,0.0,0.0
25,num_accts_ever_120_pd,-0.0,8e-05


In [253]:
# Selected
lasso_df[lasso_df['Lasso COEF'] != 0]




Unnamed: 0,Predictor,Lasso COEF,OLS coef
3,total_rec_late_fee,-2e-05,-2e-05
5,loan_amnt,0.00053,0.00054
6,term,-0.23089,-0.23769
7,int_rate,0.0014,0.00143
8,sub_grade,-0.00117,-0.00122
9,installment,-2e-05,-2e-05
10,issue_d,9e-05,9e-05
12,emp_length,0.00092,0.00093
13,verification_status,-0.00071,-0.00157
14,purpose,0.00488,0.00485


In [254]:
#Lasso
print('R squared training set', round(optimal_lasso.score(X_train, y_train)*100, 2))
print('R squared test set', round(optimal_lasso.score(X_test, y_test)*100, 2))
# OLS
print('R squared training set', round(OLS.score(X_train, y_train)*100, 2))
print('R squared test set', round(OLS.score(X_test, y_test)*100, 2))



R squared training set 72.75
R squared test set 72.74
R squared training set 72.75
R squared test set 72.74
