## Lending Dataset – Regularized Linear Regression

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn import metrics 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

### Step A: Read in Dataset and Display DataFrame Attributes

In [3]:
# Load the cleaned loans table from the working directory; the 'ID' column
# becomes the row index rather than a feature.
loan_data = pd.read_csv('loansDataClean.csv', index_col = 'ID')
# Preview the first rows to check the columns loaded as expected.
loan_data.head()

Unnamed: 0_level_0,IntRate,LoanAmt,LoanTerm,Debt2Inc,Home__MORT,Home__OWN,Home__RENT,MonthlyInc,RevCredBal,FICO,EmpLen
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
81174,8.9,20000,0,14.9,1,0,0,6541.67,14272.0,737,0
99592,12.12,19200,0,28.36,1,0,0,4583.33,11140.0,717,2
80059,21.98,35000,1,23.81,1,0,0,11500.0,21977.0,692,2
33182,11.71,12000,0,18.78,0,0,1,3195.0,14469.0,697,9
62403,15.31,6000,0,20.05,0,1,0,4891.67,10391.0,672,3


### Step B: Linear Regression Analysis on Best Model

In [4]:
# Response: the interest rate, kept as a one-column DataFrame.
Y = loan_data[['IntRate']]
# Predictors: every remaining column of the table.
X = loan_data.drop(columns=['IntRate'])

# Standardise the predictors with a single fit-and-transform pass.
# NOTE(review): the scaler is fitted on ALL rows, before any train/test split,
# so the later test splits have influenced the scaling statistics.
sc = StandardScaler()
X_std = sc.fit_transform(X)
X_std[:5]

array([[ 1.1276208 , -0.4745259 , -0.1056352 ,  1.14115464, -0.30263639,
        -0.9641138 ,  0.32047414,  0.02474645,  0.97751917, -1.53355396],
       [ 1.02010707, -0.4745259 ,  1.65831149,  1.14115464, -0.30263639,
        -0.9641138 , -0.29431579, -0.20378752,  0.37469848, -0.96624262],
       [ 3.14350321,  2.10736655,  1.06202936,  1.14115464, -0.30263639,
        -0.9641138 ,  1.87706358,  0.58696046, -0.37882739, -0.96624262],
       [ 0.05248351, -0.4745259 ,  0.40284275, -0.87630543, -0.30263639,
         1.03722195, -0.73016008,  0.03912104, -0.22812221,  1.0193471 ],
       [-0.75386945, -0.4745259 ,  0.56927754, -0.87630543,  3.30429529,
        -0.9641138 , -0.19751732, -0.25844012, -0.98164808, -0.68258694]])

In [5]:
# Add the intercept column to the standardised predictors. The summaries below
# list `const` first, i.e. it sits at column 0 and the ten predictors at
# columns 1-10 -- the index-based np.delete calls further down rely on that.
X_std = sm.add_constant(X_std)

In [6]:
# Model 0: OLS on the full standardised design matrix.
# X_std already carries a constant from the previous cell, so this
# add_constant call is expected to return it unchanged.
X_std0 = sm.add_constant(X_std)
loan_model0 = sm.OLS(Y, X_std0).fit()
loan_model0.summary()

0,1,2,3
Dep. Variable:,IntRate,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,593.5
Date:,"Sun, 20 Oct 2019",Prob (F-statistic):,0.0
Time:,16:34:35,Log-Likelihood:,-3771.3
No. Observations:,1752,AIC:,7563.0
Df Residuals:,1742,BIC:,7617.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.1208,0.050,262.957,0.000,13.023,13.219
x1,1.0979,0.062,17.676,0.000,0.976,1.220
x2,1.3223,0.054,24.460,0.000,1.216,1.428
x3,-0.1068,0.055,-1.930,0.054,-0.215,0.002
x4,-0.0873,0.031,-2.829,0.005,-0.148,-0.027
x5,0.0467,0.043,1.075,0.282,-0.038,0.132
x6,0.0607,0.030,2.035,0.042,0.002,0.119
x7,-0.0994,0.063,-1.568,0.117,-0.224,0.025
x8,-0.0919,0.060,-1.532,0.126,-0.209,0.026

0,1,2,3
Omnibus:,112.621,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,187.144
Skew:,0.492,Prob(JB):,2.3e-41
Kurtosis:,4.263,Cond. No.,6280000000000000.0


In [7]:
# Model 1: drop column 10 of the design matrix (presumably EmpLen, counting
# loan_data's predictor order with the constant at column 0 -- verify).
# NOTE(review): the index assumes X_std is still the full constant+10-predictor
# matrix; this cell fails if re-run after X_std is reassigned further down.
X_std1 = sm.add_constant(np.delete(X_std, [10], axis=1))
loan_model1 = sm.OLS(Y, X_std1).fit()
loan_model1.summary()

0,1,2,3
Dep. Variable:,IntRate,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,667.8
Date:,"Sun, 20 Oct 2019",Prob (F-statistic):,0.0
Time:,16:34:35,Log-Likelihood:,-3771.6
No. Observations:,1752,AIC:,7561.0
Df Residuals:,1743,BIC:,7610.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.1208,0.050,262.988,0.000,13.023,13.219
x1,1.1026,0.062,17.842,0.000,0.981,1.224
x2,1.3224,0.054,24.465,0.000,1.216,1.428
x3,-0.1075,0.055,-1.943,0.052,-0.216,0.001
x4,-0.0823,0.030,-2.728,0.006,-0.141,-0.023
x5,0.0460,0.043,1.059,0.290,-0.039,0.131
x6,0.0561,0.029,1.921,0.055,-0.001,0.113
x7,-0.1011,0.063,-1.596,0.111,-0.225,0.023
x8,-0.0890,0.060,-1.487,0.137,-0.206,0.028

0,1,2,3
Omnibus:,111.85,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,185.978
Skew:,0.489,Prob(JB):,4.12e-41
Kurtosis:,4.262,Cond. No.,5580000000000000.0


In [8]:
# Model 2: drop columns 5 and 10 (presumably Home__OWN and EmpLen, counting
# loan_data's predictor order with the constant at column 0 -- verify).
# NOTE(review): the indices assume X_std is still the full constant+10-predictor
# matrix; this cell fails if re-run after X_std is reassigned further down.
X_std2 = sm.add_constant(np.delete(X_std, [5, 10], axis=1))
loan_model2 = sm.OLS(Y, X_std2).fit()
loan_model2.summary()

0,1,2,3
Dep. Variable:,IntRate,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,667.8
Date:,"Sun, 20 Oct 2019",Prob (F-statistic):,0.0
Time:,16:34:35,Log-Likelihood:,-3771.6
No. Observations:,1752,AIC:,7561.0
Df Residuals:,1743,BIC:,7610.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.1208,0.050,262.988,0.000,13.023,13.219
x1,1.1026,0.062,17.842,0.000,0.981,1.224
x2,1.3224,0.054,24.465,0.000,1.216,1.428
x3,-0.1075,0.055,-1.943,0.052,-0.216,0.001
x4,-0.1645,0.095,-1.738,0.082,-0.350,0.021
x5,-0.0267,0.093,-0.286,0.775,-0.210,0.157
x6,-0.1011,0.063,-1.596,0.111,-0.225,0.023
x7,-0.0890,0.060,-1.487,0.137,-0.206,0.028
x8,-3.1254,0.052,-60.451,0.000,-3.227,-3.024

0,1,2,3
Omnibus:,111.85,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,185.978
Skew:,0.489,Prob(JB):,4.12e-41
Kurtosis:,4.262,Cond. No.,3.93


In [9]:
# Model 3: drop columns 5, 6 and 10 (presumably Home__OWN, Home__RENT and
# EmpLen, counting loan_data's predictor order with the constant at column 0
# -- verify).
# NOTE(review): the indices assume X_std is still the full constant+10-predictor
# matrix; this cell fails if re-run after X_std is reassigned further down.
X_std3 = sm.add_constant(np.delete(X_std, [5, 6, 10], axis=1))
loan_model3 = sm.OLS(Y, X_std3).fit()
loan_model3.summary()

0,1,2,3
Dep. Variable:,IntRate,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,763.6
Date:,"Sun, 20 Oct 2019",Prob (F-statistic):,0.0
Time:,16:34:35,Log-Likelihood:,-3771.6
No. Observations:,1752,AIC:,7559.0
Df Residuals:,1744,BIC:,7603.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.1208,0.050,263.057,0.000,13.023,13.219
x1,1.1026,0.062,17.846,0.000,0.981,1.224
x2,1.3223,0.054,24.470,0.000,1.216,1.428
x3,-0.1071,0.055,-1.937,0.053,-0.216,0.001
x4,-0.1419,0.052,-2.717,0.007,-0.244,-0.039
x5,-0.1015,0.063,-1.604,0.109,-0.226,0.023
x6,-0.0888,0.060,-1.484,0.138,-0.206,0.029
x7,-3.1247,0.052,-60.521,0.000,-3.226,-3.023

0,1,2,3
Omnibus:,111.73,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,185.466
Skew:,0.489,Prob(JB):,5.33e-41
Kurtosis:,4.259,Cond. No.,2.33


In [10]:
# Model 4: drop columns 5, 6, 8 and 10 (presumably Home__OWN, Home__RENT,
# RevCredBal and EmpLen, counting loan_data's predictor order with the constant
# at column 0 -- verify).
# NOTE(review): the indices assume X_std is still the full constant+10-predictor
# matrix; this cell fails if re-run after X_std is reassigned further down.
X_std4 = sm.add_constant(np.delete(X_std, [5, 6, 8, 10], axis=1))
loan_model4 = sm.OLS(Y, X_std4).fit()
loan_model4.summary()

0,1,2,3
Dep. Variable:,IntRate,R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.753
Method:,Least Squares,F-statistic:,889.9
Date:,"Sun, 20 Oct 2019",Prob (F-statistic):,0.0
Time:,16:34:35,Log-Likelihood:,-3772.7
No. Observations:,1752,AIC:,7559.0
Df Residuals:,1745,BIC:,7598.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.1208,0.050,262.966,0.000,13.023,13.219
x1,1.0884,0.061,17.824,0.000,0.969,1.208
x2,1.3228,0.054,24.472,0.000,1.217,1.429
x3,-0.1315,0.053,-2.490,0.013,-0.235,-0.028
x4,-0.1498,0.052,-2.883,0.004,-0.252,-0.048
x5,-0.1362,0.059,-2.314,0.021,-0.252,-0.021
x6,-3.1242,0.052,-60.492,0.000,-3.226,-3.023

0,1,2,3
Omnibus:,109.817,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,181.065
Skew:,0.484,Prob(JB):,4.81e-40
Kurtosis:,4.242,Cond. No.,2.06


### Use sklearn LinearRegression to run linear regression

In [11]:
# Cross-validated R^2 (8 folds) of the reduced model selected above.
#
# With an integer `cv` and a regressor, scikit-learn builds an unshuffled
# KFold, so the folds -- and therefore the scores -- are identical on every
# call. The former 128-iteration loop just averaged 128 copies of the same
# number; evaluating once gives the same mean (up to floating-point rounding)
# at 1/128 of the cost. For genuinely repeated CV, pass a shuffled splitter
# with a different seed per repetition instead.
X_std = X_std4  # later cells split and fit on this reduced design matrix
lr = LinearRegression()
scores = cross_val_score(lr, X_std, Y, cv=8)
print('Mean R-Squared Score:', scores.mean())

Mean R-Squared Score: 0.7499595880630956


#### random_state=33

In [12]:
# Hold out 20% of the rows (seed 33), fit ordinary least squares on the rest,
# and preview the first five test-set predictions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.2, random_state=33
)
lr = LinearRegression()
model = lr.fit(X_train, Y_train)  # fit() returns the estimator itself
Y_pred = model.predict(X_test)
preds = Y_pred[:5]
preds

array([[15.64255519],
       [ 6.59548795],
       [13.82204776],
       [ 7.82483241],
       [13.8250073 ]])

In [13]:
# Actual interest rates for the first five test rows, to compare with the predictions.
Y_test.head(5).values

array([[14.33],
       [ 5.79],
       [13.67],
       [ 8.9 ],
       [13.49]])

In [14]:
# Fitted slope coefficients. The leading entry presumably belongs to the
# constant column carried in X_std; LinearRegression fits its own intercept,
# which would explain that coefficient being 0.
model.coef_

array([[ 0.        ,  1.08346743,  1.29230826, -0.0921673 , -0.15510927,
        -0.15126294, -3.12942754]])

In [15]:
# Intercept fitted by LinearRegression for this split.
model.intercept_

array([13.13233964])

In [16]:
# Coefficient of determination on the current test split.
r2_test = model.score(X_test, Y_test)
print('R-Squared Score:', r2_test)

R-Squared Score: 0.7833961003750468


In [17]:
# Mean squared error of the predictions on the current test split.
mse_test = metrics.mean_squared_error(Y_test, Y_pred)
print('MSE:', mse_test)

MSE: 4.230840380528938


#### random_state=3

In [18]:
# Hold out 20% of the rows (seed 3), fit ordinary least squares on the rest,
# and preview the first five test-set predictions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.2, random_state=3
)
lr = LinearRegression()
model = lr.fit(X_train, Y_train)  # fit() returns the estimator itself
Y_pred = model.predict(X_test)
preds = Y_pred[:5]
preds

array([[11.06552759],
       [13.77971589],
       [12.59468905],
       [ 6.80612659],
       [ 9.26459797]])

In [19]:
# Actual interest rates for the first five test rows, to compare with the predictions.
Y_test.head(5).values

array([[ 8.49],
       [16.29],
       [14.09],
       [ 6.62],
       [ 7.88]])

In [20]:
# Fitted slope coefficients. The leading entry presumably belongs to the
# constant column carried in X_std; LinearRegression fits its own intercept,
# which would explain that coefficient being 0.
model.coef_

array([[ 0.        ,  1.06963748,  1.2823738 , -0.13660673, -0.10849553,
        -0.13872206, -3.16959378]])

In [21]:
# Intercept fitted by LinearRegression for this split.
model.intercept_

array([13.11982758])

In [22]:
# Coefficient of determination on the current test split.
r2_test = model.score(X_test, Y_test)
print('R-Squared Score:', r2_test)

R-Squared Score: 0.7489953851271478


In [23]:
# Mean squared error of the predictions on the current test split.
mse_test = metrics.mean_squared_error(Y_test, Y_pred)
print('MSE:', mse_test)

MSE: 4.680107072861487


#### random_state=13

In [24]:
# Hold out 20% of the rows (seed 13), fit ordinary least squares on the rest,
# and preview the first five test-set predictions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.2, random_state=13
)
lr = LinearRegression()
model = lr.fit(X_train, Y_train)  # fit() returns the estimator itself
Y_pred = model.predict(X_test)
preds = Y_pred[:5]
preds

array([[13.64000738],
       [14.45046533],
       [12.04869505],
       [13.85777977],
       [15.35512902]])

In [25]:
# Actual interest rates for the first five test rows, to compare with the predictions.
Y_test.head(5).values

array([[14.09],
       [17.77],
       [11.49],
       [14.09],
       [17.77]])

In [26]:
# Fitted slope coefficients. The leading entry presumably belongs to the
# constant column carried in X_std; LinearRegression fits its own intercept,
# which would explain that coefficient being 0.
model.coef_

array([[ 0.        ,  1.04677445,  1.3520537 , -0.17063715, -0.13697887,
        -0.1260454 , -3.17444375]])

In [27]:
# Intercept fitted by LinearRegression for this split.
model.intercept_

array([13.13996531])

In [28]:
# Coefficient of determination on the current test split.
r2_test = model.score(X_test, Y_test)
print('R-Squared Score:', r2_test)

R-Squared Score: 0.7465918616275109


In [29]:
# Mean squared error of the predictions on the current test split.
mse_test = metrics.mean_squared_error(Y_test, Y_pred)
print('MSE:', mse_test)

MSE: 4.226985637985071


#### random_state=23

In [30]:
# Hold out 20% of the rows (seed 23), fit ordinary least squares on the rest,
# and preview the first five test-set predictions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std, Y, test_size=0.2, random_state=23
)
lr = LinearRegression()
model = lr.fit(X_train, Y_train)  # fit() returns the estimator itself
Y_pred = model.predict(X_test)
preds = Y_pred[:5]
preds

array([[13.73591382],
       [15.07196615],
       [15.18774707],
       [10.45677232],
       [15.09663893]])

In [31]:
# Actual interest rates for the first five test rows, to compare with the predictions.
Y_test.head(5).values

array([[16.29],
       [13.67],
       [17.77],
       [ 7.9 ],
       [14.61]])

In [32]:
# Fitted slope coefficients. The leading entry presumably belongs to the
# constant column carried in X_std; LinearRegression fits its own intercept,
# which would explain that coefficient being 0.
model.coef_

array([[ 0.        ,  1.12237293,  1.3481246 , -0.16711445, -0.15438311,
        -0.21764075, -3.19576666]])

In [33]:
# Intercept fitted by LinearRegression for this split.
model.intercept_

array([13.14614919])

In [34]:
# Coefficient of determination on the current test split.
r2_test = model.score(X_test, Y_test)
print('R-Squared Score:', r2_test)

R-Squared Score: 0.713570896220894


In [35]:
# Mean squared error of the predictions on the current test split.
mse_test = metrics.mean_squared_error(Y_test, Y_pred)
print('MSE:', mse_test)

MSE: 4.587845716372342


### Step C: Ridge Regression Analysis on Best Model, alpha = 0

In [36]:
# Ridge with alpha=0 applies no penalty, so this fit should reproduce the
# ordinary least-squares results of the random_state=33 split above.
X_train, X_test, Y_train, Y_test = train_test_split(X_std, Y, random_state=33, test_size=0.2)

# `normalize=True` was dropped from the constructor call:
#   * with alpha=0 there is no penalty, so feature scaling cannot change the
#     fitted predictions, and X_std is already standardised anyway;
#   * the `normalize` argument was deprecated in scikit-learn 1.0 and removed
#     in 1.2, where passing it raises TypeError.
ridgemod = Ridge(alpha=0)
ridgemod.fit(X_train, Y_train)
Y_pred = ridgemod.predict(X_test)

In [37]:
# First five ridge predictions on the test split, for comparison with the actuals.
preds = Y_pred[:5]
preds

array([[15.64255519],
       [ 6.59548795],
       [13.82204776],
       [ 7.82483241],
       [13.8250073 ]])

In [38]:
# Actual interest rates for the same five test rows.
acts = Y_test.head(5).values
acts

array([[14.33],
       [ 5.79],
       [13.67],
       [ 8.9 ],
       [13.49]])

In [39]:
# Ridge (alpha=0) slope coefficients. The leading entry presumably belongs to
# the constant column carried in X_std, made redundant by the fitted intercept.
ridgemod.coef_

array([[ 0.        ,  1.08346743,  1.29230826, -0.0921673 , -0.15510927,
        -0.15126294, -3.12942754]])

In [40]:
# Intercept fitted by the alpha=0 ridge model.
ridgemod.intercept_

array([13.13233964])

In [41]:
# Coefficient of determination of the alpha=0 ridge model on the test split.
r2_test = ridgemod.score(X_test, Y_test)
print('R-Squared Score:', r2_test)

R-Squared Score: 0.7833961003750467


In [42]:
# Test-split mean squared error of the alpha=0 ridge model.
mse_test = metrics.mean_squared_error(Y_test, Y_pred)
print("Ridge alpha=0 MSE", mse_test)

Ridge alpha=0 MSE 4.230840380528939


### Step D: Ridge Regression on Best Model using Alpha Range

In [43]:
# Candidate regularisation strengths for RidgeCV, from very weak (0.01) to strong (100).
alpha_range = [
    0.01, 0.1, 0.25, 0.5, 0.75,
    1, 2.5, 5, 7.5,
    10, 25, 50, 75, 100,
]
print("RidgeCV alpha array", alpha_range)


RidgeCV alpha array [0.01, 0.1, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10, 25, 50, 75, 100]


In [44]:
# Choose alpha from `alpha_range` by cross-validated (negated) MSE, with `cv`
# left at its default, then predict on the held-out split.
ridcvmod = RidgeCV(alphas=alpha_range, scoring='neg_mean_squared_error')
ridcvmod.fit(X_train, Y_train)
Y_pred = ridcvmod.predict(X_test)
Y_pred[:5]

array([[15.62927581],
       [ 6.62000622],
       [13.8216243 ],
       [ 7.83973335],
       [13.82528628]])

In [45]:
# Regularisation strength chosen by RidgeCV.
best_alpha = ridcvmod.alpha_
print("RidgeCV Regression Best Alpha", best_alpha)

RidgeCV Regression Best Alpha 5.0


In [46]:
# Coefficients of the RidgeCV model refit at the selected alpha.
ridge_coefs = ridcvmod.coef_
print("RidgeCV Regression Coefficients", ridge_coefs)

RidgeCV Regression Coefficients [[ 0.          1.07866166  1.2893413  -0.08875515 -0.15597031 -0.14924822
  -3.11709172]]


In [47]:
# Coefficient of determination of the RidgeCV model on the test split.
r2_test = ridcvmod.score(X_test, Y_test)
print("RidgeCV Regression Test R-squared", r2_test)

RidgeCV Regression Test R-squared 0.7833363777933967


In [48]:
# Test-split mean squared error of the RidgeCV model.
# Arguments are passed as (y_true, y_pred), matching the function's signature
# and the earlier MSE cells; the original call had them reversed. MSE is
# symmetric, so the printed value is unchanged.
print("RidgeCV Regression MSE", metrics.mean_squared_error(Y_test, Y_pred))

RidgeCV Regression MSE 4.232006918668424


### Step E: Lasso Regression on Best Model 

In [49]:
# Rebuild the data for Lasso on the reduced ("best") feature set: a 1-D target
# array and the raw, unscaled predictors minus the four excluded columns.
# NOTE: this overwrites the notebook-level X, Y and the train/test split.
Y = loan_data[['IntRate']].values.ravel()
X = loan_data.drop(
    columns=['IntRate', 'EmpLen', 'Home__OWN', 'Home__RENT', 'RevCredBal']
)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=33
)

In [50]:
# Cross-validated Lasso over an automatically generated grid of 100 alphas,
# then predictions on the held-out split.
# NOTE(review): `normalize=True` was deprecated in scikit-learn 1.0 and removed
# in 1.2; on current releases this call raises TypeError. Replacing it with an
# explicit scaling step changes the alpha scale and the units of coef_, so it
# is left as written here -- confirm the target scikit-learn version.
lascvmod = LassoCV(n_alphas=100, normalize=True, random_state=33).fit(X_train, Y_train)
Y_pred = lascvmod.predict(X_test)
Y_pred[:5]



array([15.54853096,  6.82117272, 13.82631855,  7.73871915, 13.92975875])

In [51]:
# Regularisation strength chosen by LassoCV.
best_alpha = lascvmod.alpha_
print("LassoCV Regression Best Alpha", best_alpha)

LassoCV Regression Best Alpha 0.0012330526262094902


In [52]:
# Coefficients of the LassoCV model at the selected alpha.
lasso_coefs = lascvmod.coef_
print("LassoCV Regression Coefficients", lasso_coefs)

LassoCV Regression Coefficients [ 1.33554702e-04  3.28515373e+00 -1.76193619e-03 -2.53930698e-01
 -2.11660218e-05 -9.26951262e-02]


In [53]:
# Coefficient of determination of the LassoCV model on the test split.
r2_test = lascvmod.score(X_test, Y_test)
print("LassoCV Regression Test R-squared", r2_test)

LassoCV Regression Test R-squared 0.7815793804623784


In [54]:
# Test-split mean squared error of the LassoCV model.
# Arguments are passed as (y_true, y_pred), matching the function's signature
# and the earlier MSE cells; the original call had them reversed. MSE is
# symmetric, so the printed value is unchanged.
print("LassoCV Regression MSE", metrics.mean_squared_error(Y_test, Y_pred))

LassoCV Regression MSE 4.266325669482356


### Step F: Lasso Regression on Full Model

In [55]:
# Rebuild the data for Lasso on the FULL feature set: a 1-D target array and
# every raw, unscaled predictor column.
# NOTE: this overwrites the notebook-level X, Y and the train/test split.
Y = loan_data[['IntRate']].values.ravel()
X = loan_data.drop(columns=['IntRate'])
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=33
)


In [56]:
# Cross-validated Lasso over an automatically generated grid of 100 alphas,
# fitted on all predictors, then predictions on the held-out split.
# NOTE(review): `normalize=True` was deprecated in scikit-learn 1.0 and removed
# in 1.2; on current releases this call raises TypeError. Replacing it with an
# explicit scaling step changes the alpha scale and the units of coef_, so it
# is left as written here -- confirm the target scikit-learn version.
lascvmod = LassoCV(n_alphas=100, normalize=True, random_state=33).fit(X_train, Y_train)
Y_pred = lascvmod.predict(X_test)
Y_pred[:5]




array([15.51382685,  6.89377613, 13.77691187,  7.7324606 , 13.97114691])

In [57]:
# Regularisation strength chosen by LassoCV on the full feature set.
best_alpha = lascvmod.alpha_
print("LassoCV Regression Best Alpha", best_alpha)

LassoCV Regression Best Alpha 0.001747820239180997


In [58]:
# Coefficients at the selected alpha; exact zeros mark features the Lasso dropped.
lasso_coefs = lascvmod.coef_
print("LassoCV Regression Coefficients", lasso_coefs)

LassoCV Regression Coefficients [ 1.29658624e-04  3.26121752e+00 -0.00000000e+00 -2.19507287e-01
  9.12452316e-03  0.00000000e+00 -9.04833708e-06 -1.72881333e-06
 -9.21549379e-02  0.00000000e+00]


In [59]:
# Coefficient of determination of the full-feature LassoCV model on the test split.
r2_test = lascvmod.score(X_test, Y_test)
print("LassoCV Regression Test R-squared", r2_test)

LassoCV Regression Test R-squared 0.781260836355689


In [60]:
# Test-split mean squared error of the full-feature LassoCV model.
# Arguments are passed as (y_true, y_pred), matching the function's signature
# and the earlier MSE cells; the original call had them reversed. MSE is
# symmetric, so the printed value is unchanged.
print("LassoCV Regression MSE", metrics.mean_squared_error(Y_test, Y_pred))

LassoCV Regression MSE 4.272547668587149
