In [1]:
import pandas as pd 
import numpy as np 
import scipy as sp 
import mglearn 
import sklearn 
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the Boston housing data set into a dict-like object (see boston.keys()).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on an older scikit-learn — confirm the pinned version.
from sklearn.datasets import load_boston 
boston = load_boston()

In [5]:
# List the fields exposed by the loaded data set object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [6]:
# Check the dimensions of the feature matrix: (number of samples, number of features).
boston.data.shape 

(506, 13)

In [14]:
# List the names of the feature columns.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [9]:
# Peek at the first three rows of the feature matrix.
boston.data[0:3]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00]])

In [11]:
# Peek at the first three target values.
boston.target[0:3]

array([24. , 21.6, 34.7])

In [12]:
# Split the original 13-feature data into training and testing sets.
# random_state is fixed so the split is reproducible; the split ratio is
# train_test_split's default.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    random_state=16,
)

In [15]:
# Fit a plain linear regression on the original features.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained;
# the bare name on the last line keeps the estimator repr as the cell output.
lr1 = LinearRegression().fit(X_train, y_train)
lr1

LinearRegression()

In [17]:
# Report r-sq (the value returned by .score) for lr1 on both splits.
train_r2 = lr1.score(X_train, y_train)
test_r2 = lr1.score(X_test, y_test)
print('training set r-sq: {}'.format(train_r2))
print('testing set r-sq: {}'.format(test_r2))

training set r-sq: 0.7458872502342924
testing set r-sq: 0.674989394981633


# The r-sq values for the training set and the testing set are fairly close (0.75 vs. 0.67), suggesting little overfitting.

In [18]:
# Load the extended Boston data from mglearn: the original features plus
# two-feature interaction terms. Returns the feature matrix and the target.
boston_int_X, boston_int_y = mglearn.datasets.load_extended_boston()

In [22]:
# 104 columns = 13 original features + 91 two-feature products.
# 91 = 13 + 12 + ... + 1, i.e. every pair of features including a feature with itself.
boston_int_X.shape

(506, 104)

In [23]:
# Split the extended (104-feature) data into training and testing sets.
# NOTE(review): random_state here (26) differs from the split of the original
# data (16), so the two experiments are evaluated on different train/test rows.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    boston_int_X,
    boston_int_y,
    random_state=26,
)

In [24]:
# Fit a plain linear regression on the extended (interaction) features.
# fit() returns the estimator, so the chained form yields the same fitted model;
# the bare name on the last line keeps the estimator repr as the cell output.
lr2 = LinearRegression().fit(X2_train, y2_train)
lr2

LinearRegression()

In [25]:
# Report r-sq for lr2 (linear regression on the extended features) on both splits.
train_r2 = lr2.score(X2_train, y2_train)
test_r2 = lr2.score(X2_test, y2_test)
print('training set r-sq: {}'.format(train_r2))
print('testing set r-sq: {}'.format(test_r2))

training set r-sq: 0.9417850993754255
testing set r-sq: 0.6270418649693665


# The large gap between the training r-sq (0.94) and the testing r-sq (0.63) indicates overfitting.

In [27]:
# Fit ridge regression (default alpha) on the original Boston features,
# i.e. without the feature-interaction columns.
from sklearn.linear_model import Ridge
ridge1 = Ridge().fit(X_train, y_train)

In [28]:
# r-sq on the training and testing sets for the ridge model fitted on the
# original features (ridge1).
# Label typo fixed ('traing' -> 'training') so it matches the other score cells.
print('training set r-sq: {}'.format(ridge1.score(X_train, y_train)))
print('testing set r-sq: {}'.format(ridge1.score(X_test, y_test)))

traing set r-sq: 0.7441353466870264
testing set r-sq: 0.6659051448391575


# Essentially no change compared to the linear regression model (0.744 / 0.666 here vs. 0.746 / 0.675).

In [29]:
# Fit ridge regression (default alpha) on the extended Boston data,
# which includes the two-feature interaction columns.
ridge2 = Ridge().fit(X2_train, y2_train)

In [31]:
# r-sq (rounded to 2 decimals) on the training and testing sets for the ridge
# model fitted on the extended features (ridge2).
# Label typo fixed ('traing' -> 'training') so it matches the other score cells.
print('training set r-sq: {:.2f}'.format(ridge2.score(X2_train, y2_train)))
print('testing set r-sq: {:.2f}'.format(ridge2.score(X2_test, y2_test)))

traing set r-sq: 0.86
testing set r-sq: 0.85


In [33]:
# Count how many coefficients of ridge2 are non-zero, i.e. how many features
# the model actually uses.
n_used = np.count_nonzero(ridge2.coef_)
print('number of features used: {}'.format(n_used))

number of features used: 104


In [42]:
# The default setting uses alpha = 1. Try a larger alpha (stronger regularization)
# to reduce model complexity and see whether generalization improves.
ridge10 = Ridge(alpha=10).fit(X2_train, y2_train)

In [43]:
# r-sq for the alpha=10 ridge model: training set on the first line,
# testing set on the second.
print(
    ridge10.score(X2_train, y2_train),
    ridge10.score(X2_test, y2_test),
    sep='\n',
)

0.7579145914920675
0.7426775662712821


# The higher alpha lowers the r-sq values on both the training and testing sets (0.76 / 0.74 vs. 0.86 / 0.85 with alpha = 1), suggesting underfitting.

In [35]:
# Fit lasso regression (default alpha) on the original Boston data set,
# which has 13 features (see boston.data.shape).
from sklearn.linear_model import Lasso
lasso1 = Lasso().fit(X_train, y_train)

In [36]:
# Report r-sq for lasso1 (lasso on the original features) on both splits.
train_r2 = lasso1.score(X_train, y_train)
test_r2 = lasso1.score(X_test, y_test)
print('training set r-sq: {}'.format(train_r2))
print('testing set r-sq: {}'.format(test_r2))

training set r-sq: 0.6954083517897602
testing set r-sq: 0.6102987404698923


In [37]:
# Count the features lasso1 actually uses, i.e. those whose coefficient is non-zero.
np.sum(lasso1.coef_ !=0)

10

In [38]:
# Fit lasso regression (default alpha) on the extended Boston data set:
# original features plus interaction features (104 in total).
lasso2 = Lasso().fit(X2_train, y2_train)

In [39]:
# Report r-sq for lasso2 (lasso on the extended features) on both splits.
train_r2 = lasso2.score(X2_train, y2_train)
test_r2 = lasso2.score(X2_test, y2_test)
print('training set r-sq: {}'.format(train_r2))
print('testing set r-sq: {}'.format(test_r2))

training set r-sq: 0.2584676947577571
testing set r-sq: 0.23375027911619994


# The very low r-sq values on both the training set and the testing set suggest underfitting.

In [56]:
# Count the features lasso2 actually uses, i.e. those whose coefficient is non-zero.
np.sum(lasso2.coef_ != 0)

2

# Only 2 of the 104 features have non-zero coefficients, which is the cause of the underfitting.

In [54]:
# Try a lower alpha to increase model complexity, so that more features are kept in the model.
# NOTE(review): max_iter is raised to 100000, presumably so the solver converges
# at this smaller alpha — confirm.
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X2_train, y2_train)

In [55]:
# r-sq for the alpha=0.01 lasso model: training set on the first line,
# testing set on the second.
print(
    lasso001.score(X2_train, y2_train),
    lasso001.score(X2_test, y2_test),
    sep='\n',
)

0.8793119562377951
0.8572291612670798


In [57]:
# Count the features lasso001 actually uses, i.e. those whose coefficient is non-zero.
np.sum(lasso001.coef_ != 0)

33

A lower alpha increases model complexity (33 features are used instead of 2) and results in a better fit on both the training and testing sets.

In [79]:
# Weaken the penalty further (alpha=0.001) and refit on the extended features.
lasso0001 = Lasso(alpha=0.001, max_iter=100000)
lasso0001.fit(X2_train, y2_train)

# r-sq: training set on the first line, testing set on the second.
print(
    lasso0001.score(X2_train, y2_train),
    lasso0001.score(X2_test, y2_test),
    sep='\n',
)

0.9300716024941037
0.7920736003201351


In [80]:
# Count the features lasso0001 actually uses, i.e. those whose coefficient is non-zero.
np.sum(lasso0001.coef_ != 0)

65

Decreasing alpha further (to 0.001) results in overfitting: more features are used in the model (65), the training r-sq rises to 0.93, but the testing r-sq drops to 0.79.