In [2]:
from sklearn import datasets
import pandas as pd

boston = datasets.load_boston()
bostonDF = pd.DataFrame(boston.data, columns = boston.feature_names)
bostonDF['target'] = boston.target
print(bostonDF.shape)
print(bostonDF.head(5))

(506, 14)
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  target  
0     15.3  396.90   4.98    24.0  
1     17.8  396.90   9.14    21.6  
2     17.8  392.83   4.03    34.7  
3     18.7  394.63   2.94    33.4  
4     18.7  396.90   5.33    36.2  


In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(bostonDF.drop('target', axis=1), bostonDF['target'],
                                                   test_size = 0.2, random_state = 1)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_train.shape)

(404, 13) (404,)
(102, 13) (404,)
         CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS   RAD    TAX  \
42    0.14150   0.0   6.91   0.0  0.448  6.169   6.6  5.7209   3.0  233.0   
58    0.15445  25.0   5.13   0.0  0.453  6.145  29.2  7.8148   8.0  284.0   
385  16.81180   0.0  18.10   0.0  0.700  5.277  98.1  1.4261  24.0  666.0   
78    0.05646   0.0  12.83   0.0  0.437  6.232  53.7  5.0141   5.0  398.0   
424   8.79212   0.0  18.10   0.0  0.584  5.565  70.6  2.0635  24.0  666.0   
..        ...   ...    ...   ...    ...    ...   ...     ...   ...    ...   
255   0.03548  80.0   3.64   0.0  0.392  5.876  19.1  9.2203   1.0  315.0   
72    0.09164   0.0  10.81   0.0  0.413  6.065   7.8  5.2873   4.0  305.0   
396   5.87205   0.0  18.10   0.0  0.693  6.405  96.0  1.6768  24.0  666.0   
235   0.33045   0.0   6.20   0.0  0.507  6.086  61.5  3.6519   8.0  307.0   
37    0.08014   0.0   5.96   0.0  0.499  5.850  41.5  3.9342   5.0  279.0   

     PTRATIO       B  LSTAT  
42      17.

# Linear Regression Algorithm
# Validation Set Approach

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np


# 1. Build The Model
# 2. model.fit(features, target)
# 3. model.predict(testSetFeature)


linRegModel = LinearRegression()
saveModel = linRegModel.fit(X_train, Y_train)

print('w0 = ', linRegModel.intercept_)
print('w1 ... w13 = ', saveModel.coef_)
print()
print()

Y_train_predicted = linRegModel.predict(X_train)
rmse = np.sqrt( mean_squared_error(Y_train, Y_train_predicted) )
print('RMSE score in Training set: {}'.format(rmse))
r2 = r2_score(Y_train, Y_train_predicted)
print( 'R^2 score in Training Set: {}'.format(r2) )
print()

Y_test_predicted = linRegModel.predict(X_test)
rmse = np.sqrt( mean_squared_error(Y_test, Y_test_predicted) )
print('RMSE score in Test Set: {}'.format(rmse))
r2 = r2_score(Y_test, Y_test_predicted)
print('R^2 score in Test Set: {}'.format(r2))
print()



w0 =  42.93352585337733
w1 ... w13 =  [-1.12386867e-01  5.80587074e-02  1.83593559e-02  2.12997760e+00
 -1.95811012e+01  3.09546166e+00  4.45265228e-03 -1.50047624e+00
  3.05358969e-01 -1.11230879e-02 -9.89007562e-01  7.32130017e-03
 -5.44644997e-01]


RMSE score in Training set: 4.675766751547773
R^2 score in Training Set: 0.7293585058196337

RMSE score in Test Set: 4.835373458200553
R^2 score in Test Set: 0.7634174432138457



# Cross Validation Approach 

In [12]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

folds = KFold(n_splits = 4, shuffle = True, random_state = 1)

#cross_val_score(model, X, Y, cv=folds, scoring='neg_mean_absolute_error')

crossValScore = cross_val_score(linRegModel, X_train, Y_train, cv = folds, scoring = 'neg_mean_squared_error')
print(crossValScore) #score at each folds
meanCrossValScore = crossValScore.mean()
print('Cross Validation Score with MSE in Training Set: {}'.format(meanCrossValScore))
crossValScore = cross_val_score(linRegModel, X_train, Y_train, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Training Set: {}'.format(crossValScore.mean()))
print()


crossValScore = cross_val_score(linRegModel, X_test, Y_test, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Test Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(linRegModel, X_test, Y_test, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Test Set: {}'.format(crossValScore.mean()))
print()

[-20.83773151 -15.41295672 -29.78012867 -32.08728107]
Cross Validation Score with MSE in Training Set: -24.529524493812357
Cross Validation Score with R^2 in Training Set: 0.6989588876671124

Cross Validation Score with MSE in Test Set: -28.662374571168268
Cross Validation Score with R^2 in Test Set: 0.6766217788953366



# Ridge Regression
# cross Validation Approach only(no validation set, because data is big - 400 sets)

In [25]:
from sklearn.linear_model import Ridge

model = Ridge()
model.fit(X_train, Y_train)

#print(model.intercept_, model.coef_)

crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Training Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Training Set: {}'.format(crossValScore.mean()))
print()

crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Test Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Test Set: {}'.format(crossValScore.mean()))
print()

Cross Validation Score with MSE in Training Set: -25.014634364292743
Cross Validation Score with R^2 in Training Set: 0.6926455262725364

Cross Validation Score with MSE in Test Set: -25.972122941254042
Cross Validation Score with R^2 in Test Set: 0.7097149045678304



# Lasso Regression

In [26]:
from sklearn.linear_model import Lasso

model = Lasso()
model.fit(X_train, Y_train)

#print(model.intercept_, model.coef_)

crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Training Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Training Set: {}'.format(crossValScore.mean()))
print()

crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Test Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Test Set: {}'.format(crossValScore.mean()))
print()

Cross Validation Score with MSE in Training Set: -29.151433292468568
Cross Validation Score with R^2 in Training Set: 0.644588731624381

Cross Validation Score with MSE in Test Set: -30.35053330581816
Cross Validation Score with R^2 in Test Set: 0.674430986870033



# Quick Note : actual code will be like -

from sklearn.linear_model import Lasso

model = Lasso()
model.fit(X_train, Y_train)

crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Test Set: {}'.format(crossValScore.mean()))

# -> only test set errors
# -> either only neg_mean_squared_error or r2 error

# K Nearest Neighbor

In [31]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor()
model.fit(X_train, Y_train)

crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Training Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Training Set: {}'.format(crossValScore.mean()))
print()

crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Test Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Test Set: {}'.format(crossValScore.mean()))
print()

Cross Validation Score with MSE in Training Set: -47.3831099009901
Cross Validation Score with R^2 in Training Set: 0.4078969671370324

Cross Validation Score with MSE in Test Set: -78.32420815384614
Cross Validation Score with R^2 in Test Set: 0.13896471761848012



# Decision Tree

In [32]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X_train, Y_train)

crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Training Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Training Set: {}'.format(crossValScore.mean()))
print()

crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Test Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Test Set: {}'.format(crossValScore.mean()))
print()

Cross Validation Score with MSE in Training Set: -23.98542079207921
Cross Validation Score with R^2 in Training Set: 0.7006595798619528

Cross Validation Score with MSE in Test Set: -33.175873076923075
Cross Validation Score with R^2 in Test Set: 0.6119848807567507



# Support Vector Regressor

In [33]:
from sklearn.svm import SVR

model = SVR()
model.fit(X_train, Y_train)

crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Training Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_train, Y_train, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Training Set: {}'.format(crossValScore.mean()))
print()

crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'neg_mean_squared_error')
print('Cross Validation Score with MSE in Test Set: {}'.format(crossValScore.mean()))
crossValScore = cross_val_score(model, X_test, Y_test, cv = folds, scoring = 'r2')
print('Cross Validation Score with R^2 in Test Set: {}'.format(crossValScore.mean()))
print()

Cross Validation Score with MSE in Training Set: -64.85454019698284
Cross Validation Score with R^2 in Training Set: 0.21368639258089875

Cross Validation Score with MSE in Test Set: -85.88483321898642
Cross Validation Score with R^2 in Test Set: 0.11244969336305827

