In [38]:
# Import libraries
import numpy as np
import pandas as pd
import sklearn

from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 

In [39]:
# Load the Boston housing data and wrap it in a DataFrame:
# one named column per feature plus the target stored as PRICE.
# NOTE(review): load_boston appears to be deprecated/removed in recent
# scikit-learn releases -- confirm the pinned version before re-running.
boston = load_boston()

bos = pd.DataFrame(boston.data, columns=boston.feature_names)
bos['PRICE'] = boston.target

bos.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [40]:
# Predictors are every column except PRICE; PRICE itself is the target
X = bos.drop(labels=["PRICE"], axis=1)
Y = bos["PRICE"]

In [41]:
# Function for Mean Square Error
def MSE(actual, predicted):
    """Return the mean squared error between observed and predicted values.

    Values are paired by position. Both inputs are converted to NumPy float
    arrays first, so a pandas Series is compared row by row instead of being
    re-aligned on its index (index alignment yields NaN or mis-paired rows
    when the two indexes differ). Plain lists are accepted as well.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    predicted : array-like
        Predicted values, same length as ``actual`` (or broadcastable to it).

    Returns
    -------
    float
        Average of the squared element-wise differences.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.average((actual - predicted) ** 2)

In [48]:
# Hold out 33% of the rows for testing; the fixed random_state pins the split
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X,
    bos["PRICE"],
    test_size=0.33,
    random_state=5,
)

### Using simple test / train splitting first

In [57]:
# Fit a linear regression on the training split, then score both splits
lm = LinearRegression()

fit = lm.fit(X=X_train, y=Y_train)
predicted_train = fit.predict(X_train)
predicted_test = fit.predict(X_test)

mse_train = MSE(Y_train, predicted_train)
mse_test = MSE(Y_test, predicted_test)
print("Mean Square Error (Train) = " + str(mse_train))
print("Mean Square Error (Test)= " + str(mse_test))

Mean Square Error (Train) = 19.5467584735
Mean Square Error (Test)= 28.5413672756


### Using cross validation 

#### 1. Using cross validation on above training dataset

In [66]:
# 5-fold cross validation on the training split only; each score is a negated MSE
CV = KFold(n_splits=5)

scores_train = cross_val_score(
    estimator=lm,
    X=X_train,
    y=Y_train,
    scoring="neg_mean_squared_error",
    cv=CV,
)

# Summary: number of folds, mean of the absolute scores, std of the scores
print(
    "Folds: %i, mean squared error: %.2f std: %.2f"
    % (len(scores_train), np.mean(np.abs(scores_train)), np.std(scores_train))
)
# Printed under the label "Accuracy": mean of the raw (negated) scores and twice their std
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_train.mean(), 2 * scores_train.std()))
print(scores_train)

Folds: 5, mean squared error: 21.98 std: 2.74
Accuracy: -21.98 (+/- 5.47)
[-20.05238867 -19.78792277 -21.49948113 -27.28581566 -21.26483846]


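The cross-validated MSE on the training data (21.98) is in the same range as the train and test MSEs from the simple split above (19.55 and 28.54).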

#### 2. Using cross validation on full dataset

In [67]:
# 5-fold cross validation on the full dataset; each score is a negated MSE.
# KFold is created without shuffle, so the rows are used in their stored order.
CV = KFold(n_splits=5)

scores_all = cross_val_score(
    estimator=lm,
    X=X,
    y=Y,
    scoring="neg_mean_squared_error",
    cv=CV,
)

# Summary: number of folds, mean of the absolute scores, std of the scores
print(
    "Folds: %i, mean squared error: %.2f std: %.2f"
    % (len(scores_all), np.mean(np.abs(scores_all)), np.std(scores_all))
)
# Printed under the label "Accuracy": mean of the raw (negated) scores and twice their std
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_all.mean(), 2 * scores_all.std()))
print(scores_all)

Folds: 5, mean squared error: 37.22 std: 23.10
Accuracy: -37.22 (+/- 46.20)
[-12.48065021 -26.09620267 -33.11995587 -80.83305378 -33.58435565]


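This is much higher (37.22 vs. 21.98), and the per-fold scores vary far more (std 23.10 vs. 2.74). Why? Note that `KFold` does not shuffle by default, so each fold here is a contiguous block of rows in the dataset's stored order, whereas the training set above had already been shuffled by `train_test_split`.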

### Validating cross validation manually

In [46]:
# Re-run the 5-fold cross validation by hand (same CV splitter, same model)
# to check the figure reported by cross_val_score.
mean_square_error = []

# KFold.split yields *positional* row indices, so rows are selected with .iloc.
# Selecting by label (index.isin) only gives the same rows while the index is
# the default 0..n-1 range, and silently picks other rows once it is not.
# Fold-specific names are used so the earlier hold-out split
# (X_train / X_test / Y_train / Y_test) is not overwritten by the last fold.
for train_idx, test_idx in CV.split(X):
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_fold_train, Y_fold_test = Y.iloc[train_idx], Y.iloc[test_idx]

    fit = lm.fit(X_fold_train, Y_fold_train)
    predicted = fit.predict(X=X_fold_test)  # out-of-sample predictions for this fold

    mean_square_error.append(MSE(Y_fold_test, predicted))

print("Average MSE = " + str(np.average(mean_square_error)))

Average MSE = 37.2228436371


In [78]:
def _print_banner(title):
    """Print a section title framed by two rows of '#'."""
    print('##############################################################')
    print(title)
    print('##############################################################')


def _report_cv_scores(scores):
    """Print fold count, mean |score| and std, the mean +/- 2 std line, and the raw scores."""
    print("Folds: %i, mean squared error: %.2f std: %.2f" % (len(scores), np.mean(np.abs(scores)), np.std(scores)))
    # Labelled "Accuracy", but with neg_mean_squared_error scoring the value is
    # the mean of the negated MSEs and twice their standard deviation.
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print(scores)


# Repeat the three estimates (simple split, CV on the training split, CV on the
# full dataset) on ten different row orders to see how much row order matters.
for i in range(10):

    print('  ')
    print('------------------------------------------------------------------------')
    print(i)
    print('------------------------------------------------------------------------')
    print('  ')

    # Shuffle df each time. The frame is first put back in index order and the
    # shuffle is seeded with the iteration number, so re-running this cell
    # reproduces the same ten row orders instead of compounding random shuffles.
    bos = bos.sort_index().sample(frac=1, random_state=i)
    # Make dependent and independent variables
    X = bos.drop("PRICE", axis = 1)
    Y = bos.PRICE

    # Split into test and train datasets (seeded for reproducibility)
    X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
        X, Y, test_size=0.33, random_state=i)

    lm = LinearRegression()

    _print_banner('Using simple test / train splitting first')
    fit = lm.fit(X = X_train, y = Y_train)
    predicted_train = fit.predict(X_train)
    predicted_test = fit.predict(X_test)

    print ("Mean Square Error (Train) = " + str(MSE(Y_train, predicted_train)))
    print ("Mean Square Error (Test)= " + str(MSE(Y_test, predicted_test)))

    # The same unshuffled 5-fold splitter is used for both cross validations
    CV = KFold(n_splits = 5)

    _print_banner('Using cross validation on above training dataset')
    scores_train = cross_val_score(estimator=lm, X=X_train, y=Y_train, scoring="neg_mean_squared_error", cv=CV)
    _report_cv_scores(scores_train)

    _print_banner('Using cross validation on full dataset')
    scores_all = cross_val_score(estimator=lm, X=X, y=Y, scoring="neg_mean_squared_error", cv=CV)
    _report_cv_scores(scores_all)
        
        

  
------------------------------------------------------------------------
0
------------------------------------------------------------------------
  
##############################################################
Using simple test / train splitting first
##############################################################
Mean Square Error (Train) = 24.4030678831
Mean Square Error (Test)= 18.0845848901
##############################################################
Using cross validation on above training dataset
##############################################################
Folds: 5, mean squared error: 27.46 std: 3.98
Accuracy: -27.46 (+/- 7.96)
[-25.09850307 -31.42202802 -20.79251409 -30.49272061 -29.51410281]
##############################################################
Using cross validation on full dataset
##############################################################
Folds: 5, mean squared error: 24.29 std: 7.85
Accuracy: -24.29 (+/- 15.71)
[-15.378112   -18.49414428 -31.65314813 

Folds: 5, mean squared error: 23.63 std: 5.88
Accuracy: -23.63 (+/- 11.76)
[-17.27130347 -30.63129996 -16.48954918 -29.24971179 -24.51652256]
  
------------------------------------------------------------------------
9
------------------------------------------------------------------------
  
##############################################################
Using simple test / train splitting first
##############################################################
Mean Square Error (Train) = 19.1332768636
Mean Square Error (Test)= 28.2827889321
##############################################################
Using cross validation on above training dataset
##############################################################
Folds: 5, mean squared error: 21.39 std: 8.14
Accuracy: -21.39 (+/- 16.28)
[-15.95624702 -17.98675506 -14.80568947 -37.09350131 -21.10743514]
##############################################################
Using cross validation on full dataset
###################################

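So the cross-validation calculation above seems to be accurate: the manual loop reproduces the 37.22 figure, and once the rows are shuffled the full-dataset cross-validation MSE falls back in line with the other estimates (e.g. 24.29 in run 0).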