# Linear Regression - K-fold validation

In this notebook we'll use k-fold cross-validation on our dataset and compare the cross-validated error with the error from a single train/test split.

In [1]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

# Load the Boston housing dataset and put its feature matrix in a DataFrame
# whose columns carry the dataset's own feature names.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version before re-running this cell.
boston = load_boston()
boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])

# Predictors: a three-column subset of the features; response: the target array.
feature_cols = ['RM', 'LSTAT', 'NOX']
X = boston_df[feature_cols]
y = boston['target']

# Report the size of the full feature table (not of the reduced X).
n_rows, n_cols = boston_df.shape
print("The number of rows is:", n_rows)
print("The number of columns is:", n_cols)

The number of rows is: 506
The number of columns is: 13


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=3,
    test_size=0.20,
)

In [17]:
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

# Mean absolute error on the data the model was fitted on and on the
# held-out test split.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
mae_train = mean_absolute_error(y_train, train_pred)
mae_test = mean_absolute_error(y_test, test_pred)

# Both errors are multiplied by 1000 before printing.
# NOTE(review): presumably this converts a target expressed in thousands
# into single units -- confirm against the dataset description.
print("Training set MAE: {:.2f}".format(mae_train * 1000))
print("Test set MAE: {:.2f}".format(mae_test * 1000))

Training set MAE: 4036.52
Test set MAE: 3728.10


### K-fold cross-validation


In [19]:
from sklearn.model_selection import cross_val_score
# Rebind `model` to a fresh LinearRegression (replacing the estimator fitted
# earlier) and score it with 5-fold cross-validation on the full X / y.
# The scorer is 'neg_mean_absolute_error', so cv_scores holds one value per
# fold; the following cells take np.abs of them before displaying.
model = LinearRegression()
cv_scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

In [20]:
# Print the magnitude of each fold's score, scaled by 1000, one per line.
# A plain loop is used instead of a list comprehension: the comprehension was
# run only for its print side effect, built a throwaway list of None values,
# and bound `_`, which shadows IPython's "last output" variable.
for fold_score in np.abs(cv_scores * 1000):
    print("{:.2f}".format(fold_score))

2987.75
3685.92
4969.56
5649.60
5296.57


In [22]:
# Average the per-fold scores, scale by 1000 and display the magnitude.
mean_cv_score = np.mean(cv_scores)
np.abs(mean_cv_score * 1000)

4517.878142239425

In [23]:
# Gap between the mean cross-validated MAE and the single hold-out test MAE.
# The operands are computed from cv_scores and mae_test rather than typed in
# by hand, so the difference stays correct if the data, split or model changes.
# Each value is scaled by 1000 and truncated to a whole number, which
# reproduces the figures that were previously hard-coded here (4517 and 3728).
cv_mae_scaled = int(np.abs(np.mean(cv_scores) * 1000))
holdout_mae_scaled = int(mae_test * 1000)
cv_mae_scaled - holdout_mae_scaled

789