In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold

from IPython.display import Image
from IPython.core.display import HTML 

import seaborn as sns

# Allow up to 100 characters per cell before pandas truncates displayed text.
pd.options.display.max_colwidth = 100

In [2]:
# Column-name constants.
HP = 'Engine HP'
PRICE = 'MSRP'

# Load the car dataset.
# Source: https://www.kaggle.com/CooperUnion/cardataset
df = pd.read_csv('./data/data.csv')

# Keep only the rows where both the engine HP and the engine cylinder count are present.
df = df.dropna(subset=[HP, 'Engine Cylinders'])

# Train Test Split
- When we create a model, it is common practice to train it on a random subset of the data (the train set) and to score it on the remaining data (the test set). This lets us see how the model would perform on unseen data, and gives us an indication of whether the model is overfit or underfit.


In [3]:
# Predictor columns and the target column (HP and PRICE are the column-name
# constants defined in the data-loading cell).
features = ['highway MPG', 'Year', HP, 'Popularity']
X, y = df[features], df[PRICE]

# Hold out 25% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [23]:
# Fit a linear regression without an intercept term on the training split.
lr = LinearRegression(fit_intercept=False)
lr.fit(X_train, y_train)

# `score` on a regressor returns R2, so no separate predict / r2_score step
# (and no extra import) is needed.
# R2 on the training data
print('R2 train score:', lr.score(X_train, y_train))
# R2 on the held-out test data
print('R2 test score:', lr.score(X_test, y_test))

R2 train score: 0.4484541951684514


# K-Fold Cross validation
- K-fold cross validation is a way of checking whether we got a bad split by chance in the train/test split procedure, or whether we are in fact overfitting or underfitting our model.

![Image](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)
Source: https://scikit-learn.org/stable/modules/cross_validation.html

In [4]:
# Re-split with a different seed to show how much the scores depend on the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5165
)

# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# R2 on the data the model was trained on and on the held-out data.
train_score, tst = lr.score(X_train, y_train), lr.score(X_test, y_test)
print(f'Train score {train_score}')
print(f'Test score {tst}')

Train score 0.5615117467662802
Test score 0.33711979179458174


The numbers above suggest a model that is overfitting: the train score is much higher than the test score.

If we run a cross validation with 3 folds, we will find indications that these train and test scores are off.

In [96]:
# Score the model with 3-fold cross validation on the full dataset and
# report the average of the per-fold scores.
cv_scores = cross_val_score(lr, X, y, cv=3)
print(cv_scores.mean())

0.4509412515731697


Now let's change the `train_test_split` random state to 42, then retrain and rescore the model.

In [8]:
# Same procedure as before, but with random_state=42 for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

lr = LinearRegression(fit_intercept=False)
lr.fit(X_train, y_train)

# Score on the training rows first, then on the held-out rows.
train_score = lr.score(X_train, y_train)
print(f'Train score {train_score}')
tst = lr.score(X_test, y_test)
print(f'Test score {tst}')

Train score 0.4484541951684514
Test score 0.45609982727210163


Now we get results that are much more in line with the cross validation score.

In [5]:
# Repeat the 3-fold cross validation and print the mean of the fold scores
# for comparison with the train/test scores above.
print(np.mean(cross_val_score(lr, X, y, cv=3)))

0.4509412515731697


We can also customize the K-fold splitter (for example, to shuffle the rows before splitting) like this:

In [101]:
# Explicit splitter: 3 folds, rows shuffled before splitting, with a fixed
# seed so the fold assignment is repeatable.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Print the individual score of each fold rather than their mean.
shuffled_scores = cross_val_score(lr, X, y, cv=kf)
print(shuffled_scores)

[0.46534069 0.41217911 0.47926784]
