# Cross Validation and the Bootstrap

In [1]:
import numpy as np
import statsmodels.api as sm

from ISLP import load_data
from ISLP.models import (ModelSpec as MS , summarize, poly)
from ISLP.models import sklearn_sm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import (cross_validate, KFold, ShuffleSplit)
from sklearn.base import clone 

from functools import partial


## The Validation Set Approach 
- Recall that the **Validaiton Set Approach** is just dividing the observations $n$ into two parts
- Training Set
- Validation Set 

In [2]:
Auto = load_data('Auto')
Auto_train, Auto_test = train_test_split(Auto,test_size=196,random_state=10)

- Loading the **Auto** data set which contain $392$
- Splitting the dataset into two equal parts 

In [32]:
hp_mm = MS(['horsepower'])

X_train = hp_mm.fit_transform(Auto_train)
Y_train = Auto_train['mpg']

simple_LR = sm.OLS(Y_train,X_train)
results = simple_LR.fit()
summarize(results)

Unnamed: 0,coef,std err,t,P>|t|
intercept,41.073,1.031,39.825,0.0
horsepower,-0.1632,0.009,-17.913,0.0


- After fiting a **Simple Linear Regression** model using the **training set**
- We validate the model using the **test/Validate set** with the `predict()` method

In [36]:
X_test  = hp_mm.transform(Auto_test)
y_test = Auto_test['mpg']
valid_pred= results.predict(X_test)
np.mean((y_test-valid_pred)**2)

23.060588342506232

- The test **MSE** is estimated to be around $23.060$

In [39]:
def evalMSE(terms,response,train,test):
    mm = MS(terms)
    X_train =mm.fit_transform(train)
    Y_train = train[response]

    X_test = mm.transform(test)
    Y_test = test[response]

    results = sm.OLS(Y_train,X_train).fit()
    test_pred = results.predict(X_test)
    return np.mean((Y_test-test_pred)**2)

- Now we use the function to estimate the Validation **MSE** for quadratic and cubic fits 

In [41]:
MSE = np.zeros(10)

for idx, degree in enumerate (range(1,11)):
     MSE[idx] = evalMSE([poly('horsepower',degree)],'mpg',Auto_train,Auto_test)

MSE

array([23.06058834, 19.71779411, 19.70841616, 19.70657244, 19.26372691,
       19.36922139, 19.61616556, 19.64524776, 19.75034445, 20.12770933])

- Running the function on different **train and test** sets will yield slightly different results each time 

## Cross-Validation

In [46]:
hp_model = sklearn_sm(sm.OLS,MS(['horsepower']))

X,Y= Auto.drop(columns=['mpg']),Auto['mpg']
X

Unnamed: 0_level_0,cylinders,displacement,horsepower,weight,acceleration,year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
chevrolet chevelle malibu,8,307.0,130,3504,12.0,70,1
buick skylark 320,8,350.0,165,3693,11.5,70,1
plymouth satellite,8,318.0,150,3436,11.0,70,1
amc rebel sst,8,304.0,150,3433,12.0,70,1
ford torino,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...
ford mustang gl,4,140.0,86,2790,15.6,82,1
vw pickup,4,97.0,52,2130,24.6,82,2
dodge rampage,4,135.0,84,2295,11.6,82,1
ford ranger,4,120.0,79,2625,18.6,82,1


In [47]:
Y

name
chevrolet chevelle malibu    18.0
buick skylark 320            15.0
plymouth satellite           18.0
amc rebel sst                16.0
ford torino                  17.0
                             ... 
ford mustang gl              27.0
vw pickup                    44.0
dodge rampage                32.0
ford ranger                  28.0
chevy s-10                   31.0
Name: mpg, Length: 392, dtype: float64

In [58]:
cv_results = cross_validate(hp_model,X,Y,cv=Auto.shape[0])
cv_err = np.mean(cv_results['test_score'])
cv_err

24.23151351792922

In [57]:
for col in cv_results:
    print(col)

fit_time
score_time
test_score


- The `cv=Auto.shape[0]` is the $K$ number of folds, Since we provided all the $n$ observations it applied the **LOOCV**

In [62]:
cv_error = np.zeros(7)

H = np.array(Auto['horsepower'])
M= sklearn_sm(sm.OLS)

for i, d in enumerate(range(1,8)):
    X = np.power.outer(H,np.arange(d+1))
    print(X)
    M_CV =cross_validate(M,X,Y,cv=Auto.shape[0])

    cv_error[i]= np.mean(M_CV['test_score'])

cv_error

[[  1 130]
 [  1 165]
 [  1 150]
 [  1 150]
 [  1 140]
 [  1 198]
 [  1 220]
 [  1 215]
 [  1 225]
 [  1 190]
 [  1 170]
 [  1 160]
 [  1 150]
 [  1 225]
 [  1  95]
 [  1  95]
 [  1  97]
 [  1  85]
 [  1  88]
 [  1  46]
 [  1  87]
 [  1  90]
 [  1  95]
 [  1 113]
 [  1  90]
 [  1 215]
 [  1 200]
 [  1 210]
 [  1 193]
 [  1  88]
 [  1  90]
 [  1  95]
 [  1 100]
 [  1 105]
 [  1 100]
 [  1  88]
 [  1 100]
 [  1 165]
 [  1 175]
 [  1 153]
 [  1 150]
 [  1 180]
 [  1 170]
 [  1 175]
 [  1 110]
 [  1  72]
 [  1 100]
 [  1  88]
 [  1  86]
 [  1  90]
 [  1  70]
 [  1  76]
 [  1  65]
 [  1  69]
 [  1  60]
 [  1  70]
 [  1  95]
 [  1  80]
 [  1  54]
 [  1  90]
 [  1  86]
 [  1 165]
 [  1 175]
 [  1 150]
 [  1 153]
 [  1 150]
 [  1 208]
 [  1 155]
 [  1 160]
 [  1 190]
 [  1  97]
 [  1 150]
 [  1 130]
 [  1 140]
 [  1 150]
 [  1 112]
 [  1  76]
 [  1  87]
 [  1  69]
 [  1  86]
 [  1  92]
 [  1  97]
 [  1  80]
 [  1  88]
 [  1 175]
 [  1 150]
 [  1 145]
 [  1 137]
 [  1 150]
 [  1 198]
 [  1 150]

array([24.23151352, 19.24821312, 19.33498406, 19.42443029, 19.03320648,
       19.00693693, 18.99513648])