- [Load dataset](#Load-dataset)
- [Cross-Validation](#5.1-Cross-Validation)

# Chapter 5 - Resampling Methods

In [None]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

# pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so pick whichever name this installation ships.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')

### Load dataset

The dataset is available at http://www-bcf.usc.edu/~gareth/ISL/data.html

In [None]:
# Read the Auto data; '?' marks a missing value, and any row containing one is dropped.
df1 = (
    pd.read_csv('../../_data/Auto.csv', na_values='?')
      .dropna()
)
df1.info()

## 5.1 Cross-Validation

### Validation Set Approach (Figure 5.2)
Polynomial features are generated with scikit-learn's `PolynomialFeatures`:<BR>
http://scikit-learn.org/dev/modules/preprocessing.html#generating-polynomial-features

In [None]:
test_prop = 0.5
p_order = np.arange(1, 11)
r_state = np.arange(10)

# X[i, j] holds the polynomial degree and Y[i, j] the split seed for grid
# cell (i, j); Z[i, j] receives the validation-set MSE for that combination.
X, Y = np.meshgrid(p_order, r_state, indexing='ij')
Z = np.zeros((p_order.size, r_state.size))

regr = LinearRegression()

# Plain NumPy views of the predictor and the response. `.values` is used
# instead of `Series.ravel()`, which is deprecated in recent pandas.
horsepower = df1.horsepower.values.reshape(-1, 1)
mpg = df1.mpg.values

# Evaluate 10 random 50/50 splits for each of the 10 polynomial degrees
for i, degree in enumerate(p_order):

    # The design matrix depends only on the degree, so build it once per
    # degree rather than once per (degree, seed) pair.
    poly = PolynomialFeatures(int(degree))
    X_poly = poly.fit_transform(horsepower)

    for j, seed in enumerate(r_state):
        X_train, X_test, y_train, y_test = train_test_split(
            X_poly, mpg, test_size=test_prop, random_state=int(seed))

        regr.fit(X_train, y_train)
        pred = regr.predict(X_test)
        Z[i, j] = mean_squared_error(y_test, pred)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Left panel: validation MSE per degree, averaged over the random splits
degrees = X.T[0]
mean_mse = Z.T.mean(axis=0)
ax1.plot(degrees, mean_mse, '-o')
ax1.set_title('Mean of random splits of the data set')

# Right panel: one curve per random split
ax2.plot(X, Z)
ax2.set_title('10 random splits of the data set')

# Shared axis labels, limits and ticks for both panels
for ax in (ax1, ax2):
    ax.set_ylabel('Mean Squared Error')
    ax.set_ylim(15, 30)
    ax.set_xlabel('Degree of Polynomial')
    ax.set_xlim(0.5, 10.5)
    ax.set_xticks(range(2, 11, 2))

### LOOCV (Figure 5.4)

In [None]:
p_order = np.arange(1, 11)
r_state = np.arange(0, 10)

# Leave-one-out cross-validation of mpg ~ poly(horsepower, degree)
regr = LinearRegression()

# Hand cross_val_score the splitter object itself. `get_n_splits()` only
# returns the number of splits as an int, and an int `cv` is interpreted as a
# K-fold count, so the result would be leave-one-out only by coincidence.
loo = LeaveOneOut()
scores = list()

for i in p_order:
    poly = PolynomialFeatures(int(i))
    X_poly = poly.fit_transform(df1.horsepower.values.reshape(-1, 1))
    # Scores are negated MSEs (sklearn's "higher is better" convention);
    # average them over all left-out observations.
    score = cross_val_score(regr, X_poly, df1.mpg, cv=loo,
                            scoring='neg_mean_squared_error').mean()
    scores.append(score)
scores

### k-Fold Cross-Validation

In [None]:
# 
folds = 10
elements = len(df1.index)

# X[i, j] holds the polynomial degree and Y[i, j] the shuffle seed for grid
# cell (i, j); Z[i, j] receives the mean (negated) MSE over the folds.
X, Y = np.meshgrid(p_order, r_state, indexing='ij')
Z = np.zeros((p_order.size, r_state.size))

regr = LinearRegression()

for i, degree in enumerate(p_order):

    # The design matrix depends only on the degree, so build it once per degree.
    poly = PolynomialFeatures(int(degree))
    X_poly = poly.fit_transform(df1.horsepower.values.reshape(-1, 1))

    for j, seed in enumerate(r_state):
        # Pass the KFold object itself, with shuffle=True. `get_n_splits()`
        # returns a plain int, which would make every run use the same
        # unshuffled folds, and random_state only takes effect (and is only
        # accepted by recent scikit-learn) when shuffling is enabled.
        kf = KFold(n_splits=folds, shuffle=True, random_state=int(seed))

        Z[i, j] = cross_val_score(regr, X_poly, df1.mpg, cv=kf,
                                  scoring='neg_mean_squared_error').mean()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
ax1, ax2 = axes

# cross_val_score() reports 'neg_mean_squared_error' as negative numbers,
# so the sign is flipped before plotting.
# https://github.com/scikit-learn/scikit-learn/issues/2439

# Left panel: LOOCV error per polynomial degree
loocv_mse = -np.array(scores)
ax1.plot(p_order, loocv_mse, '-o')
ax1.set_title('LOOCV')

# Right panel: 10-fold CV error, one curve per run
ax2.plot(X, -Z, '-o')
ax2.set_title('10-fold CV')

# Shared axis labels, limits and ticks for both panels
for ax in (ax1, ax2):
    ax.set_ylabel('Mean Squared Error')
    ax.set_ylim(15, 30)
    ax.set_xlabel('Degree of Polynomial')
    ax.set_xlim(0.5, 10.5)
    ax.set_xticks(range(2, 11, 2))