In [1]:
#Lab 5 
#Housekeeping
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
from sklearn.model_selection import train_test_split

from functools import partial
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm
import pandas as pd

In [2]:
#Question 1

Boston = load_data('Boston')
Boston_train, Boston_test = train_test_split(Boston,
                                         test_size=253,
                                         random_state=42)

lstatmm = MS(['lstat'])
X_train = lstatmm.fit_transform(Boston_train)
y_train = Boston_train['medv']
model = sm.OLS(y_train, X_train)
results = model.fit()


In [3]:
#Question 1 (continued)
def evalMSE(terms,
            response,
            train,
            test): 

   mm = MS(terms) #create model spec object
   X_train = mm.fit_transform(train) #fit and transform train set
   y_train = train[response] #get response variable

   X_test = mm.transform(test) #transform test set
   y_test = test[response] #get response variable

   results = sm.OLS(y_train, X_train).fit() #estimate model 
   test_pred = results.predict(X_test) #get predictions on test set

   return np.mean((y_test - test_pred)**2) #return MSE


In [4]:
#Question 1 (continued)
MSE = np.zeros(4) #array to store MSE values
for idx, degree in enumerate(range(1, 5)): #loop over polynomial degrees 1, 2, 3, 4
    MSE[idx] = evalMSE([poly('lstat', degree)], 
                       'medv',
                       Boston_train,
                       Boston_test) #evaluate MSE for each polynomial degree
print(np.round(MSE, 2)) 

[38.51 30.84 29.22 27.75]


In [5]:
#Question 2
College = load_data('College')

College = College[['Outstate' , 'Room.Board']]
X, Y = College.drop(columns=['Room.Board']), College['Room.Board']

cv_error = np.zeros(5) #array to store CV errors
H = np.array(College['Outstate']) #predictor
M = sklearn_sm(sm.OLS)  #create sklearn-compatible model
for i, d in enumerate(range(1,6)): #loop over polynomial degrees 1 to 5
    X = np.power.outer(H, np.arange(d+1)) #create polynomial features
    M_CV = cross_validate(M, 
                          X,
                          Y,
                          cv=College.shape[0]) #LOOCV
    cv_error[i] = np.mean(M_CV['test_score']) #average test MSE
print([f"{val:.2f}" for val in np.round(cv_error, 2)]) #display rounded CV errors

['690682.00', '688952.83', '690457.16', '695841.31', '965400.35']


In [6]:
#Question 3
cv_error5fold = np.zeros(3)
cv = KFold(n_splits=5,
           shuffle=True,
           random_state=123)

for i, d in enumerate(range(1,4)): 
    X = np.power.outer(H, np.arange(d+1)) 
    M_CV = cross_validate(M,
                          X,
                          Y,
                          cv=cv) #calls the cv object defined earlier as the cross-validation strategy
    cv_error5fold[i] = np.mean(M_CV['test_score']) 

cv_error10fold = np.zeros(3)
cv = KFold(n_splits=10,
           shuffle=True,
           random_state=123)

for i, d in enumerate(range(1,4)): 
    X = np.power.outer(H, np.arange(d+1)) 
    M_CV = cross_validate(M,
                          X,
                          Y,
                          cv=cv) #calls the cv object defined earlier as the cross-validation strategy
    cv_error10fold[i] = np.mean(M_CV['test_score']) 

Kfold_error_array =np.vstack((cv_error5fold, cv_error10fold))
df = pd.DataFrame(
    np.round(Kfold_error_array, 3),
    index=['5-Fold CV', '10-Fold CV'],
    columns=['Degree 1 CV Error', 'Degree 2 CV Error', 'Degree 3 CV Error']
)

print(df)  # prints as a labeled table 

            Degree 1 CV Error  Degree 2 CV Error  Degree 3 CV Error
5-Fold CV          691473.994         693003.615         693543.866
10-Fold CV         690504.554         690594.856         691610.998


In [7]:
#Question 3 (Continued)
Kfold_error_difference = np.vstack((cv_error5fold - cv_error10fold))
df2 = pd.DataFrame(np.round(Kfold_error_difference, 3), index =['Degree 1 Difference', 'Degree 2 Difference', 'Degree 3 Difference'], columns =['CV error Difference (5 fold - 10 fold)'])
print(df2)

                     CV error Difference (5 fold - 10 fold)
Degree 1 Difference                                 969.440
Degree 2 Difference                                2408.759
Degree 3 Difference                                1932.868


In [8]:
#Question 4
Default = load_data('Default')
Default = Default[['balance' , 'income']]

def alpha_func(Default, idx): #function to compute alpha
   cov_ = np.cov(Default[['balance','income']].loc[idx], rowvar=False) #covariance matrix
   return ((cov_[1,1] - cov_[0,1]) /
           (cov_[0,0]+cov_[1,1]-2*cov_[0,1]))

rng = np.random.default_rng(456) #random number generator
alpha_func(Default,
           rng.choice(2000,
                      2000,
                      replace=True)) 

def boot_SE(func,
            Default,
            n=None,
            B=2000,
            seed=456): #function to compute bootstrap standard error
    rng = np.random.default_rng(seed) #random number generator
    first_, second_ = 0, 0 #initialize accumulators
    n = n or Default.shape[0] #set n to number of observations if not provided
    for _ in range(B): #bootstrap iterations
        idx = rng.choice(Default.index,  
                         n,
                         replace=True) #bootstrap sample indices
        value = func(Default, idx) #compute statistic
        first_ += value #accumulate first moment
        second_ += value**2 #accumulate second moment
    return np.sqrt(second_ / B - (first_ / B)**2) #return standard error

alpha_SE = boot_SE(alpha_func,
                   Default,
                   B=2000,
                   seed=456) #bootstrap standard error


alpha_val = alpha_func(Default, Default.index)
df = pd.DataFrame(
    [[np.round(alpha_SE, 4), np.round(alpha_val, 4)]],
    index=['Value'],
    columns=['Bootstrapped SE of alpha', 'Correlation coefficient']
)
print(df)

       Bootstrapped SE of alpha  Correlation coefficient
Value                    0.0004                   0.9932


In [9]:
#Question 5
Wage = load_data('Wage')
Wage = Wage[['age', 'wage']]

se_ols_list = []
for deg in [1, 2]:
    mm = MS([poly('age', deg)])
    X = mm.fit_transform(Wage)
    y = Wage['wage']
    model = sm.OLS(y, X)
    results = model.fit()
    se_ols = summarize(results)['std err']
    se_ols_list.append(se_ols)

print("Traditional OLS SE for degree 1:", np.round(se_ols_list[0], 3))
print("Traditional OLS SE for degree 2:", np.round(se_ols_list[1], 3))

Traditional OLS SE for degree 1: intercept               0.747
poly(age, degree=1)    40.929
Name: std err, dtype: float64
Traditional OLS SE for degree 2: intercept                  0.730
poly(age, degree=2)[0]    39.993
poly(age, degree=2)[1]    39.993
Name: std err, dtype: float64


In [11]:
degrees = [1, 2]
wage_se_list = []

def boot_OLS(degree, response, df, idx):
    mm = MS([poly('age', degree)]) 
    X = mm.fit_transform(df.loc[idx])
    y = df.loc[idx, response]
    model = sm.OLS(y, X)
    results = model.fit()
    return results.params

for deg in degrees:
    wage_func = lambda df, idx, d=deg: boot_OLS(d, 'wage', df, idx)
    se = boot_SE(wage_func, Wage, B=1500, seed=789)
    wage_se_list.append(se)

print("Bootstrap SE for degree 1:", np.round(wage_se_list[0], 3))
print("Bootstrap SE for degree 2:", np.round(wage_se_list[1], 3))

Bootstrap SE for degree 1: intercept               0.792
poly(age, degree=1)    39.021
dtype: float64
Bootstrap SE for degree 2: intercept                  0.792
poly(age, degree=2)[0]    39.021
poly(age, degree=2)[1]    35.534
dtype: float64


In [12]:
#Question 5 (continued)
ratio_deg1 = np.nan_to_num(wage_se_list[0] / se_ols_list[0], nan=np.nan, posinf=np.nan, neginf=np.nan)
ratio_deg2 = np.nan_to_num(wage_se_list[1] / se_ols_list[1], nan=np.nan, posinf=np.nan, neginf=np.nan)

df_ratio_deg1 = pd.DataFrame(
    [ratio_deg1],
    index=["Degree 1"],
    columns=[f"Coef {i}" for i in range(len(ratio_deg1))]
)
df_ratio_deg2 = pd.DataFrame(
    [ratio_deg2],
    index=["Degree 2"],
    columns=[f"Coef {i}" for i in range(len(ratio_deg2))]
)

print("Ratio of Bootstrap SE to OLS SE for degree 1 coefficients:")
print(np.round(df_ratio_deg1, 4))
print("Ratio of Bootstrap SE to OLS SE for degree 2 coefficients:")
print(np.round(df_ratio_deg2, 4))

Ratio of Bootstrap SE to OLS SE for degree 1 coefficients:
          Coef 0  Coef 1
Degree 1  1.0606  0.9534
Ratio of Bootstrap SE to OLS SE for degree 2 coefficients:
          Coef 0  Coef 1  Coef 2
Degree 2  1.0853  0.9757  0.8885
