In [22]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import sklearn.linear_model as lm
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import matplotlib.gridspec as gridspec
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

# Plotting style
sns.set_style('darkgrid')
sns.set_theme(font_scale=1.)

# Define parameters
seed = 1234

In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge

# Plotting style
sns.set_style('darkgrid')
sns.set_theme(font_scale=1.)

def setup_storage_for_experiment(K_outer, K_inner, num_hyperparams):
    # Setup storage for the optimal hyperparameters found from the inner CV
    optimal_hyperparameters = np.empty(K_outer)

    # Setup storage for model coefficients and errors for each experiment in all inner folds
    ws_inner = np.empty((M + 1, K_outer, K_inner, num_hyperparams))
    train_errors_inner = np.empty((K_outer, K_inner, num_hyperparams))
    test_errors_inner = np.empty((K_outer, K_inner, num_hyperparams))

    # Setup storage for model coefficients for each experiment in all outer folds
    ws_outer = {
        'not regularized': np.empty((M + 1, K_outer)),
        'regularized': np.empty((M + 1, K_outer))
    }
    # Setup storage for errors as a dictionary
    errors_outer = {
        'train': {
            'baseline (no features)': np.empty((K_outer, 1)), 
            'not regularized': np.empty((K_outer, 1)),
            'regularized': np.empty((K_outer, 1))
        },
        'test': {
            'baseline (no features)': np.empty((K_outer, 1)), 
            'not regularized': np.empty((K_outer, 1)),
            'regularized': np.empty((K_outer, 1))
        }
    }
    return optimal_hyperparameters, ws_inner, train_errors_inner, test_errors_inner, ws_outer, errors_outer

def print_regularization_results(errors):
    # Display results
    print("Linear regression without regularization:")
    print(f"- Training error: \t{errors['train']['not regularized'].mean():.4f}")
    print(f"- Test error: \t\t{errors['test']['not regularized'].mean():.4f}")
    print(f"- R^2 train: \t\t{(errors['train']['baseline (no features)'].sum() - errors['train']['not regularized'].sum()) / errors['train']['baseline (no features)'].sum():.4f}")
    print(f"- R^2 test: \t\t{(errors['test']['baseline (no features)'].sum() - errors['test']['not regularized'].sum()) / errors['test']['baseline (no features)'].sum():.4f}\n")

    print("Regularized linear regression:")
    print(f"- Training error: \t{errors['train']['regularized'].mean():.4f}")
    print(f"- Test error: \t\t{errors['test']['regularized'].mean():.4f}")
    print(f"- R^2 train: \t\t{(errors['train']['baseline (no features)'].sum() - errors['train']['regularized'].sum()) / errors['train']['baseline (no features)'].sum():.4f}")
    print(f"- R^2 test: \t\t{(errors['test']['baseline (no features)'].sum() - errors['test']['regularized'].sum()) / errors['test']['baseline (no features)'].sum():.4f}\n")

def get_grid_points(x_min, x_max, y_min, y_max, delta=5e-3):
    # Create a grid of points with the specified resolution
    xx = np.arange(x_min, x_max, delta)
    yy = np.arange(y_min, y_max, delta)
    # Make a mesh-grid that spans the grid-range defined
    grid = np.stack(np.meshgrid(xx, yy))
    return grid, xx, yy

def plot_decision_boundary(predict_function, X, threshold=None, ax=None, fig=None, cmap='RdBu_r'):
    # Set grid range based on the data
    grid_range = [X[:, 0].min(), X[:, 0].max(), X[:, 1].min(), X[:, 1].max()]  # [x_min, x_max, y_min, y_max]
    # Add 10% margin to the grid range to ensure points on the edge are included
    margin_x = 0.1 * (grid_range[1] - grid_range[0])
    margin_y = 0.1 * (grid_range[3] - grid_range[2])
    grid_range[0] -= margin_x
    grid_range[1] += margin_x
    grid_range[2] -= margin_y
    grid_range[3] += margin_y

    # Get grid points
    grid, xx, yy = get_grid_points(*grid_range, delta=5e-3)
    # Reshape grid to a list of points
    grid_points = np.reshape(grid, (2, -1)).T

    # Compute model predictions on the grid points (i.e. the probability of class 1)
    grid_predictions = predict_function(grid_points)

    # Reshape the predictions back to the grid shape
    decision_boundary = np.reshape(grid_predictions, (len(yy), len(xx)))

    # Plot the decision boundary
    img = ax.imshow(decision_boundary, extent=grid_range, origin='lower', cmap=cmap, alpha=0.5, vmin=0, vmax=1)
    fig.colorbar(img, ax=ax)
    if threshold is not None:
        ax.contour(grid[0], grid[1], decision_boundary, levels=[threshold], colors='k')
    ax.grid(False)
    ax.set_aspect('auto')

In [37]:
# Initial data clean
df = pd.read_csv('data/SAHeart.csv')

# Encode Categorical values
df['chd'] = pd.Categorical(df['chd'])
df['famhist'] = df['famhist'].map({'Absent':0, 'Present':1}) #pd.Categorical(df['famhist'].map({'Absent':0, 'Present':1})) # One-of-K coding
# Filter outliers if we need to??
#df = df[~( (df[''] > 2) | (df[''] > 1) )]

df.head()

Unnamed: 0,row.names,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,1,160,12.0,5.73,23.11,1,49,25.3,97.2,52,1
1,2,144,0.01,4.41,28.61,0,55,28.87,2.06,63,1
2,3,118,0.08,3.48,32.28,1,52,29.14,3.81,46,0
3,4,170,7.5,6.41,38.03,1,51,31.99,24.26,58,1
4,5,134,13.6,3.5,27.78,1,60,25.99,57.34,49,1


In [45]:
# What to predict for Regression
X = df.drop(columns=['ldl', 'chd']).values
y = df['ldl'].values

N, M = X.shape


## 2 - Regularize

In [46]:
# Set random seed for reproducibility
np.random.seed(1234)

# Create crossvalidation partition for model evaluation
K_outer = 5
CV_outer = KFold(K_outer, shuffle=True)

# Create crossvalidation partition for hyperparameter tuning
K_inner = 10
CV_inner = KFold(K_inner, shuffle=True)

# Values of regularization parameter lambda to test in the inner loop
lambdas = np.logspace(-5, 8, 14)

# Setup storage for the experiment
optimal_regularization_strengths, ws_inner, train_errors_inner, test_errors_inner, ws_outer, errors_outer = setup_storage_for_experiment(K_outer, K_inner, len(lambdas))

# Run two-layer cross-validation
for outer_fold_idx, (outer_train_index, outer_test_index) in enumerate(CV_outer.split(X, y)):
    # Extract training and test set for the current outer CV fold
    X_train_outer, y_train_outer = X[outer_train_index], y[outer_train_index]
    X_test_outer, y_test_outer = X[outer_test_index], y[outer_test_index]

    # Loop over inner cross-validation folds
    for inner_fold_idx, (inner_train_index, inner_test_index) in enumerate(CV_inner.split(X_train_outer, y_train_outer)):

        # Extract training and validation set for current inner CV fold
        X_train_inner, y_train_inner = X_train_outer[inner_train_index], y_train_outer[inner_train_index]
        X_test_inner, y_test_inner = X_train_outer[inner_test_index], y_train_outer[inner_test_index]

        # Compute the mean and standard deviation of the inner training data split, then standardize training and test sets
        ### BEGIN SOLUTION
        # Compute the mean and standard deviation of the inner training data split
        mu_inner = np.mean(X_train_inner, axis=0)
        sigma_inner = np.std(X_train_inner, axis=0)
        
        # Standardize the inner training set and validation set
        X_train_inner = (X_train_inner - mu_inner) / sigma_inner
        X_test_inner = (X_test_inner - mu_inner) / sigma_inner
        ### END SOLUTION

        # Loop over all values of lambda
        for lambda_idx, regularization_strength in enumerate(lambdas):

            # Create and fit the model
            ### BEGIN SOLUTION
            model = Ridge(alpha=regularization_strength)
            model.fit(X_train_inner, y_train_inner)
            ### END SOLUTION

            # Store the model coefficients for each value of lambda in the inner folds
            ws_inner[:, outer_fold_idx, inner_fold_idx, lambda_idx] = [model.intercept_] + model.coef_.flatten().tolist()

            # Compute and store the training and validation error
            train_errors_inner[outer_fold_idx, inner_fold_idx, lambda_idx] = np.mean((y_train_inner - model.predict(X_train_inner))**2, axis=0)
            test_errors_inner[outer_fold_idx, inner_fold_idx, lambda_idx] = np.mean((y_test_inner - model.predict(X_test_inner))**2, axis=0)
    
    # Determine the optimal value of lambda that gives the lowest test error on average from the inner folds
    ### BEGIN SOLUTION
    optimal_hyperparameter_idx = np.argmin(np.mean(test_errors_inner[outer_fold_idx], axis=0))
    optimal_hyperparameter = lambdas[optimal_hyperparameter_idx]
    ### END SOLUTION

    # Store the optimal regularization strength for the current outer fold
    optimal_regularization_strengths[outer_fold_idx] = optimal_hyperparameter

    # Compute the mean and standard deviation of the outer training data split, then standardize the training and test sets
    ### BEGIN SOLUTION
    # Compute the mean and standard deviation of the outer training data split
    mu_outer = np.mean(X_train_outer, axis=0)
    sigma_outer = np.std(X_train_outer, axis=0)

    # Standardize the outer training set and test set
    X_train_outer = (X_train_outer - mu_outer) / sigma_outer
    X_test_outer = (X_test_outer - mu_outer) / sigma_outer
    ### END SOLUTION

    # Create and fit the model with the optimal lambda on the entire outer training set
    ### BEGIN SOLUTION
    model = Ridge(alpha=optimal_hyperparameter)
    model.fit(X_train_outer, y_train_outer)
    ### END SOLUTION

    # Store the model coefficients for the regularized model
    ws_outer['regularized'][:, outer_fold_idx] = [model.intercept_] + model.coef_.flatten().tolist()
    # Compute and store the training and test error for the regularized model
    errors_outer['train']['regularized'][outer_fold_idx] = np.mean((y_train_outer - model.predict(X_train_outer))**2, axis=0)
    errors_outer['test']['regularized'][outer_fold_idx] = np.mean((y_test_outer - model.predict(X_test_outer))**2, axis=0)


    # Create and fit a model without regularization on the entire outer training set, for comparison
    model = LinearRegression()
    model.fit(X_train_outer, y_train_outer)

    # Store the model coefficients for the model without regularization
    ws_outer['not regularized'][:, outer_fold_idx] = [model.intercept_] + model.coef_.flatten().tolist()
    # Compute and store the training and test error for the model without regularization
    errors_outer['train']['not regularized'][outer_fold_idx] = np.mean((y_train_outer - model.predict(X_train_outer))**2, axis=0)
    errors_outer['test']['not regularized'][outer_fold_idx] = np.mean((y_test_outer - model.predict(X_test_outer))**2, axis=0)

    # Compute mean squared error for the baseline, i.e. without using the input data at all
    errors_outer['train']['baseline (no features)'][outer_fold_idx] = np.mean((y_train_outer - y_train_outer.mean())**2, axis=0)
    errors_outer['test']['baseline (no features)'][outer_fold_idx] = np.mean((y_test_outer - y_test_outer.mean())**2, axis=0)

# Print results
print_regularization_results(errors_outer)

Linear regression without regularization:
- Training error: 	3.3238
- Test error: 		3.5264
- R^2 train: 		0.2226
- R^2 test: 		0.1644

Regularized linear regression:
- Training error: 	3.3572
- Test error: 		3.5088
- R^2 train: 		0.2148
- R^2 test: 		0.1686

