In [182]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import GridSearchCV

# One seeded generator drives every random draw in the notebook, so a fresh
# Restart & Run All reproduces the same numbers.
# NOTE(review): presumably kept as a legacy RandomState (rather than
# np.random.default_rng) because it is also meant to be handed to scikit-learn
# as `random_state` — confirm before modernising.
prng = np.random.RandomState(20250310)

# Show 3 decimals in rendered outputs: the IPython magic covers plain floats
# and arrays, the pandas option covers DataFrame/Series displays.
%precision 3
pd.set_option('display.precision', 3)

# Illustrating bias-variance trade-off: Linear regression vs Lasso

## Our problem

Let's start with a simple linear model:

$$
Y = f(X) + \varepsilon = \beta_1 X_1 + \beta_2 X_2 + \varepsilon
$$

The true model is:

$$
f(X) = X_1 + X_2
$$

In [183]:
def trueModel(x1, x2):
    """Return the noise-free outcome f(X) = x1 + x2 (element-wise for arrays)."""
    return x1 + x2


def generateData(prng, sample_size):
    """Simulate one dataset from the true model with additive Gaussian noise.

    Parameters
    ----------
    prng : np.random.RandomState
        Random number generator; its state is advanced by the draws below.
    sample_size : int
        Number of observations to simulate.

    Returns
    -------
    feature_df : pd.DataFrame
        Columns 'x1' and 'x2', each drawn from Uniform(0, 1).
    y : np.ndarray
        trueModel(x1, x2) plus Normal(0, 2) noise, one value per observation.
    """
    # Three uniform vectors are drawn although only the first two are used;
    # the third draw is kept so the random stream stays exactly the same.
    draws = [prng.uniform(0, 1, size=sample_size) for _ in range(3)]
    x1, x2 = draws[0], draws[1]

    noise = prng.normal(0, 2, size=sample_size)  # sigma(epsilon) = 2
    y = trueModel(x1, x2) + noise

    feature_df = pd.DataFrame({'x1': x1, 'x2': x2})
    return feature_df, y


In [None]:
# Evaluate the model's performance at a single point: (x1, x2) = (0, 0)
test_data = pd.DataFrame({'x1': [0], 'x2': [0]})
# Noise-free outcome of the true model at that point ([0] picks the single row)
trueModel(test_data['x1'], test_data['x2'])[0]

## Estimate two models: Linear Regression and Lasso

Lasso recap: minimize the sum of squared residuals (as in simple OLS) plus a penalty term (= the penalty parameter `alpha` times the sum of the absolute values of the coefficients).

**TODO**:

- Extend the code below to estimate two models: a simple linear regression and a Lasso
- As we won't do any feature engineering here, there is no need for `Pipeline`
- For Lasso, use the default penalty parameter but control the random state with our pseudo-random number generator (`prng`)
- Get predictions for the (0, 0) point for both models (you can use the `test_data`)

In [None]:
# Exercise skeleton: the `# TBA` placeholders must be filled in before this cell runs.
# `sample_size` is also reused by the later simulation and cross-validation cells.
sample_size = 20

# One realization of the training data
X, y = generateData(prng, sample_size)

lm = # TBA  (simple linear regression)
lasso = # TBA  (Lasso with the default penalty, random state controlled by `prng`)

# TBA  (fit both models on X, y)

# Predictions of both models at the (0, 0) point stored in `test_data`
print("Linear model prediction: ", # TBA)
print("Lasso prediction: ", # TBA)

## Monte Carlo simulation

**TODO**:

- Extend the code below to run a Monte Carlo simulation
- For each realization of data, we estimate 1+20 models: a simple linear regression and 20 different Lasso models with various penalty parameters (collected in `alphas_to_try`)

In [186]:
# Monte Carlo simulation
# Exercise skeleton: the `# TBA` placeholders must be filled in before this cell runs.

n_iterations = 1000
alphas_to_try = np.linspace(0.01, 0.5, num=20)  # 20 evenly spaced penalty values

# Result containers: one row per simulated dataset, one column per Lasso penalty
lm_predictions = np.empty(n_iterations)
lasso_predictions = np.empty((n_iterations, len(alphas_to_try)))
lasso_n_coeffs = np.empty((n_iterations, len(alphas_to_try)))

# Perform the Monte Carlo simulation
for i in range(n_iterations):

    # Fresh dataset in every iteration; `sample_size` comes from the earlier exercise cell
    X, y = generateData(prng, sample_size)

    # TBA
    lm_predictions[i] = # TBA

    # `ida` is the column index that belongs to penalty value `a`
    for ida, a in enumerate(alphas_to_try):
        # TBA
        lasso_predictions[i, ida] = # TBA
        lasso_n_coeffs[i, ida] = np.count_nonzero(# TBA)  # we would like to count the non-zero coefficients

Linear Regression could be interpreted as a special case of Lasso with alpha = 0. Let's concatenate the predictions of the two models. (*Note that `lm_predictions` is a 1D array so first we need to reshape it to a 2D array with `reshape(-1, 1)`.*)

In [None]:
# Column 0 holds the linear-regression predictions (alpha = 0); the remaining
# columns hold the Lasso predictions in the order of `alphas_to_try`.
predictions = np.hstack((lm_predictions.reshape(-1, 1), lasso_predictions))
predictions.shape

In [188]:
# Penalty grid with the unpenalized model (alpha = 0) in front, matching the columns of `predictions`
alphas = np.hstack([[0], alphas_to_try])

# Noise-free outcome at the test point, i.e. the target every prediction is compared to
true_value_at_test = trueModel(test_data['x1'], test_data['x2'])[0]
prediction_errors = predictions - true_value_at_test

# Column-wise summaries across the simulated datasets (one value per penalty)
biases = prediction_errors.mean(axis=0)
variances = predictions.var(axis=0)
mses = (prediction_errors ** 2).mean(axis=0)

In [None]:
# Squared bias, variance and MSE as functions of the penalty (alpha = 0 is the unpenalized fit)
curves = {'Bias^2': biases**2, 'Variance': variances, 'MSE': mses}
for curve_label, curve_values in curves.items():
    plt.plot(alphas, curve_values, label=curve_label)

plt.xlabel('Penalty parameter')
plt.legend()
plt.show()

Penalized regression performs better (lower MSE at the test point) than the correctly specified, unpenalized model!

In [None]:
# Average count of non-zero Lasso coefficients per penalty value; the leading 2
# is the hard-coded entry for the unpenalized model (alpha = 0).
avg_num_coeffs = np.hstack([[2], lasso_n_coeffs.mean(axis=0)])

plt.plot(alphas, avg_num_coeffs, label='Avg number of non-zero coefficients')
plt.title('Variable selection in Lasso by the penalty parameter')
plt.xlabel('Penalty parameter')
plt.legend()
plt.show()

## Finding the best penalty parameter with cross-validation (hyperparameter tuning)

**TODO**:

- Extend the code below to run a grid search over the alpha values using `GridSearchCV()` (~ repeat the same exercise as we did above but with cross-validation on the same dataset)

In [None]:
# A single new dataset: cross-validation has to work with these rows only
X, y = generateData(prng, sample_size)
# Exercise skeleton: replace `# TBA` with a fitted GridSearchCV object.
# The cells below read `best_params_['alpha']`, `cv_results_['param_alpha']` and
# `split0_test_score` ... `split9_test_score`, and negate the scores to obtain MSE.
# NOTE(review): this presumably means a grid key named 'alpha', 10 folds and a
# negated-MSE scorer — confirm when filling this in.
lasso_cv = # TBA

In [None]:
# Parameter setting selected by the grid search
lasso_cv.best_params_

In [None]:
# Predictions for the rows of the dataset generated for the grid search
lasso_cv.predict(X)

In [None]:
# Report the parameters of the model selected by the grid search.
# All three numbers are rounded to 3 decimals so the coefficients are shown
# with the same precision as the intercept (and as the rest of the notebook).
best_lasso = lasso_cv.best_estimator_
print(f"Coefficients of the best model: beta_1={best_lasso.coef_[0]:.3f}, beta_2={best_lasso.coef_[1]:.3f}")
print(f"Intercept of the best model: {best_lasso.intercept_:.3f}")


In [None]:
# Plot the cross-validated error for every penalty value: one faint line per
# fold plus the mean across folds.
cv_results = lasso_cv.cv_results_

# Ask the fitted search how many folds it used instead of assuming 10, so the
# loop and the title stay correct for any `cv` setting.
n_folds = lasso_cv.n_splits_

# Extract alpha values and mean scores.
# `param_alpha` is a masked array; cast its data to float so it plots as numbers.
# NOTE: this rebinds `alphas`, which the Monte Carlo section above also uses.
alphas = np.asarray(cv_results['param_alpha'].data, dtype=float)
# Scores are negated to get MSE (assumes a negated-MSE scorer — confirm
# against the GridSearchCV call).
mean_scores = -cv_results['mean_test_score']

plt.figure(figsize=(12, 7))

# Plot individual fold scores
for fold in range(n_folds):
    fold_scores = -cv_results[f'split{fold}_test_score']  # Negate to get MSE
    plt.plot(alphas, fold_scores, 'o-', alpha=0.4, linewidth=1, label=f'Fold {fold + 1}')

# Plot the mean score with heavier line
plt.plot(alphas, mean_scores, 'o-', linewidth=3, color='black', label='Mean')

# Mark the best alpha (label shortened to 3 significant digits for readability)
best_alpha = lasso_cv.best_params_['alpha']
plt.axvline(x=best_alpha, color='darkred', linestyle='--', label=f'Best alpha: {best_alpha:.3g}')

plt.xlabel("Penalty parameter")
plt.ylabel("Mean Squared Error (MSE)")
plt.title(f"Cross-validation scores across all {n_folds} folds")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Here we can only cook from the given data (n=20). In the previous example, we repeated the experiment 1000 times, allowing us to measure the average performance of the models across many possible datasets (so we basically cooked from 1000x20 data points). In contrast, cross-validation works with only the single available dataset, which better reflects the real life situation where we must estimate performance using only the data at hand.