## Model Selection Demo:

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from itertools import combinations

In [3]:
# Simulate data
np.random.seed(0)
n = 100
p = 5
X = np.random.randn(n, p)
beta = np.array([3, 2, 0, 0, 0])  # Only two variables are nonzero
y = X @ beta + np.random.randn(n) * 0.25

# Add a constant to X for intercept
X = sm.add_constant(X)



In [4]:
# Best subset selection
def best_subset_selection(X, y):
    n, p = X.shape
    models = []
    
    for k in range(1, p + 1):  # Iterate over subset sizes
        for combo in combinations(range(1, p), k):  # Generate combinations of predictors
            combo = (0,) + combo  # Include the intercept
            X_subset = X[:, combo]
            model = sm.OLS(y, X_subset).fit()
            models.append((model, combo))
    
    return models


In [5]:

# Calculate metrics
def calculate_metrics(model, X, y):
    n = len(y)
    k = model.df_model  # Number of predictors, excluding intercept


    
    # AIC
    aic = model.aic
    
    # BIC
    bic = model.bic
    
    
    # PRESS (Prediction Sum of Squares)
    hat_matrix = X @ np.linalg.inv(X.T @ X) @ X.T
    residuals = model.resid
    press = np.sum((residuals / (1 - np.diag(hat_matrix))) ** 2)
    
    # Adjusted R-squared
    r2 = model.rsquared
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k)
    
    return aic, bic, press, adj_r2, int(k) #dont consider intercept as a predictor



In [6]:
# Run best subset selection
models = best_subset_selection(X, y)

In [7]:
# Store results in pd DataFrame
results = []
for model, combo in models:
    aic, bic, press, adj_r2, num_predictors = calculate_metrics(model, X[:, combo], y)
    results.append({
        'Predictors': combo,
        'n_Predictors': num_predictors,
        'AIC': aic,
        'BIC': bic,
        'PRESS': press,
        'Adjusted R^2': adj_r2
    })

# Convert results to pd DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='n_Predictors').reset_index(drop=True)


In [8]:
# Display our results
pd.set_option('display.max_columns', None)  # Show all columns
results_df #0 represents intercept – all models include intercept

Unnamed: 0,Predictors,n_Predictors,AIC,BIC,PRESS,Adjusted R^2
0,"(0, 1)",1,425.816975,431.027315,413.115577,0.683503
1,"(0, 2)",1,512.08266,517.293,978.576806,0.250078
2,"(0, 3)",1,539.584614,544.794955,1291.751212,0.012685
3,"(0, 4)",1,540.167897,545.378237,1305.152146,0.006909
4,"(0, 5)",1,540.846176,546.056517,1313.870858,0.00015
5,"(0, 4, 5)",2,542.117139,549.93265,1338.387716,-0.002715
6,"(0, 3, 5)",2,541.579888,549.395398,1323.6626,0.002657
7,"(0, 3, 4)",2,540.865588,548.681098,1317.157514,0.009756
8,"(0, 2, 5)",2,513.14799,520.9635,997.590225,0.249473
9,"(0, 2, 4)",2,513.694881,521.510392,1001.707161,0.245357
