In [28]:
import numpy as np
import pandas as pd 


In [29]:
# Simulate t observations of p regressors that all load on one common factor.
p, t = 12, 250
rg = np.random.default_rng(20201201)

# Draw order matters for reproducibility: the (t, p) column-specific shocks
# come first, then one shock per row that is broadcast across every column.
x_values = rg.standard_normal((t, p))
x_values = x_values + rg.standard_normal((t, 1))
x = pd.DataFrame(x_values, columns=[f"x{j}" for j in range(1, p + 1)])

# True coefficients are evenly spaced between 0.01 and 0.10, one per column.
beta = np.linspace(0.01, 0.10, num=p)
print(beta)

# The response is the linear signal plus standard-normal noise.
y = x @ beta + rg.standard_normal(t)


[0.01       0.01818182 0.02636364 0.03454545 0.04272727 0.05090909
 0.05909091 0.06727273 0.07545455 0.08363636 0.09181818 0.1       ]


In [30]:
from itertools import combinations

# Preview the first two column subsets of every size from 1 to p.
for size in range(1, p + 1):
    # zip against range(2) stops after two subsets (or earlier when only one
    # subset of this size exists), so no counter or break is needed.
    for _, subset in zip(range(2), combinations(x.columns, size)):
        print(subset)


('x1',)
('x2',)
('x1', 'x2')
('x1', 'x3')
('x1', 'x2', 'x3')
('x1', 'x2', 'x4')
('x1', 'x2', 'x3', 'x4')
('x1', 'x2', 'x3', 'x5')
('x1', 'x2', 'x3', 'x4', 'x5')
('x1', 'x2', 'x3', 'x4', 'x6')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6')
('x1', 'x2', 'x3', 'x4', 'x5', 'x7')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x8')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x9')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x10')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x11')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x12')
('x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12')


In [31]:
from numpy.linalg import lstsq

# Best-subset search: for every model size i, keep the column subset with the
# smallest in-sample sum of squared errors (SSE).
best_models = {}
for i in range(1, p+1):
    best_sse = np.inf
    # Every subset of size i is evaluated; the search is exhaustive.
    for comb in combinations(x.columns, i):
        reg = x[list(comb)]
        # Least-squares fit on the selected columns only (no constant added).
        beta = lstsq(reg, y, rcond=None)[0]
        resid = y - reg@beta
        sse = resid@resid
        # Strict inequality: on ties the first subset encountered is kept.
        if sse < best_sse:
            best_sse = sse
            best_models[i] = list(comb)
pd.Series(best_models)

1                                                  [x8]
2                                             [x8, x11]
3                                        [x8, x11, x12]
4                                   [x8, x10, x11, x12]
5                               [x2, x8, x10, x11, x12]
6                           [x2, x5, x8, x10, x11, x12]
7                       [x1, x2, x5, x8, x10, x11, x12]
8                   [x1, x2, x4, x5, x8, x10, x11, x12]
9               [x1, x2, x4, x5, x8, x9, x10, x11, x12]
10          [x1, x2, x4, x5, x6, x8, x9, x10, x11, x12]
11      [x1, x2, x3, x4, x5, x6, x8, x9, x10, x11, x12]
12    [x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,...
dtype: object

In [32]:
def xval_5fold(y, x, random=False,seed=20201231):
    """Return the total out-of-sample SSE from 5-fold cross-validation.

    The rows are split into five consecutive blocks. For each block, a
    least-squares fit is computed on the remaining rows and the squared
    prediction errors on the held-out block are accumulated.

    Parameters
    ----------
    y : array-like
        Dependent variable; converted with ``np.asarray``.
    x : array-like
        Regressors, one row per observation; converted with ``np.asarray``.
    random : bool, default False
        When True, rows of ``y`` and ``x`` are jointly permuted before the
        blocks are formed.
    seed : int, default 20201231
        Seed of the generator used for the permutation; only used when
        ``random`` is True.

    Returns
    -------
    float
        Sum over the five folds of the held-out squared residuals.
    """
    targets = np.asarray(y)
    features = np.asarray(x)
    nobs = targets.shape[0]

    if random:
        shuffler = np.random.default_rng(seed)
        order = shuffler.permutation(np.arange(nobs))
        targets, features = targets[order], features[order]

    # Six boundaries delimit the five consecutive hold-out blocks.
    block = nobs / 5.0
    edges = [int(np.round(j * block)) for j in range(6)]

    rows = np.arange(nobs)
    total = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        # Training rows are everything outside the half-open block [lo, hi).
        in_sample = (rows < lo) | (rows >= hi)
        coef = lstsq(features[in_sample], targets[in_sample], rcond=None)[0]
        errors = targets[lo:hi] - features[lo:hi] @ coef
        total += errors @ errors
    return total


# Cross-validated SSE of the best two-regressor model, using the default
# (unshuffled) consecutive folds.
xval_5fold(y, x[best_models[2]])

262.6370197265028

In [33]:
# Same model, but rows are permuted (default seed) before the folds are formed.
xval_5fold(y, x[best_models[2]], random=True)

256.19356700019455

In [34]:
# Cross-validated SSE of the best subset of each size (consecutive folds).
sse_xv = pd.Series(
    {k: xval_5fold(y, x[cols]) for k, cols in best_models.items()}
)

print(f"The minimum k is {sse_xv.idxmin()}")
print(f"The selected model uses {best_models[sse_xv.idxmin()]}")
sse_xv

The minimum k is 4
The selected model uses ['x8', 'x10', 'x11', 'x12']


1     281.207297
2     262.637020
3     256.749977
4     256.076389
5     258.494733
6     258.724487
7     260.809108
8     261.440586
9     263.639740
10    264.776211
11    266.793541
12    269.867570
dtype: float64

In [35]:
# Cross-validated SSE of the best subset of each size, with rows shuffled
# before the folds are formed.
sse_xv = pd.Series(
    {
        size: xval_5fold(y, x[columns], random=True)
        for size, columns in best_models.items()
    }
)
print(f"The minimum k is {sse_xv.idxmin()}")
print(f"The selected model uses {best_models[sse_xv.idxmin()]}")
sse_xv

The minimum k is 4
The selected model uses ['x8', 'x10', 'x11', 'x12']


1     274.022927
2     256.193567
3     252.000777
4     250.933807
5     251.097401
6     251.149678
7     251.447727
8     253.004921
9     255.456716
10    257.813521
11    258.619432
12    260.983646
dtype: float64

In [36]:
# Forward stepwise selection: start with no regressors and, on each pass, add
# the excluded column that yields the lowest in-sample SSE when fitted
# together with the columns already chosen.
included = []
forward_models = {}
p = x.shape[1]
for i in range(p):
    excluded = [col for col in x if col not in included]
    best_sse = np.inf
    for col in excluded:
        try_x = x[included + [col]]
        beta = lstsq(try_x, y, rcond=None)[0]
        resid = y - try_x @ beta
        sse = resid @ resid
        # Strict inequality: on ties the first candidate encountered is kept.
        if sse < best_sse:
            best_sse = sse
            next_var = col
    included.append(next_var)
    # Keep a snapshot of the selected columns for every model size.
    forward_models[len(included)] = included[:]
print(included)


['x8', 'x11', 'x12', 'x10', 'x2', 'x5', 'x1', 'x4', 'x9', 'x6', 'x3', 'x7']


In [37]:
# Cross-validated SSE along the selection path: the model of size k uses the
# first k entries of `included`.
fsr_sse_sv = pd.Series(
    {size: xval_5fold(y, x[included[:size]]) for size in range(1, p + 1)}
)
print(f"The minimum k is {fsr_sse_sv.idxmin()}")
print(f"The selected variables are {included[:fsr_sse_sv.idxmin()]}")
fsr_sse_sv

The minimum k is 4
The selected variables are ['x8', 'x11', 'x12', 'x10']


1     281.207297
2     262.637020
3     256.749977
4     256.076389
5     258.494733
6     258.724487
7     260.809108
8     261.440586
9     263.639740
10    264.776211
11    266.793541
12    269.867570
dtype: float64

In [38]:
# Backward stepwise selection: start from all columns and, on each pass, drop
# the column whose removal leaves the lowest in-sample SSE.
included = list(x.columns)
removed = []
backward_models = {}
p = x.shape[1]
for i in range(p):
    # Keep a snapshot of the current column set, keyed by its size, before
    # anything is dropped on this pass.
    backward_models[len(included)] = included[:]
    best_sse = np.inf
    for col in included:
        # Candidate model: the current columns without `col`.
        try_col = included[:]
        try_col.remove(col)
        try_x = x[try_col]
        beta = lstsq(try_x, y, rcond=None)[0]
        resid = y - try_x @ beta
        sse = resid @ resid
        # Strict inequality: on ties the first candidate encountered is kept.
        if sse < best_sse:
            best_sse = sse
            next_drop = col
    removed.append(next_drop)
    included.remove(next_drop)
print(removed)


['x7', 'x3', 'x6', 'x9', 'x4', 'x1', 'x5', 'x2', 'x10', 'x12', 'x11', 'x8']


In [39]:
# Reverse the removal order so the last column dropped comes first; the model
# of size k is then the first k entries of `included`.
included = list(reversed(removed))

bsr_sse_sv = pd.Series(
    {size: xval_5fold(y, x[included[:size]]) for size in range(1, p + 1)}
)
print(f"The minimum k is {bsr_sse_sv.idxmin()}")
print(f"The selected variables are {included[:bsr_sse_sv.idxmin()]}")
bsr_sse_sv

The minimum k is 4
The selected variables are ['x8', 'x11', 'x12', 'x10']


1     281.207297
2     262.637020
3     256.749977
4     256.076389
5     258.494733
6     258.724487
7     260.809108
8     261.440586
9     263.639740
10    264.776211
11    266.793541
12    269.867570
dtype: float64

## Using scikit-learn to cross-validate

In [43]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Linear model without an intercept.
lr = LinearRegression(fit_intercept=False)

# No `cv` argument is passed, so scikit-learn's default splitter is used.
# Scores are negated MSEs, hence the best model has the largest value.
bsr_mse_sv = pd.Series(
    {
        size: cross_val_score(
            lr, x[included[:size]], y, scoring="neg_mean_squared_error"
        ).mean()
        for size in range(1, p + 1)
    }
)
print(f"The maximum k is {bsr_mse_sv.idxmax()}")
print(f"The selected variables are {included[:bsr_mse_sv.idxmax()]}")
bsr_mse_sv

The maximum k is 4
The selected variables are ['x8', 'x11', 'x12', 'x10']


1    -1.124829
2    -1.050548
3    -1.027000
4    -1.024306
5    -1.033979
6    -1.034898
7    -1.043236
8    -1.045762
9    -1.054559
10   -1.059105
11   -1.067174
12   -1.079470
dtype: float64

In [41]:
from sklearn.model_selection import KFold
# Linear model without an intercept, scored with five shuffled folds whose
# assignment is fixed by the random_state.
lr = LinearRegression(fit_intercept=False)
cv = KFold(5, shuffle=True, random_state=20201231)

# Scores are negated MSEs, hence the best model has the largest value.
bsr_mse_sv = pd.Series(
    {
        size: cross_val_score(
            lr, x[included[:size]], y, scoring="neg_mean_squared_error", cv=cv
        ).mean()
        for size in range(1, p + 1)
    }
)
print(f"The maximum k is {bsr_mse_sv.idxmax()}")
print(f"The selected variables are {included[:bsr_mse_sv.idxmax()]}")


The maximum k is 3
The selected variables are ['x8', 'x11', 'x12']
