In [1]:
import pandas as pd

In [2]:
d = pd.read_csv('auto-mpg.csv')

In [3]:
X = d.drop('mpg', axis='columns')
y = d.mpg

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [5]:
model = make_pipeline(StandardScaler(), LinearRegression())

In [6]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [11]:
from itertools import combinations
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
from numpy.typing import ArrayLike
from typing import List

In [90]:
class Solution:
    features: List[int]
    mse: float

    def __init__(self, y: ArrayLike):
        self.features = list()
        self.mse = mean_squared_error(y, [y.mean()]*len(y))

    def update(self, features: List[int], mse: float) -> bool:
        if(mse < self.mse):
            print(', '.join(str(x) for x in features))
            print(f"\tNew Error: {mse:.3f} better than {self.mse:.3f}")
            self.features = features
            self.mse = mse
            return True

        return False

## Feature Selection by complete enumeration

In [94]:
best = Solution(y)
features = X.columns
features

Index(['cylinders', 'displacement', 'hp', 'weight', 'acceleration', 'year',
       'origin'],
      dtype='object')

In [95]:
for n_features in range(1,len(features)+1):
    subsets = combinations(features, n_features)

    for subset in subsets:
        Xr = X[list(subset)]
        mses = cross_val_score(estimator=model, X=Xr, y=y, cv=kfold, scoring='neg_mean_squared_error')
        mse = -np.average(mses)
        best.update(features=subset, mse=mse)

cylinders
	New Error: 24.131 better than 60.763
displacement
	New Error: 21.525 better than 24.131
weight
	New Error: 18.833 better than 21.525
cylinders, weight
	New Error: 18.605 better than 18.833
cylinders, year
	New Error: 17.465 better than 18.605
displacement, year
	New Error: 15.979 better than 17.465
weight, year
	New Error: 11.847 better than 15.979
weight, year, origin
	New Error: 11.448 better than 11.847
hp, weight, year, origin
	New Error: 11.439 better than 11.448
displacement, hp, weight, year, origin
	New Error: 11.356 better than 11.439
cylinders, displacement, hp, weight, year, origin
	New Error: 11.352 better than 11.356


In [31]:
print('Selected features: ', end='')
print(', '.join([str(x) for x in best.features]))

Selected features: cylinders, displacement, hp, weight, year, origin


## Forward stepwise selection

In [92]:
all_features = features
current_features = list()
best = Solution(y)

In [93]:
while len(current_features) != len(all_features):
    selected_feature = None

    for feature in set(all_features) - set(current_features):
        new_features = current_features + [feature]
        Xr = X[new_features]
        mses = cross_val_score(estimator=model, X=Xr, y=y, cv=kfold, scoring='neg_mean_squared_error')
        mse = -np.average(mses)

        if(best.update(features=new_features, mse=mse)):
            selected_feature = feature
        
    if(selected_feature):
        current_features.append(selected_feature)
    else:
        break

displacement
	New Error: 21.525 better than 60.763
weight
	New Error: 18.833 better than 21.525
weight, cylinders
	New Error: 18.605 better than 18.833
weight, year
	New Error: 11.847 better than 18.605
weight, year, origin
	New Error: 11.448 better than 11.847
weight, year, origin, hp
	New Error: 11.439 better than 11.448
weight, year, origin, hp, displacement
	New Error: 11.356 better than 11.439
weight, year, origin, hp, displacement, cylinders
	New Error: 11.352 better than 11.356


## Backward Stepwise selection

In [88]:
all_features = features
current_features = all_features
best = Solution(y)

In [89]:
while len(current_features)>0:
    selected_feature = None

    for feature in current_features:
        new_features = current_features.drop(feature)
        Xr = X[new_features]
        mses = cross_val_score(estimator=model, X=Xr, y=y, cv=kfold, scoring='neg_mean_squared_error')
        mse = -np.average(mses)

        if(best.update(features=new_features, mse=mse)):
            selected_feature = feature
        
    if(selected_feature):
        current_features = current_features.drop(selected_feature)
    else:
        break

displacement, hp, weight, acceleration, year, origin
	New Error: 11.547
cylinders, displacement, weight, acceleration, year, origin
	New Error: 11.506
cylinders, displacement, hp, weight, year, origin
	New Error: 11.352
