# Feature Selection Example

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from error_metrics import print_regression_error_metrics

# Read in data
cars = pd.read_csv('./data/cars.csv')
data_x = cars[list(cars)[1:]]
data_y = cars['mpg']

# Create training and test splits
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, random_state = 4)

In [15]:
base_model = linear_model.LinearRegression()
base_model.fit(x_train, y_train)
preds = base_model.predict(x_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})

MSE, MAE, R^2, EVS for feature set [24.51493293197121, 4.3870307737591006, 0.636872671247911, 0.767292589077751]


Unnamed: 0,Actual,Predicted
20,21.5,25.161462
15,10.4,8.551049
17,32.4,25.465516
2,22.8,24.508777
11,16.4,12.012969
19,33.9,27.86578
16,14.7,7.763519


### Use f-selection (top 25%)

In [16]:
selector_f = SelectPercentile(f_regression, percentile=25)
selector_f.fit(x_train, y_train)
for name, score, pv in zip(list(cars), selector_f.scores_, selector_f.pvalues_):
    print('F-score, p-value, ('+ name + '): '+ str(score)+ ', '+ str(pv))

F-score, p-value, (mpg): 72.60897277172427, 1.4285485646903708e-08
F-score, p-value, (cyl): 52.37926936274743, 2.288405556460013e-07
F-score, p-value, (disp): 28.728860103230335, 1.918656593805583e-05
F-score, p-value, (hp): 14.982418395530182, 0.0007751208908700202
F-score, p-value, (drat): 95.9791939506271, 1.1242991837306483e-09
F-score, p-value, (wt): 3.0574333594941234, 0.09370523006136659
F-score, p-value, (qsec): 12.843088079770348, 0.0015714445870867975
F-score, p-value, (vs): 8.266762931256759, 0.008549929449842912
F-score, p-value, (am): 6.894741767136821, 0.01511038230276851
F-score, p-value, (gear): 5.8645189056937435, 0.023739272527108935


In [17]:
# Get the feature-selected data
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Build and test the new model with the selected features
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})

MSE, MAE, R^2, EVS for feature set [19.988618207227002, 4.1125419122460265, 0.6673504425122332, 0.7324619718569146]


Unnamed: 0,Actual,Predicted
20,21.5,25.612542
15,10.4,9.764495
17,32.4,26.195267
2,22.8,26.073464
11,16.4,13.10984
19,33.9,27.663263
16,14.7,9.835112


### Use f-selection (top 3 features)

In [18]:
# Create a feature selector to get the top 3 features by the F metric
selector_f = SelectKBest(f_regression, k=3)

# Get the feature-selected data
xt_train, xt_test = selector_f.fit_transform(x_train, y_train), selector_f.transform(x_test)

# Build and test the new model with the selected features
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})

MSE, MAE, R^2, EVS for feature set [19.988618207227002, 4.1125419122460265, 0.6673504425122332, 0.7324619718569146]


Unnamed: 0,Actual,Predicted
20,21.5,25.612542
15,10.4,9.764495
17,32.4,26.195267
2,22.8,26.073464
11,16.4,13.10984
19,33.9,27.663263
16,14.7,9.835112


### Use Recursive Feature Elimination with Cross Validaton 

In [20]:
# Build the RFECV selector with 5-fold CV and the R-squared score as the metric
selector_f = RFECV(estimator=linear_model.LinearRegression(), cv=5, scoring=make_scorer(r2_score))
selector_f.fit(x_train, y_train)

# Get the feature-selected data
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Build and test the new model with the selected features
model = linear_model.LinearRegression()
model.fit(xt_train, y_train)
preds = model.predict(xt_test)
print_regression_error_metrics(y_test, preds)
pd.DataFrame({'Actual':y_test, 'Predicted':preds})

MSE, MAE, R^2, EVS for feature set [20.98193480610998, 4.632496505789522, 0.6879483251172415, 0.7495733392019129]


Unnamed: 0,Actual,Predicted
20,21.5,26.132497
15,10.4,8.407512
17,32.4,26.437259
2,22.8,25.950664
11,16.4,14.27784
19,33.9,27.91732
16,14.7,8.727854
