# Feature Selection 

This step of the pipeline selects the features that will be used in our analysis. Feature selection is a broad topic, and the most suitable method depends on the type of problem. For the Ames housing prices problem, we select features with Lasso (L1-regularised) regression: the L1 penalty shrinks the coefficients of uninformative features to zero, so those features can be discarded.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [3]:
# Locations of the train and test sets, relative to the notebook.
TRAIN_PATH = "xtrain.csv"
TEST_PATH = "xtest.csv"

# NOTE(review): the values displayed below look already encoded and scaled,
# so these files are presumably the output of an earlier feature-engineering
# step -- confirm against the notebook that writes them.
x_train = pd.read_csv(TRAIN_PATH)
x_test = pd.read_csv(TEST_PATH)

# Show the last rows as a quick sanity check of what was loaded.
x_train.tail()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1309,764,12.727838,0.487992,0.75,0.504203,0.38782,1.0,1.0,0.0,0.333333,...,0.375,0.0,0.0,0.75,1.0,0.0,0.783092,0.75,0.666667,0.75
1310,836,11.759786,0.0,0.75,0.388581,0.391317,1.0,1.0,0.0,0.333333,...,0.0,0.0,0.0,0.75,1.0,0.0,0.278943,1.0,0.666667,0.75
1311,1217,11.626254,0.668095,0.25,0.434909,0.377157,1.0,1.0,0.0,0.333333,...,0.0,0.0,0.0,0.75,1.0,0.0,0.557886,1.0,0.666667,0.75
1312,560,12.363076,0.795881,0.75,0.388581,0.176055,1.0,1.0,0.0,0.333333,...,0.0,0.0,0.0,0.75,1.0,0.0,0.926628,0.0,0.666667,0.75
1313,685,12.305918,0.487992,0.75,0.376033,0.500493,1.0,1.0,1.0,0.333333,...,0.0,0.0,0.0,0.75,1.0,0.0,0.721057,1.0,0.666667,0.75


In [19]:
# Columns that must not be used as predictors: the row identifier and the
# target itself.
NON_FEATURE_COLS = ['Id', 'SalePrice']

# Separate the target from the predictors.
#
# The frames are rebound to new objects rather than mutated with
# inplace=True, and each split is guarded: once this cell has run,
# 'SalePrice' is no longer a column, so an unguarded second execution would
# raise KeyError. With the guard, re-running the cell is a no-op and the
# previously extracted y_train / y_test are kept.
if 'SalePrice' in x_train.columns:
    y_train = x_train['SalePrice']
    x_train = x_train.drop(NON_FEATURE_COLS, axis=1)

if 'SalePrice' in x_test.columns:
    y_test = x_test['SalePrice']
    x_test = x_test.drop(NON_FEATURE_COLS, axis=1)

# (rows, predictors) remaining in the training set.
x_train.shape

(1314, 79)

## Lasso Regression

We now fit a Lasso regression on the training set and keep only the features whose coefficients are not shrunk to zero.

In [20]:
# Lasso is a linear model with an L1 penalty; alpha sets the strength of that
# penalty. random_state is pinned so the fit is reproducible.
lasso = Lasso(alpha=0.005, random_state=0)

# SelectFromModel fits the wrapped estimator and then exposes which input
# features it considers important (see get_support()).
select = SelectFromModel(lasso)

# fit() returns the selector itself, so the cell displays its configuration.
select.fit(x_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [22]:
# Boolean mask with one entry per column of x_train, in column order:
# True means the selector kept that feature.
support_mask = select.get_support()
support_mask

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False,  True, False,  True,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False])

In [31]:
# Summarise the outcome of the selection: how many features went in, how many
# were kept and how many were dropped.

# Column labels for which the selector's support mask is True.
selected_feat = x_train.columns[select.get_support()]

n_total = x_train.shape[1]
n_selected = len(selected_feat)

print(f'total features: {n_total}')
print(f'selected features: {n_selected}')
print(f'features discarded : {n_total - n_selected}')

total features: 79
selected features: 22
features discarded : 57


In [42]:
# Names of the features retained by the selector.
#
# The mask is taken from select.get_support() so that this list always agrees
# with the counts reported from the same mask. Re-deriving it as
# `select.estimator_.coef_ != 0` is not guaranteed to match: with
# threshold=None and an L1-penalised estimator, SelectFromModel applies a
# small non-zero threshold rather than an exact-zero test, so a coefficient
# that is tiny but non-zero would be listed here yet not counted as selected.
selected_feats = x_train.columns[select.get_support()]
selected_feats

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'YearRemodAdd',
       'RoofStyle', 'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtExposure',
       'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'PavedDrive'],
      dtype='object')

In [43]:
# Persist the selected feature names, one per row and without the index, so
# that later steps can reload the same feature list.
# NOTE(review): the default of `header` for Series.to_csv differs between
# pandas versions, so the file may or may not start with a header row --
# confirm what the code that reads this file expects before pinning it.
selected_series = pd.Series(selected_feats)
selected_series.to_csv('selected_features.csv', index=False)