## Part 3: Feature Selection

### Remember: our goal is to predict the house sale price from a set of distinct features

- We will use the scikit-learn (`sklearn`) library — a Lasso regression wrapped in `SelectFromModel` — to select the most relevant features
- Dataset downloaded from: [House Data](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Feature selection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Lift the column display limit so every column of a dataframe is shown
pd.options.display.max_columns = None


In [2]:
# Load the training set from 'X_train.csv' and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer='X_train.csv')
dataset.head(n=5)

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.5,0.036765,0.098361,0.0,0.0,1.0,1.0,0.666667,0.1225,0.666667,1.0,1.0,0.75,0.75,0.25,1.0,0.125089,0.833333,0.0,0.064212,0.140098,1.0,1.0,1.0,1.0,0.356155,0.413559,0.0,0.577712,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.5,1.0,0.0,0.2,0.8,0.046729,0.666667,0.5,0.38646,0.666667,1.0,1.0,0.0,0.111517,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,1.0,0.5,0.0,0.5,0.2,1.0,0.75,0.6,0.555556,0.875,0.227941,0.52459,0.0,0.0,0.4,0.3,0.333333,0.0,0.333333,1.0,0.5,0.75,0.75,1.0,0.666667,0.173281,0.833333,0.0,0.121575,0.206547,1.0,1.0,1.0,1.0,0.503056,0.0,0.0,0.470245,0.0,0.5,0.666667,0.0,0.375,0.333333,0.333333,0.333333,1.0,0.333333,0.6,0.8,0.28972,0.666667,0.5,0.324401,0.666667,1.0,1.0,0.347725,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,1.0,0.0,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.5,0.051471,0.114754,0.0,0.0,1.0,1.0,0.666667,0.10125,0.666667,1.0,1.0,0.75,0.75,0.5,1.0,0.086109,0.833333,0.0,0.185788,0.150573,1.0,1.0,1.0,1.0,0.383441,0.41937,0.0,0.593095,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.333333,1.0,0.333333,0.6,0.8,0.065421,0.666667,0.5,0.428773,0.666667,1.0,1.0,0.0,0.076782,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,1.0,0.25,0.0,0.727273,0.4,1.0,0.75,1.0,0.666667,0.5,0.669118,0.606557,0.0,0.0,0.2,0.4,0.333333,0.0,0.333333,1.0,0.25,0.5,1.0,0.25,0.666667,0.038271,0.833333,0.0,0.231164,0.123732,1.0,0.75,1.0,1.0,0.399941,0.366102,0.0,0.579157,0.333333,0.0,0.333333,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.333333,0.8,0.4,0.074766,0.333333,0.75,0.45275,0.666667,1.0,1.0,0.0,0.063985,0.492754,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,1.0,0.5,0.0,1.0,0.4,1.0,0.75,1.0,0.777778,0.5,0.058824,0.147541,0.0,0.0,1.0,1.0,0.666667,0.21875,0.666667,1.0,1.0,0.75,0.75,0.75,1.0,0.116052,0.833333,0.0,0.20976,0.187398,1.0,1.0,1.0,1.0,0.466237,0.509927,0.0,0.666523,0.333333,0.0,0.666667,0.5,0.5,0.333333,0.666667,0.583333,1.0,0.333333,0.6,0.8,0.074766,0.666667,0.75,0.589563,0.666667,1.0,1.0,0.224037,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0


In [3]:
# Split the loaded data into target and predictors:
#   - y_train holds only the dependent feature 'SalePrice' (kept as a one-column DataFrame)
#   - X_train is everything else; 'Id' is dropped as well because it is not useful

y_train = dataset.loc[:, ['SalePrice']]
X_train = dataset.drop(columns=['SalePrice', 'Id'])

In [None]:
# Feature selection step
# A Lasso regression model is used as the base estimator, with a chosen alpha.
# alpha drives how many features survive: the higher the alpha, the fewer features are selected.

# SelectFromModel wraps that estimator and keeps the features whose coefficients are non-zero.
# random_state is only the seed (any value works), but use the same value wherever this
# step is repeated, e.g. for the test data set.
# NOTE(review): per the scikit-learn docs, Lasso's random_state only takes effect with
# selection='random' -- confirm whether it matters here.

feature_selection_model = SelectFromModel(
    estimator=Lasso(alpha=0.005, random_state=0),
)
feature_selection_model.fit(X_train, y=y_train)

In [None]:
# Boolean mask over the features: True marks an important (kept) feature, False a less important one
feature_selection_model.get_support(indices=False)

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [None]:
# Summarise the outcome of the feature selection: total vs. selected vs. zeroed-out features

# Names of the columns that the selector's boolean mask marks as kept
selected_features = X_train.columns[feature_selection_model.get_support()]

# This is why feature selection matters: in the recorded run, 61 of the 82 features could be dismissed

# One print call, one line of output per statistic
print(
    f'Total number of features: {len(X_train.columns)}',
    f'Selected features: {selected_features.size}',
    f'Features with coefficients shrank to zero: {(feature_selection_model.estimator_.coef_ == 0).sum()}',
    sep='\n',
)

Total number of features: 82
Selected features: 21
Features with coefficients shrank to zero: 61


In [9]:
# These are the important features: the column names of X_train that the
# feature selection model's support mask marked as kept
selected_features

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'YearRemodAdd',
       'RoofStyle', 'BsmtQual', 'BsmtExposure', 'HeatingQC', 'CentralAir',
       '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive',
       'SaleCondition'],
      dtype='object')

In [12]:
# Keep only the selected columns; this reduced training set is the one used for model training

X_train = X_train.loc[:, selected_features]
X_train.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,HeatingQC,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,SaleCondition
0,0.235294,0.75,0.636364,0.666667,0.098361,0.0,0.75,0.25,1.0,1.0,0.356155,0.577712,0.333333,0.666667,0.0,0.2,0.8,0.666667,0.5,1.0,0.75
1,0.0,0.75,0.5,0.555556,0.52459,0.0,0.75,1.0,1.0,1.0,0.503056,0.470245,0.0,0.333333,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
2,0.235294,0.75,0.636364,0.666667,0.114754,0.0,0.75,0.5,1.0,1.0,0.383441,0.593095,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
3,0.294118,0.75,0.727273,0.666667,0.606557,0.0,0.5,0.25,0.75,1.0,0.399941,0.579157,0.333333,0.666667,0.333333,0.8,0.4,0.333333,0.75,1.0,0.0
4,0.235294,0.75,1.0,0.777778,0.147541,0.0,0.75,0.75,1.0,1.0,0.466237,0.666523,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.75,1.0,0.75
