## Feature Selection - Advanced House Price Prediction

- The main aim is to predict the house price based on various features

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# For feature selection 
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

import warnings
# NOTE(review): calling filterwarnings('ignore') with no category or message
# filter suppresses every warning from here on, which would also hide any
# warnings raised while fitting the model below. Consider narrowing this to the
# specific warning categories that are known to be noise.
warnings.filterwarnings('ignore')

In [4]:
# Read the training table from the notebook's working directory.
train_csv_path = 'x_train.csv'
dataset = pd.read_csv(train_csv_path)

In [5]:
# Preview the first five rows as a quick check that the file loaded.
dataset.head(n=5)

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.5,0.418208,0.366344,0.0,0.5,1.0,1.0,...,0.5,0.0,0.0,0.090909,0.5,1.0,0.5,0.0,0.0,0.0
1,2,12.109011,0.0,0.5,0.495064,0.391317,0.0,0.5,1.0,1.0,...,0.5,0.0,0.0,0.363636,0.25,1.0,0.5,0.0,0.0,0.0
2,3,12.317167,0.235294,0.5,0.434909,0.422359,0.0,0.5,0.0,1.0,...,0.5,0.0,0.0,0.727273,0.5,1.0,0.5,0.0,0.0,0.0
3,4,11.849398,0.294118,0.5,0.388581,0.390295,0.0,0.5,0.0,1.0,...,0.5,0.0,0.0,0.090909,0.0,1.0,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.5,0.513123,0.468761,0.0,0.5,0.0,1.0,...,0.5,0.0,0.0,1.0,0.5,1.0,0.5,0.0,0.0,0.0


In [6]:
# Dependent variable: the SalePrice column.
y_train = dataset['SalePrice']

# Independent variables: every column of `dataset` except 'Id' and the target.
x_train = dataset.drop(labels=['Id', 'SalePrice'], axis='columns')


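- `SelectFromModel` takes an estimator and keeps only the features that the fitted estimator weights as important. Lasso is one such estimator: its L1 penalty shrinks the coefficients of uninformative features to zero, so they are dropped.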

In [10]:
# Feature selection with an L1-penalised linear model:
#   1. Build a Lasso regression estimator.
#   2. Pick a suitable alpha value (the strength of the penalty).
#   3. The bigger the alpha, the fewer features will be selected.
#   4. SelectFromModel keeps the features whose Lasso coefficient is non-zero.
lasso_alpha = 0.005
lasso_estimator = Lasso(alpha=lasso_alpha, random_state=0)

feature_sel_model = SelectFromModel(lasso_estimator)
# Keep the fit call as the last expression so the fitted selector is displayed.
feature_sel_model.fit(x_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)

In [11]:
# Boolean mask with one entry per column the selector was fitted on:
#   True  -> the selector kept this feature (it is considered important)
#   False -> the selector dropped this feature (it can be skipped)
feature_sel_model.get_support()

array([False,  True, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False,  True, False,
       False,  True, False, False, False, False, False, False,  True,
       False, False,  True, False,  True,  True, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True, False,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [15]:
# Boolean mask with one entry per column the selector was fitted on.
support_mask = feature_sel_model.get_support()

# Rebuild the candidate column names from `dataset` instead of reading
# `x_train.columns`: a later cell rebinds `x_train` to the selected columns
# only, after which the full-length mask no longer lines up with
# x_train.columns and re-running this cell would fail. The dropped labels must
# match the ones removed when x_train was first built from `dataset`.
candidate_cols = dataset.columns.drop(['Id', 'SalePrice'])
assert len(candidate_cols) == len(support_mask), (
    'support mask length does not match the candidate feature columns'
)
selected_feat = candidate_cols[support_mask]

# let's print some stats
print('total features: {}'.format(len(candidate_cols)))
print('selected features: {}'.format(len(selected_feat)))

total features: 82
selected features: 20


In [16]:
# Display the names of the columns kept by the selector.
selected_feat

Index(['MSZoning', 'LotShape', 'BldgType', 'OverallQual', 'YearRemodAdd',
       'ExterQual', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC',
       'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual',
       'Fireplaces', 'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive'],
      dtype='object')

In [17]:
# Preview the first five rows restricted to the selected columns.
x_train[selected_feat].head(n=5)

Unnamed: 0,MSZoning,LotShape,BldgType,OverallQual,YearRemodAdd,ExterQual,BsmtQual,BsmtExposure,BsmtFinType1,HeatingQC,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,GarageType,GarageFinish,GarageCars,PavedDrive
0,0.5,1.0,0.0,0.666667,0.098361,0.333333,0.5,1.0,0.333333,0.0,1.0,0.356155,0.577712,0.333333,0.666667,0.0,0.0,0.666667,0.5,1.0
1,0.5,1.0,0.0,0.555556,0.52459,1.0,0.5,0.25,0.0,0.0,1.0,0.503056,0.470245,0.0,1.0,0.333333,0.0,0.666667,0.5,1.0
2,0.5,0.0,0.0,0.666667,0.114754,0.333333,0.5,0.75,0.333333,0.0,1.0,0.383441,0.593095,0.333333,0.666667,0.333333,0.0,0.666667,0.5,1.0
3,0.5,0.0,0.0,0.666667,0.606557,1.0,1.0,1.0,0.0,0.5,1.0,0.399941,0.579157,0.333333,0.666667,0.333333,0.6,1.0,0.75,1.0
4,0.5,0.0,0.0,0.777778,0.147541,0.333333,0.5,0.0,0.333333,0.0,1.0,0.466237,0.666523,0.333333,0.666667,0.333333,0.0,0.666667,0.75,1.0


In [18]:
# Rebind x_train to the selected columns only. From here on x_train holds fewer
# columns than the selector was fitted on, so x_train.columns can no longer be
# indexed with the selector's full-length support mask.
x_train = x_train[selected_feat]