In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import csv

# Lift pandas' display truncation so wide/long frames are shown in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Read the train and test tables from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_normalized.csv', 'test_normalized.csv')
)

## Feature selection


In [3]:

# Stack the training rows (minus the target column) on top of the test rows
# so both sets can be inspected as one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, returns a new frame with a fresh 0..n-1 index.
df_all = pd.concat(
    [df_train.drop(columns=['SalePrice']), df_test],
    ignore_index=True,
)

#df_all.describe(include='all')

In [4]:
# Predictors: every training column except the row identifier and the target.
x = df_train.drop(columns=['Id', 'SalePrice'])

# Test predictors: the test frame without the row identifier.
x_test = df_test.drop(columns=['Id'])

# Target values used by every selector below.
y = df_train['SalePrice']

## Feature selection using:

1 Filter-based techniques
* Pearson correlation
* chi-square correlation

2 Wrapper-based techniques
* recursive feature elimination (RFE)

3 Embedded techniques
* L1-regularised (lasso-style) logistic regression

In [14]:
# Number of features each selector is asked to keep.
num_feats = 40
# Predictor column names, in the column order of `x`.
feature_name = list(x.columns)

In [15]:
#Pearson correlation
def cor_selector(X, y,num_feats):
    """Pick the features with the largest absolute Pearson correlation to y.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one column each. Every column is passed to
        ``np.corrcoef`` together with ``y``.
    y : array-like
        Target values, same length as the rows of ``X``.
    num_feats : int
        How many features to keep. Values above the number of columns keep
        every column; zero or negative values keep none.

    Returns
    -------
    cor_support : list of bool
        One flag per column of ``X`` (in column order), True when selected.
    cor_feature : list
        Selected column names, ordered by increasing absolute correlation.
    """
    columns = X.columns.tolist()

    # A constant column has zero variance, so its coefficient is NaN and numpy
    # warns about the division; the NaN is mapped to 0 right below.
    with np.errstate(divide='ignore', invalid='ignore'):
        cor_list = [np.corrcoef(X[col], y)[0, 1] for col in columns]
    cor_list = [0 if np.isnan(cor) else cor for cor in cor_list]

    # Clamp the request so that 0 really means "nothing": a plain [-0:] slice
    # would return every column instead.
    keep = max(0, min(int(num_feats), len(columns)))
    order = np.argsort(np.abs(cor_list))
    cor_feature = [columns[i] for i in order[len(order) - keep:]]

    # Build the mask from X's own columns rather than a notebook-level name
    # list, so the function works for any frame passed in.
    selected = set(cor_feature)
    cor_support = [col in selected for col in columns]
    return cor_support, cor_feature

# Run the Pearson filter on the training predictors and report how many
# features it kept.
cor_support, cor_feature = cor_selector(X=x, y=y, num_feats=num_feats)
print(len(cor_feature), 'selected features')


40 selected features


In [16]:
# Chi-square filter: keep the num_feats features with the highest chi2 score.
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Rescale every predictor with MinMaxScaler before scoring; X_norm is reused
# by the selectors in the following cells.
X_norm = MinMaxScaler().fit_transform(x)

chi_selector = SelectKBest(score_func=chi2, k=num_feats)
chi_selector.fit(X_norm, y)

# Boolean mask over the columns of x, and the matching column names.
chi_support = chi_selector.get_support()
chi_feature = x.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')


40 selected features


In [18]:
# Recursive feature elimination (RFE) wrapped around a logistic regression.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# step=10 removes ten features per round until num_feats remain;
# verbose=5 makes RFE print a progress line for each round.
rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=num_feats,
    step=10,
    verbose=5,
)
rfe_selector.fit(X_norm, y)

# Boolean mask over the columns of x, and the matching column names.
rfe_support = rfe_selector.get_support()
rfe_feature = x.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

Fitting estimator with 80 features.
Fitting estimator with 70 features.
Fitting estimator with 60 features.
Fitting estimator with 50 features.
40 selected features


In [19]:
# Embedded selection: L1-penalised (lasso-style) logistic regression.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# The solver is pinned because penalty="l1" is rejected by 'lbfgs', the
# default solver since scikit-learn 0.22; 'liblinear' supports the L1 penalty.
# NOTE(review): y is SalePrice, so the classifier treats each distinct price as
# a class; an L1-penalised regressor may be the better fit here - confirm.
embeded_lr_selector = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear"),
    max_features=num_feats,
)
embeded_lr_selector.fit(X_norm, y)

# Boolean mask over the columns of x, and the matching column names.
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = x.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

40 selected features


In [20]:
# Put all selections together: one row per feature, one boolean column per
# selection method.
feature_selection_df = pd.DataFrame(
    {'Feature':feature_name, 'Pearson':cor_support,
    'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support}
)

# Count how many methods selected each feature. Only the boolean vote columns
# are summed: including the string 'Feature' column raises a TypeError on
# pandas 2.x, where non-numeric columns are no longer dropped silently.
feature_selection_df['Total'] = (
    feature_selection_df[['Pearson', 'Chi-2', 'RFE', 'Logistics']].sum(axis=1)
)
# Rank by vote count (ties broken by feature name, both descending), number
# the rows from 1, and display the top num_feats.
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Total
1,YearRemodAdd,True,True,True,True,4
2,Neighborhood,True,True,True,True,4
3,MasVnrType,True,True,True,True,4
4,MSSubclass_category,True,True,True,True,4
5,KitchenQual,True,True,True,True,4
6,HalfBath,True,True,True,True,4
7,GarageFinish,True,True,True,True,4
8,Foundation,True,True,True,True,4
9,Fireplaces,True,True,True,True,4
10,BsmtFinType1,True,True,True,True,4


In [21]:
# Names of the num_feats top-ranked features, in ranking order.
feature_selection_df['Feature'].head(num_feats).tolist()

['YearRemodAdd',
 'Neighborhood',
 'MasVnrType',
 'MSSubclass_category',
 'KitchenQual',
 'HalfBath',
 'GarageFinish',
 'Foundation',
 'Fireplaces',
 'BsmtFinType1',
 'BsmtExposure',
 '2ndFlrSF',
 'YrSold',
 'SaleCondition',
 'PavedDrive',
 'MSZoning',
 'LotConfig',
 'HouseStyle',
 'HeatingQC',
 'GarageYrBlt',
 'GarageCars',
 'FullBath',
 'FireplaceQu',
 'Fence',
 'Exterior2nd',
 'Exterior1st',
 'Electrical',
 'CentralAir',
 'BsmtQual',
 'YearBuilt',
 'WoodDeckSF',
 'Utilities',
 'Street',
 'RoofMatl',
 'OpenPorchSF',
 'MoSold',
 'MasVnrArea',
 'MSSubClass',
 'LotShape',
 'LandSlope']