# FEATURE SELECTION 2
<hr style="height:1px;border:none;color:#333;background-color:#333;" />
<br>

In [2]:
# import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel, f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso

In [3]:
pd.set_option('display.max_columns', None) # display all the columns of a dataframe
pd.set_option('display.max_rows', 100)     # display 100 rows of a dataframe

In [26]:
# Load cleaned data - see "Data Cleansing.ipynb"
dataset = pd.read_csv('../ariel/Ames_Housing_Price_Data_cleaned.csv')

<br><br>
## Encoding Categorical Features
<hr style="height:1px;border:none;color:#333;background-color:#333;" />

In [27]:
# set the following columns to object
dataset['MSSubClass'] = dataset['MSSubClass'].astype(object)
dataset['YearBuilt'] = dataset['YearBuilt'].astype(object)
dataset['YearRemodAdd'] = dataset['YearRemodAdd'].astype(object)
dataset['GarageYrBlt'] = dataset['GarageYrBlt'].astype(object)
dataset['MoSold'] = dataset['MoSold'].astype(object)
dataset['YrSold'] = dataset['YrSold'].astype(object)

In [28]:
# NaN values have a meaning
temp = pd.DataFrame(dataset.isna().sum())
temp[temp[0]>0].sort_values(0, ascending=False)

Unnamed: 0,0
PoolQC,2570
MiscFeature,2482
Alley,2411
Fence,2054
FireplaceQu,1241
GarageType,128
GarageYrBlt,128
GarageFinish,128
GarageQual,128
GarageCond,128


In [29]:
# filling all NaNs with "NA" (string)
dataset.fillna("NA", inplace=True)
# converting "NA" to zero in the GarageYrBlt column
dataset['GarageYrBlt'][dataset['GarageYrBlt']=="NA"] = 0

In [100]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2579 entries, 0 to 2578
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   GrLivArea      2579 non-null   int64  
 1   SalePrice      2579 non-null   int64  
 2   MSSubClass     2579 non-null   int64  
 3   MSZoning       2579 non-null   object 
 4   LotFrontage    2579 non-null   float64
 5   LotArea        2579 non-null   int64  
 6   Street         2579 non-null   object 
 7   Alley          2579 non-null   object 
 8   LotShape       2579 non-null   object 
 9   LandContour    2579 non-null   object 
 10  Utilities      2579 non-null   object 
 11  LotConfig      2579 non-null   object 
 12  LandSlope      2579 non-null   object 
 13  Neighborhood   2579 non-null   object 
 14  Condition1     2579 non-null   object 
 15  Condition2     2579 non-null   object 
 16  BldgType       2579 non-null   object 
 17  HouseStyle     2579 non-null   object 
 18  OverallQ

In [121]:
dataset_encoded = dataset.copy()

In [122]:
# List of categorical columns
object_columns = dataset_encoded.select_dtypes("object").columns

In [123]:
# Encode all categorical variables
for column_name in dataset_encoded.select_dtypes(include="object").columns:
    dataset_encoded = pd.concat([dataset_encoded, pd.get_dummies(dataset_encoded[column_name], prefix=column_name, drop_first=True)], axis=1)

In [124]:
# Remove original categorical columns
dataset_encoded = dataset_encoded.drop(object_columns, axis=1)

In [125]:
dataset_encoded

Unnamed: 0,GrLivArea,SalePrice,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_I (all),MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_NA,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSewr,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_Greens,Neighborhood_GrnHill,Neighborhood_IDOTRR,Neighborhood_Landmrk,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_PreCast,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_PreCast,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_NA,BsmtQual_Po,BsmtQual_TA,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_NA,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_NA,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NA,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NA,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_Po,KitchenQual_TA,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sal,Functional_Typ,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NA,FireplaceQu_Po,FireplaceQu_TA,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NA,GarageYrBlt_1895.0,GarageYrBlt_1900.0,GarageYrBlt_1906.0,GarageYrBlt_1908.0,GarageYrBlt_1910.0,GarageYrBlt_1914.0,GarageYrBlt_1915.0,GarageYrBlt_1916.0,GarageYrBlt_1917.0,GarageYrBlt_1918.0,GarageYrBlt_1920.0,GarageYrBlt_1921.0,GarageYrBlt_1922.0,GarageYrBlt_1923.0,GarageYrBlt_1924.0,GarageYrBlt_1925.0,GarageYrBlt_1926.0,GarageYrBlt_1927.0,GarageYrBlt_1928.0,GarageYrBlt_1929.0,GarageYrBlt_1930.0,GarageYrBlt_1931.0,GarageYrBlt_1932.0,GarageYrBlt_1933.0,GarageYrBlt_1934.0,GarageYrBlt_1935.0,GarageYrBlt_1936.0,GarageYrBlt_1937.0,GarageYrBlt_1938.0,GarageYrBlt_1939.0,GarageYrBlt_1940.0,GarageYrBlt_1941.0,GarageYrBlt_1942.0,GarageYrBlt_1943.0,GarageYrBlt_1945.0,GarageYrBlt_1946.0,GarageYrBlt_1947.0,GarageYrBlt_1948.0,GarageYrBlt_1949.0,GarageYrBlt_1950.0,GarageYrBlt_1951.0,GarageYrBlt_1952.0,GarageYrBlt_1953.0,GarageYrBlt_1954.0,GarageYrBlt_1955.0,GarageYrBlt_1956.0,GarageYrBlt_1957.0,GarageYrBlt_1958.0,GarageYrBlt_1959.0,GarageYrBlt_1960.0,GarageYrBlt_1961.0,GarageYrBlt_1962.0,GarageYrBlt_1963.0,GarageYrBlt_1964.0,GarageYrBlt_1965.0,GarageYrBlt_1966.0,GarageYrBlt_1967.0,GarageYrBlt_1968.0,GarageYrBlt_1969.0,GarageYrBlt_1970.0,GarageYrBlt_1971.0,GarageYrBlt_1972.0,GarageYrBlt_1973.0,GarageYrBlt_1974.0,GarageYrBlt_1975.0,GarageYrBlt_1976.0,GarageYrBlt_1977.0,GarageYrBlt_1978.0,GarageYrBlt_1979.0,GarageYrBlt_1980.0,GarageYrBlt_1981.0,GarageYrBlt_1982.0,GarageYrBlt_1983.0,GarageYrBlt_1984.0,GarageYrBlt_1985.0,GarageYrBlt_1986.0,GarageYrBlt_1987.0,GarageYrBlt_1988.0,GarageYrBlt_1989.0,GarageYrBlt_1990.0,GarageYrBlt_1991.0,GarageYrBlt_1992.0,GarageYrBlt_1993.0,GarageYrBlt_1994.0,GarageYrBlt_1995.0,GarageYrBlt_1996.0,GarageYrBlt_1997.0,GarageYrBlt_1998.0,GarageYrBlt_1999.0,GarageYrBlt_2000.0,GarageYrBlt_2001.0,GarageYrBlt_2002.0,GarageYrBlt_2003.0,GarageYrBlt_2004.0,GarageYrBlt_2005.0,GarageYrBlt_2006.0,GarageYrBlt_2007.0,GarageYrBlt_2008.0,GarageYrBlt_2009.0,GarageYrBlt_2010.0,GarageFinish_NA,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_NA,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_NA,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,PoolQC_Fa,PoolQC_Gd,PoolQC_NA,PoolQC_TA,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_NA,MiscFeature_NA,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_VWD,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,856,126000,30,42.0,7890,6,6,1939,1950,0.0,238.0,0.0,618.0,856.0,856,0,0,1.0,0.0,1,0,2,1,4,1,2.0,399.0,0,0,0,0,166,0,0,3,2010,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,1049,139500,120,60.0,4235,5,5,1984,1984,149.0,552.0,393.0,104.0,1049.0,1049,0,0,1.0,0.0,2,0,2,1,5,0,1.0,266.0,0,105,0,0,0,0,0,2,2009,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,1001,124900,30,80.0,6060,5,9,1930,2007,0.0,737.0,0.0,100.0,837.0,1001,0,0,0.0,0.0,1,0,2,1,5,0,1.0,216.0,154,0,42,86,0,0,0,11,2007,1,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,1039,114000,70,70.0,8146,4,8,1900,2003,0.0,0.0,0.0,405.0,405.0,717,322,0,0.0,0.0,1,0,2,1,6,0,1.0,281.0,0,0,168,0,111,0,0,5,2009,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,1665,227000,60,64.0,8400,8,6,2001,2001,0.0,643.0,0.0,167.0,810.0,810,855,0,1.0,0.0,2,1,3,1,6,0,2.0,528.0,0,45,0,0,0,0,0,11,2009,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2574,952,121000,30,68.0,8854,6,6,1916,1950,0.0,0.0,0.0,952.0,952.0,952,0,0,0.0,0.0,1,0,2,1,4,1,1.0,192.0,0,98,0,0,40,0,0,5,2009,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2575,1733,139600,20,79.0,13680,3,5,1955,1955,0.0,0.0,0.0,0.0,0.0,1733,0,0,0.0,0.0,2,0,4,1,8,1,2.0,452.0,0,0,0,0,0,0,0,6,2009,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2576,2002,145000,90,50.0,6270,5,6,1949,1950,0.0,284.0,0.0,717.0,1001.0,1001,1001,0,0.0,0.0,2,0,4,2,8,0,3.0,871.0,0,0,0,0,0,0,0,8,2007,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2577,1842,217500,60,65.0,8826,7,5,2000,2000,144.0,841.0,0.0,144.0,985.0,985,857,0,1.0,0.0,2,1,3,1,7,1,2.0,486.0,193,96,0,0,0,0,0,7,2007,0,0,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


<br><br>
## Eliminating Highly Correlated Features
<hr style="height:1px;border:none;color:#333;background-color:#333;" />

In [126]:
# Generate correlation matrix
corr_matrix = dataset_encoded.corr(method="pearson") # pearson? kendall? which is best

In [127]:
# features with high correlation
temp = corr_matrix[(corr_matrix>0.89) & (corr_matrix<1.0)].drop('SalePrice').drop('SalePrice', axis=1)
temp2 = temp[temp[temp.isna().any(axis=1)].sum()>0]
temp2[temp2.index]

Unnamed: 0,Exterior1st_CemntBd,Exterior1st_MetalSd,Exterior1st_VinylSd,Exterior2nd_CmentBd,Exterior2nd_MetalSd,Exterior2nd_VinylSd,SaleType_New,SaleCondition_Partial
Exterior1st_CemntBd,,,,0.978576,,,,
Exterior1st_MetalSd,,,,,0.969285,,,
Exterior1st_VinylSd,,,,,,0.979144,,
Exterior2nd_CmentBd,0.978576,,,,,,,
Exterior2nd_MetalSd,,0.969285,,,,,,
Exterior2nd_VinylSd,,,0.979144,,,,,
SaleType_New,,,,,,,,0.980948
SaleCondition_Partial,,,,,,,0.980948,


In [None]:
# plot the correlation matrix (note: not all of the features fit the plot)
fig=plt.figure(figsize=(20,16))
sns.heatmap(corr_matrix, vmin=-1, vmax=1, center= 0)

<br><br>
## Top 20 Features - Correlation Between Target and Feature
<hr style="height:1px;border:none;color:#333;background-color:#333;" />

In [130]:
# Return the top n features using the correlation between target and feature
abs(corr_matrix['SalePrice']).sort_values(ascending=False).head(20)#.index[1:21]

SalePrice           1.000000
OverallQual         0.790661
GrLivArea           0.719980
TotalBsmtSF         0.652540
1stFlrSF            0.642623
GarageCars          0.638640
GarageArea          0.634706
ExterQual_TA        0.580623
YearBuilt           0.544569
FullBath            0.535175
KitchenQual_TA      0.520063
YearRemodAdd        0.514720
Foundation_PConc    0.507896
MasVnrArea          0.498513
TotRmsAbvGrd        0.490206
Fireplaces          0.488173
FireplaceQu_NA      0.480091
ExterQual_Gd        0.464270
BsmtFinSF1          0.461056
BsmtFinType1_GLQ    0.460768
Name: SalePrice, dtype: float64

<br><br>
## Using Chi-squared to Select Features
<hr style="height:1px;border:none;color:#333;background-color:#333;" />
Note: Since the target value is continues, does this method of feature selection is good in this case? All the categorical variables were encoded but is that enough to yield good results?

In [131]:
X = dataset_encoded.drop('SalePrice',axis=1)  # features
y = dataset_encoded['SalePrice']              # target
X_norm = MinMaxScaler().fit_transform(X)      # Transform features by scaling each feature to a given range.

In [287]:
num_f = 20                                    # Number of features to select
chi_selector = SelectKBest(chi2, k=num_f)     # Create selector: Select features according to the k highest scores.
chi_selector.fit(X_norm, y)                   # Run score function on (X,y) and get appropriate features.
chi_support = chi_selector.get_support()      # Get a mask, or integer index, of the features selected.
chi_features = X.loc[:,chi_support].columns.tolist()  # Get list of selected features

In [288]:
chi_features

['LotShape',
 'Utilities',
 'LandSlope',
 'BldgType',
 'YearRemodAdd',
 'RoofMatl',
 'MasVnrArea',
 'BsmtQual',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'HeatingQC',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtHalfBath',
 'HalfBath',
 'GarageType',
 'GarageFinish',
 '3SsnPorch',
 'ScreenPorch']

<br><br>
## Using Lasso Regression to Select Features
<hr style="height:1px;border:none;color:#333;background-color:#333;" />

In [132]:
num_f = 20 # number of features to select
# SelectFromModel -> Meta-transformer for selecting features based on importance weights.
embeded_lr_selector = SelectFromModel(Lasso(alpha=0.1), max_features=num_f)
# Penalty="l1" -> indicates regularization using L1
embeded_lr_selector.fit(X_norm, y) # Fit the SelectFromModel meta-transformer.

embeded_lr_support = embeded_lr_selector.get_support() # Get a mask, or integer index, of the features selected
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist() # List of features
print(str(len(embeded_lr_feature)), 'selected features')

20 selected features


In [133]:
embeded_lr_feature

['MSZoning_RL',
 'Alley_NA',
 'LotShape_Reg',
 'LandContour_Lvl',
 'LotConfig_Inside',
 'Condition1_Norm',
 'RoofStyle_Gable',
 'MasVnrType_None',
 'ExterQual_TA',
 'ExterCond_TA',
 'BsmtCond_TA',
 'BsmtExposure_No',
 'BsmtFinType2_Unf',
 'Electrical_SBrkr',
 'Functional_Typ',
 'GarageCond_TA',
 'Fence_NA',
 'MiscFeature_NA',
 'SaleType_WD ',
 'SaleCondition_Normal']

<br><br>
## Using Random Forest to Select Features
<hr style="height:1px;border:none;color:#333;background-color:#333;" />

In [295]:
from sklearn.ensemble import RandomForestClassifier
num_f = 20 # number of features to select
# SelectFromModel -> Meta-transformer for selecting features based on importance weights.
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_f)
# n_estimators -> The number of trees in the forest.
embeded_rf_selector.fit(X, y) # Fit the SelectFromModel meta-transformer.

embeded_rf_support = embeded_rf_selector.get_support() # Get a mask, or integer index, of the features selected
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist() # List of features
print(str(len(embeded_rf_feature)), 'selected features')

20 selected features


In [296]:
embeded_rf_feature

['GrLivArea',
 'LotFrontage',
 'LotArea',
 'Neighborhood',
 'YearBuilt',
 'YearRemodAdd',
 'Exterior2nd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'TotRmsAbvGrd',
 'GarageYrBlt',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'MoSold',
 'YrSold']

<br><br>
## Using F Test to Select Features
<hr style="height:1px;border:none;color:#333;background-color:#333;" />
Note: Categorical variables have been encoded, is that enough?

In [310]:
value_f, p_f = f_classif(X_norm, y)
pd.Series(p_f, index=X.columns).sort_values().head(20)

OverallQual     3.640980e-271
GrLivArea       2.544883e-177
Street          1.161482e-171
ExterQual       3.977760e-150
1stFlrSF        1.045746e-112
TotalBsmtSF     3.097936e-111
GarageCars      5.613187e-107
GarageArea       1.012057e-98
FullBath         9.895901e-88
YearBuilt        3.198863e-82
KitchenQual      1.176853e-78
GarageYrBlt      1.060503e-77
MasVnrArea       1.407854e-66
BsmtQual         1.285422e-65
Condition2       1.458450e-54
LotArea          3.135413e-50
YearRemodAdd     1.125181e-49
BsmtFinSF1       5.502965e-48
Utilities        3.203043e-42
TotRmsAbvGrd     6.919798e-39
dtype: float64

<br><br>
## Using Mutual Information to Select Features
<hr style="height:1px;border:none;color:#333;background-color:#333;" />

In [326]:
mutual_scores = mutual_info_classif(X_norm, y)
pd.Series(mutual_scores, index=X.columns).sort_values(ascending=False)

Street           2.913278
MiscFeature      2.638525
SaleType         2.520017
Functional       2.430466
CentralAir       2.400599
Alley            2.368144
Electrical       2.344635
KitchenAbvGr     2.340821
GarageCond       2.288948
LandContour      2.285124
GarageQual       2.249480
BsmtCond         2.226046
Condition1       2.220337
PavedDrive       2.203577
Condition2       2.144201
ExterCond        2.094748
Heating          2.049955
SaleCondition    2.034714
BsmtFinType2     1.938199
FullBath         1.849549
Fence            1.803287
ExterQual        1.708581
MSZoning         1.696119
KitchenQual      1.499920
RoofStyle        1.358972
GarageCars       1.317078
GarageType       1.274130
LotConfig        1.259435
BsmtQual         1.196809
BsmtExposure     1.145280
MasVnrType       1.063341
Foundation       1.038312
LotShape         0.997751
PoolQC           0.925429
FireplaceQu      0.916945
OverallQual      0.877785
GarageFinish     0.865945
OverallCond      0.837387
BedroomAbvGr

In [322]:
X.iloc[:,:]

Unnamed: 0,GrLivArea,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,856,30,5.0,42.0,7890,1.0,1.0,3.0,3.0,0.0,0.0,0.0,21.0,2.0,2.0,0.0,2.0,6,6,1939,1950,1.0,0.0,13.0,14.0,2.0,0.0,3.0,4.0,1.0,5.0,5.0,4.0,5.0,238.0,6.0,0.0,618.0,856.0,1.0,4.0,1.0,3.0,856,0,0,1.0,0.0,1,0,2,1,4.0,4,6.0,1,2.0,5.0,30.0,3.0,2.0,399.0,5.0,5.0,2.0,0,0,0,0,166,0,3.0,4.0,1.0,0,3,2010,9.0,4.0
1,1049,120,5.0,60.0,4235,1.0,1.0,3.0,3.0,0.0,4.0,0.0,7.0,2.0,2.0,4.0,2.0,5,5,1984,1984,1.0,0.0,6.0,6.0,1.0,149.0,2.0,4.0,1.0,2.0,5.0,2.0,2.0,552.0,0.0,393.0,104.0,1049.0,1.0,4.0,1.0,3.0,1049,0,0,1.0,0.0,2,0,2,1,2.0,5,6.0,0,3.0,1.0,74.0,0.0,1.0,266.0,5.0,5.0,2.0,0,105,0,0,0,0,3.0,4.0,1.0,0,2,2009,9.0,4.0
2,1001,30,1.0,80.0,6060,1.0,1.0,3.0,3.0,0.0,4.0,0.0,11.0,2.0,2.0,0.0,2.0,5,9,1930,2007,3.0,0.0,8.0,8.0,2.0,0.0,2.0,4.0,0.0,5.0,5.0,4.0,0.0,737.0,6.0,0.0,100.0,837.0,1.0,0.0,1.0,3.0,1001,0,0,0.0,0.0,1,0,2,1,2.0,5,6.0,0,3.0,5.0,21.0,3.0,1.0,216.0,5.0,4.0,0.0,154,0,42,86,0,0,3.0,4.0,1.0,0,11,2007,9.0,4.0
3,1039,70,5.0,70.0,8146,1.0,1.0,3.0,3.0,0.0,0.0,0.0,20.0,2.0,2.0,0.0,5.0,4,8,1900,2003,1.0,0.0,8.0,8.0,2.0,0.0,2.0,2.0,0.0,1.0,5.0,4.0,6.0,0.0,6.0,0.0,405.0,405.0,1.0,2.0,1.0,3.0,717,322,0,0.0,0.0,1,0,2,1,4.0,6,6.0,0,3.0,5.0,31.0,3.0,1.0,281.0,5.0,5.0,0.0,0,0,168,0,111,0,3.0,4.0,1.0,0,5,2009,9.0,4.0
4,1665,60,5.0,64.0,8400,1.0,1.0,3.0,3.0,0.0,4.0,0.0,17.0,2.0,2.0,0.0,5.0,8,6,2001,2001,1.0,0.0,12.0,13.0,2.0,0.0,2.0,4.0,2.0,2.0,5.0,4.0,2.0,643.0,6.0,0.0,167.0,810.0,1.0,0.0,1.0,3.0,810,855,0,1.0,0.0,2,1,3,1,2.0,6,6.0,0,3.0,1.0,91.0,0.0,2.0,528.0,5.0,5.0,2.0,0,45,0,0,0,0,3.0,4.0,1.0,0,11,2009,9.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2574,952,30,5.0,68.0,8854,1.0,1.0,3.0,3.0,0.0,4.0,0.0,3.0,2.0,2.0,0.0,1.0,6,6,1916,1950,1.0,0.0,13.0,14.0,2.0,0.0,3.0,4.0,0.0,5.0,5.0,4.0,6.0,0.0,6.0,0.0,952.0,952.0,3.0,1.0,0.0,1.0,952,0,0,0.0,0.0,1,0,2,1,1.0,4,6.0,1,2.0,5.0,8.0,3.0,1.0,192.0,1.0,4.0,1.0,0,98,0,0,40,0,3.0,4.0,1.0,0,5,2009,9.0,4.0
2575,1733,20,5.0,79.0,13680,1.0,1.0,0.0,3.0,0.0,1.0,0.0,7.0,2.0,2.0,0.0,2.0,3,5,1955,1955,3.0,0.0,3.0,14.0,2.0,0.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1733,0,0,0.0,0.0,2,0,4,1,4.0,8,3.0,1,2.0,1.0,45.0,3.0,2.0,452.0,5.0,5.0,2.0,0,0,0,0,0,0,3.0,4.0,1.0,0,6,2009,9.0,4.0
2576,2002,90,4.0,50.0,6270,1.0,1.0,3.0,1.0,0.0,4.0,0.0,6.0,2.0,2.0,2.0,5.0,5,6,1949,1950,1.0,0.0,8.0,8.0,2.0,0.0,3.0,4.0,1.0,5.0,5.0,4.0,1.0,284.0,6.0,0.0,717.0,1001.0,1.0,4.0,0.0,0.0,1001,1001,0,0.0,0.0,2,0,4,2,4.0,8,6.0,0,3.0,0.0,39.0,3.0,3.0,871.0,5.0,5.0,2.0,0,0,0,0,0,0,3.0,4.0,1.0,0,8,2007,9.0,4.0
2577,1842,60,5.0,65.0,8826,1.0,1.0,3.0,3.0,0.0,4.0,0.0,5.0,2.0,2.0,0.0,5.0,7,5,2000,2000,1.0,0.0,12.0,13.0,1.0,144.0,2.0,4.0,2.0,2.0,5.0,4.0,2.0,841.0,6.0,0.0,144.0,985.0,1.0,0.0,1.0,3.0,985,857,0,1.0,0.0,2,1,3,1,2.0,7,6.0,1,5.0,1.0,90.0,0.0,2.0,486.0,5.0,5.0,2.0,193,96,0,0,0,0,3.0,4.0,1.0,0,7,2007,9.0,4.0
