Feature Selection using Backward Elimination

In [9]:
import pandas as pd
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import statsmodels.api as sm 
from openFile import OpenCleanFile

In [11]:
# Load the housing data through the project's OpenCleanFile helper
# (presumably already cleaned — confirm against openFile.py).
clean = OpenCleanFile()

# Columns excluded from the predictors.
# NOTE(review): despite its name, date_columns also contains the target
# column "SALE PRICE", not only date fields.
date_columns = ["SALE PRICE", "SALE DATE", "SALE_MONTH"]

# Target vector: the sale price.
y = clean.df_housing["SALE PRICE"]

# Predictors: everything else, one-hot encoded; drop_first=True omits the
# first level of each categorical so the dummies are not perfectly collinear.
X = pd.get_dummies(clean.df_housing.drop(columns=date_columns), drop_first=True)
X.head()

Unnamed: 0,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,AGE,NEIGHBORHOOD_ALPHABET CITY,NEIGHBORHOOD_ANNADALE,NEIGHBORHOOD_ARDEN HEIGHTS,NEIGHBORHOOD_ARROCHAR,...,BUILDING CLASS AT TIME OF SALE_W2,BUILDING CLASS AT TIME OF SALE_W3,BUILDING CLASS AT TIME OF SALE_W4,BUILDING CLASS AT TIME OF SALE_W9,BUILDING CLASS AT TIME OF SALE_Y3,BUILDING CLASS AT TIME OF SALE_Z9,BOROUGH_NAME_Brooklyn,BOROUGH_NAME_Manhattan,BOROUGH_NAME_Queens,BOROUGH_NAME_Staten Island
0,5,0,5,1633,6440,122,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,10,0,10,2272,6794,109,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,6,0,6,2369,4615,122,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,8,0,8,1750,4226,102,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,24,0,24,4489,18523,102,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


There are too many one hot encoded columns. 
The number of entries for many of them will be too small to make any useful inferences from.
These are the columns that returned "not a number" (NaN) in the ANOVA tests in the previous script.
So remove these columns and try again.

In [12]:
# Categorical columns left out of the predictors before one-hot encoding
# (see the markdown note above this cell for the reason).
delete_columns = ["BUILDING CLASS AT TIME OF SALE", "Tax Block", "BUILDING CLASS CATEGORY", "NEIGHBORHOOD"]

# NOTE(review): despite its name, date_columns also contains the target
# column "SALE PRICE", not only date fields.
date_columns = ["SALE PRICE", "SALE DATE", "SALE_MONTH"]

# Target vector: the sale price.
y = clean.df_housing["SALE PRICE"]

# Predictors: remove both groups of columns in a single drop, then one-hot
# encode what is left (first level of each categorical omitted).
X = pd.get_dummies(
    clean.df_housing.drop(columns=date_columns + delete_columns),
    drop_first=True,
)
X.head()

Unnamed: 0,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,AGE,TAX CLASS AT PRESENT_2,TAX CLASS AT PRESENT_2A,TAX CLASS AT PRESENT_2B,TAX CLASS AT PRESENT_2C,TAX CLASS AT PRESENT_4,TAX CLASS AT TIME OF SALE_2,TAX CLASS AT TIME OF SALE_4,BOROUGH_NAME_Brooklyn,BOROUGH_NAME_Manhattan,BOROUGH_NAME_Queens,BOROUGH_NAME_Staten Island
0,5,0,5,1633,6440,122,0,1,0,0,0,1,0,0,1,0,0
1,10,0,10,2272,6794,109,0,0,1,0,0,1,0,0,1,0,0
2,6,0,6,2369,4615,122,0,1,0,0,0,1,0,0,1,0,0
3,8,0,8,1750,4226,102,0,0,1,0,0,1,0,0,1,0,0
4,24,0,24,4489,18523,102,1,0,0,0,0,1,0,0,1,0,0


In [13]:
# Baseline fit for the backward elimination: regress y (SALE PRICE) on every
# column currently in X.
# NOTE(review): X is passed as-is with no constant column, so the model has
# no intercept (the summary reports an "uncentered" R-squared) — confirm this
# is intended before interpreting the coefficients.
regressor_OLS = sm.OLS(y, X).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.444
Model:,OLS,Adj. R-squared (uncentered):,0.443
Method:,Least Squares,F-statistic:,398.2
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,15:33:33,Log-Likelihood:,-148310.0
No. Observations:,8482,AIC:,296600.0
Df Residuals:,8465,BIC:,296800.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RESIDENTIAL UNITS,-9.086e+05,1.37e+06,-0.665,0.506,-3.59e+06,1.77e+06
COMMERCIAL UNITS,-8.096e+05,1.37e+06,-0.593,0.553,-3.49e+06,1.87e+06
TOTAL UNITS,8.234e+05,1.37e+06,0.603,0.547,-1.85e+06,3.5e+06
LAND SQUARE FEET,-228.7189,16.234,-14.089,0.000,-260.542,-196.896
GROSS SQUARE FEET,292.1042,5.894,49.561,0.000,280.551,303.658
AGE,447.7406,2413.158,0.186,0.853,-4282.638,5178.119
TAX CLASS AT PRESENT_2,9.096e+06,3.9e+06,2.331,0.020,1.45e+06,1.67e+07
TAX CLASS AT PRESENT_2A,6.976e+06,3.89e+06,1.792,0.073,-6.54e+05,1.46e+07
TAX CLASS AT PRESENT_2B,7.628e+06,3.9e+06,1.955,0.051,-2.18e+04,1.53e+07

0,1,2,3
Omnibus:,10993.738,Durbin-Watson:,1.539
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11275728.229
Skew:,6.54,Prob(JB):,0.0
Kurtosis:,181.14,Cond. No.,3700000.0


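The GROSS SQUARE FEET column was judged to have the highest p value, so remove that.
(Review note: the summary above lists a p value of 0.000 for GROSS SQUARE FEET; among the rows shown, AGE has the highest p value at 0.853. The basis for this step should be re-checked.)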

In [17]:
# Backward-elimination step: remove GROSS SQUARE FEET from the predictors and
# refit the OLS model on the remaining columns.
# X is rebound instead of mutated in place, and errors="ignore" makes the drop
# a no-op when the column is already gone, so re-running this cell no longer
# raises KeyError.
X = X.drop(columns=["GROSS SQUARE FEET"], errors="ignore")
regressor_OLS = sm.OLS(endog=y, exog=X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.283
Model:,OLS,Adj. R-squared (uncentered):,0.282
Method:,Least Squares,F-statistic:,208.9
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,16:01:17,Log-Likelihood:,-149390.0
No. Observations:,8482,AIC:,298800.0
Df Residuals:,8466,BIC:,298900.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RESIDENTIAL UNITS,-2.614e+06,1.55e+06,-1.686,0.092,-5.65e+06,4.25e+05
COMMERCIAL UNITS,-2.664e+06,1.55e+06,-1.718,0.086,-5.7e+06,3.75e+05
TOTAL UNITS,2.683e+06,1.55e+06,1.730,0.084,-3.56e+05,5.72e+06
LAND SQUARE FEET,127.0657,16.538,7.683,0.000,94.648,159.484
AGE,-7935.9983,2734.097,-2.903,0.004,-1.33e+04,-2576.500
TAX CLASS AT PRESENT_2,1.031e+07,4.43e+06,2.326,0.020,1.62e+06,1.9e+07
TAX CLASS AT PRESENT_2A,7.547e+06,4.42e+06,1.707,0.088,-1.12e+06,1.62e+07
TAX CLASS AT PRESENT_2B,7.735e+06,4.43e+06,1.745,0.081,-9.53e+05,1.64e+07
TAX CLASS AT PRESENT_2C,-2.84e+06,1.17e+07,-0.244,0.808,-2.57e+07,2e+07

0,1,2,3
Omnibus:,14393.344,Durbin-Watson:,1.482
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16075847.294
Skew:,11.586,Prob(JB):,0.0
Kurtosis:,215.014,Cond. No.,1200000.0


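The LAND SQUARE FEET column was judged to have the highest p value, so remove that.
(Review note: the summary above lists a p value of 0.000 for LAND SQUARE FEET; among the rows shown, TAX CLASS AT PRESENT_2C has the highest p value at 0.808. The basis for this step should be re-checked.)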

In [18]:
# Backward-elimination step: remove LAND SQUARE FEET from the predictors and
# refit the OLS model on the remaining columns.
# X is rebound instead of mutated in place, and errors="ignore" makes the drop
# a no-op when the column is already gone, so re-running this cell no longer
# raises KeyError.
X = X.drop(columns=["LAND SQUARE FEET"], errors="ignore")
regressor_OLS = sm.OLS(endog=y, exog=X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.278
Model:,OLS,Adj. R-squared (uncentered):,0.277
Method:,Least Squares,F-statistic:,217.4
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,16:03:08,Log-Likelihood:,-149420.0
No. Observations:,8482,AIC:,298900.0
Df Residuals:,8467,BIC:,299000.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RESIDENTIAL UNITS,-2.764e+06,1.56e+06,-1.777,0.076,-5.81e+06,2.85e+05
COMMERCIAL UNITS,-2.851e+06,1.56e+06,-1.833,0.067,-5.9e+06,1.98e+05
TOTAL UNITS,2.872e+06,1.56e+06,1.846,0.065,-1.77e+05,5.92e+06
AGE,-6532.2618,2737.319,-2.386,0.017,-1.19e+04,-1166.448
TAX CLASS AT PRESENT_2,9.989e+06,4.45e+06,2.246,0.025,1.27e+06,1.87e+07
TAX CLASS AT PRESENT_2A,7.599e+06,4.44e+06,1.713,0.087,-1.1e+06,1.63e+07
TAX CLASS AT PRESENT_2B,7.765e+06,4.45e+06,1.746,0.081,-9.52e+05,1.65e+07
TAX CLASS AT PRESENT_2C,-2.697e+06,1.17e+07,-0.230,0.818,-2.56e+07,2.02e+07
TAX CLASS AT PRESENT_4,-4.061e+06,5.88e+06,-0.691,0.489,-1.56e+07,7.46e+06

0,1,2,3
Omnibus:,14427.176,Durbin-Watson:,1.502
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16568629.319
Skew:,11.632,Prob(JB):,0.0
Kurtosis:,218.267,Cond. No.,10200.0


The BOROUGH_NAME_Manhattan column has the highest p value so remove that.

In [19]:
# Backward-elimination step: remove the BOROUGH_NAME_Manhattan dummy from the
# predictors and refit the OLS model on the remaining columns.
# X is rebound instead of mutated in place, and errors="ignore" makes the drop
# a no-op when the column is already gone, so re-running this cell no longer
# raises KeyError.
X = X.drop(columns=["BOROUGH_NAME_Manhattan"], errors="ignore")
regressor_OLS = sm.OLS(endog=y, exog=X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.241
Model:,OLS,Adj. R-squared (uncentered):,0.24
Method:,Least Squares,F-statistic:,191.9
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,16:05:05,Log-Likelihood:,-149630.0
No. Observations:,8482,AIC:,299300.0
Df Residuals:,8468,BIC:,299400.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RESIDENTIAL UNITS,-2.193e+06,1.59e+06,-1.375,0.169,-5.32e+06,9.33e+05
COMMERCIAL UNITS,-2.283e+06,1.59e+06,-1.431,0.152,-5.41e+06,8.44e+05
TOTAL UNITS,2.303e+06,1.59e+06,1.444,0.149,-8.23e+05,5.43e+06
AGE,2.149e+04,2446.863,8.784,0.000,1.67e+04,2.63e+04
TAX CLASS AT PRESENT_2,1.148e+07,4.56e+06,2.518,0.012,2.54e+06,2.04e+07
TAX CLASS AT PRESENT_2A,7.047e+06,4.55e+06,1.549,0.121,-1.87e+06,1.6e+07
TAX CLASS AT PRESENT_2B,8.848e+06,4.56e+06,1.940,0.052,-9.08e+04,1.78e+07
TAX CLASS AT PRESENT_2C,3.247e+06,1.2e+07,0.271,0.787,-2.03e+07,2.68e+07
TAX CLASS AT PRESENT_4,-6.502e+06,6.02e+06,-1.080,0.280,-1.83e+07,5.3e+06

0,1,2,3
Omnibus:,14402.073,Durbin-Watson:,1.449
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15821804.918
Skew:,11.61,Prob(JB):,0.0
Kurtosis:,213.307,Cond. No.,10200.0


The TAX CLASS AT PRESENT_2C column has the highest p value so remove that.

In [20]:
# Backward-elimination step: remove the TAX CLASS AT PRESENT_2C dummy from the
# predictors and refit the OLS model on the remaining columns.
# X is rebound instead of mutated in place, and errors="ignore" makes the drop
# a no-op when the column is already gone, so re-running this cell no longer
# raises KeyError.
X = X.drop(columns=["TAX CLASS AT PRESENT_2C"], errors="ignore")
regressor_OLS = sm.OLS(endog=y, exog=X).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.241
Model:,OLS,Adj. R-squared (uncentered):,0.24
Method:,Least Squares,F-statistic:,206.7
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,16:07:54,Log-Likelihood:,-149630.0
No. Observations:,8482,AIC:,299300.0
Df Residuals:,8469,BIC:,299400.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RESIDENTIAL UNITS,-2.198e+06,1.59e+06,-1.378,0.168,-5.32e+06,9.28e+05
COMMERCIAL UNITS,-2.287e+06,1.59e+06,-1.434,0.151,-5.41e+06,8.38e+05
TOTAL UNITS,2.308e+06,1.59e+06,1.447,0.148,-8.18e+05,5.43e+06
AGE,2.15e+04,2446.570,8.788,0.000,1.67e+04,2.63e+04
TAX CLASS AT PRESENT_2,1.102e+07,4.22e+06,2.608,0.009,2.73e+06,1.93e+07
TAX CLASS AT PRESENT_2A,6.583e+06,4.21e+06,1.563,0.118,-1.67e+06,1.48e+07
TAX CLASS AT PRESENT_2B,8.385e+06,4.23e+06,1.984,0.047,9.94e+04,1.67e+07
TAX CLASS AT PRESENT_4,-6.736e+06,5.96e+06,-1.130,0.259,-1.84e+07,4.95e+06
TAX CLASS AT TIME OF SALE_2,-6.203e+06,4.2e+06,-1.476,0.140,-1.44e+07,2.03e+06

0,1,2,3
Omnibus:,14401.974,Durbin-Watson:,1.449
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15821197.07
Skew:,11.61,Prob(JB):,0.0
Kurtosis:,213.303,Cond. No.,7630.0


The TAX CLASS AT PRESENT_4 column has the highest p value so remove that.

In [21]:
# Backward-elimination step: remove the TAX CLASS AT PRESENT_4 dummy from the
# predictors and refit the OLS model on the remaining columns.
# X is rebound instead of mutated in place, and errors="ignore" makes the drop
# a no-op when the column is already gone, so re-running this cell no longer
# raises KeyError.
X = X.drop(columns=["TAX CLASS AT PRESENT_4"], errors="ignore")
regressor_OLS = sm.OLS(endog=y, exog=X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.241
Model:,OLS,Adj. R-squared (uncentered):,0.24
Method:,Least Squares,F-statistic:,223.8
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,16:09:14,Log-Likelihood:,-149630.0
No. Observations:,8482,AIC:,299300.0
Df Residuals:,8470,BIC:,299400.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RESIDENTIAL UNITS,-2.067e+06,1.59e+06,-1.300,0.194,-5.18e+06,1.05e+06
COMMERCIAL UNITS,-2.157e+06,1.59e+06,-1.356,0.175,-5.27e+06,9.61e+05
TOTAL UNITS,2.177e+06,1.59e+06,1.369,0.171,-9.4e+05,5.29e+06
AGE,2.151e+04,2446.600,8.791,0.000,1.67e+04,2.63e+04
TAX CLASS AT PRESENT_2,1.271e+07,3.95e+06,3.216,0.001,4.96e+06,2.04e+07
TAX CLASS AT PRESENT_2A,8.264e+06,3.94e+06,2.097,0.036,5.37e+05,1.6e+07
TAX CLASS AT PRESENT_2B,1.006e+07,3.96e+06,2.542,0.011,2.3e+06,1.78e+07
TAX CLASS AT TIME OF SALE_2,-7.883e+06,3.93e+06,-2.005,0.045,-1.56e+07,-1.78e+05
TAX CLASS AT TIME OF SALE_4,1.424e+07,4.51e+05,31.555,0.000,1.34e+07,1.51e+07

0,1,2,3
Omnibus:,14399.775,Durbin-Watson:,1.449
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15801113.591
Skew:,11.607,Prob(JB):,0.0
Kurtosis:,213.168,Cond. No.,6120.0


The RESIDENTIAL UNITS column has the highest p value so remove that.

In [22]:
# Backward-elimination step: remove RESIDENTIAL UNITS from the predictors and
# refit the OLS model on the remaining columns.
# X is rebound instead of mutated in place, and errors="ignore" makes the drop
# a no-op when the column is already gone, so re-running this cell no longer
# raises KeyError.
X = X.drop(columns=["RESIDENTIAL UNITS"], errors="ignore")
regressor_OLS = sm.OLS(endog=y, exog=X).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.241
Model:,OLS,Adj. R-squared (uncentered):,0.24
Method:,Least Squares,F-statistic:,244.0
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,16:10:31,Log-Likelihood:,-149630.0
No. Observations:,8482,AIC:,299300.0
Df Residuals:,8471,BIC:,299400.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
COMMERCIAL UNITS,-8.955e+04,7002.332,-12.788,0.000,-1.03e+05,-7.58e+04
TOTAL UNITS,1.103e+05,5129.508,21.498,0.000,1e+05,1.2e+05
AGE,2.147e+04,2446.543,8.776,0.000,1.67e+04,2.63e+04
TAX CLASS AT PRESENT_2,1.271e+07,3.95e+06,3.218,0.001,4.97e+06,2.05e+07
TAX CLASS AT PRESENT_2A,8.267e+06,3.94e+06,2.097,0.036,5.4e+05,1.6e+07
TAX CLASS AT PRESENT_2B,1.007e+07,3.96e+06,2.543,0.011,2.31e+06,1.78e+07
TAX CLASS AT TIME OF SALE_2,-7.886e+06,3.93e+06,-2.006,0.045,-1.56e+07,-1.8e+05
TAX CLASS AT TIME OF SALE_4,1.42e+07,4.5e+05,31.538,0.000,1.33e+07,1.51e+07
BOROUGH_NAME_Brooklyn,-1.492e+06,2.76e+05,-5.405,0.000,-2.03e+06,-9.51e+05

0,1,2,3
Omnibus:,14402.355,Durbin-Watson:,1.449
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15811294.397
Skew:,11.611,Prob(JB):,0.0
Kurtosis:,213.236,Cond. No.,6100.0


The BOROUGH_NAME_Staten Island column has the highest p value so remove that.


In [23]:
# Backward-elimination step: remove the BOROUGH_NAME_Staten Island dummy from
# the predictors and refit the OLS model on the remaining columns.
# X is rebound instead of mutated in place, and errors="ignore" makes the drop
# a no-op when the column is already gone, so re-running this cell no longer
# raises KeyError.
X = X.drop(columns=["BOROUGH_NAME_Staten Island"], errors="ignore")
regressor_OLS = sm.OLS(endog=y, exog=X).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,SALE PRICE,R-squared (uncentered):,0.24
Model:,OLS,Adj. R-squared (uncentered):,0.239
Method:,Least Squares,F-statistic:,267.9
Date:,"Mon, 05 Dec 2022",Prob (F-statistic):,0.0
Time:,16:11:50,Log-Likelihood:,-149630.0
No. Observations:,8482,AIC:,299300.0
Df Residuals:,8472,BIC:,299400.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
COMMERCIAL UNITS,-8.942e+04,7002.966,-12.768,0.000,-1.03e+05,-7.57e+04
TOTAL UNITS,1.102e+05,5129.857,21.474,0.000,1e+05,1.2e+05
AGE,2.078e+04,2418.115,8.594,0.000,1.6e+04,2.55e+04
TAX CLASS AT PRESENT_2,1.272e+07,3.95e+06,3.220,0.001,4.98e+06,2.05e+07
TAX CLASS AT PRESENT_2A,8.245e+06,3.94e+06,2.091,0.037,5.17e+05,1.6e+07
TAX CLASS AT PRESENT_2B,1.006e+07,3.96e+06,2.540,0.011,2.3e+06,1.78e+07
TAX CLASS AT TIME OF SALE_2,-7.847e+06,3.93e+06,-1.996,0.046,-1.56e+07,-1.4e+05
TAX CLASS AT TIME OF SALE_4,1.421e+07,4.5e+05,31.539,0.000,1.33e+07,1.51e+07
BOROUGH_NAME_Brooklyn,-1.431e+06,2.74e+05,-5.222,0.000,-1.97e+06,-8.94e+05

0,1,2,3
Omnibus:,14407.945,Durbin-Watson:,1.447
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15827428.911
Skew:,11.621,Prob(JB):,0.0
Kurtosis:,213.342,Cond. No.,6100.0


This ends the backward elimination.
All the remaining columns have P values less than 0.05.
They are the following:
COMMERCIAL UNITS, TOTAL UNITS, AGE, 
TAX CLASS AT PRESENT_2, TAX CLASS AT PRESENT_2A, TAX CLASS AT PRESENT_2B, 
TAX CLASS AT TIME OF SALE_2, TAX CLASS AT TIME OF SALE_4, 
BOROUGH_NAME_Brooklyn, BOROUGH_NAME_Queens