Feature Selection using Backward Elimination

Prerequisite: run Task1_clean.ipynb and Task1_clean_pt2.ipynb before running this notebook.

In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from openFile import OpenCleanFile

In [8]:
# Presumably exposes the cleaned dataset produced by the Task1_clean notebooks
# as `clean.df_housing` -- TODO confirm against openFile.py.
clean = OpenCleanFile()

# Despite the name, this list holds the target ("SALE PRICE") as well as the
# date-derived columns; none of them may appear among the predictors.
date_columns = ["SALE PRICE", "SALE DATE", "SALE_MONTH"]

y = clean.df_housing["SALE PRICE"]

# dtype=int keeps the dummies numeric on every pandas version: since pandas 2.0
# the default is bool, and a frame mixing bool and integer columns reaches
# statsmodels as an object array, which OLS refuses to fit.
X = pd.get_dummies(
    clean.df_housing.drop(columns=date_columns),
    drop_first=True,
    dtype=int,
)
X.head()

Unnamed: 0,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,AGE,NEIGHBORHOOD_ALPHABET CITY,NEIGHBORHOOD_ANNADALE,NEIGHBORHOOD_ARDEN HEIGHTS,NEIGHBORHOOD_ARROCHAR,...,BUILDING CLASS AT TIME OF SALE_W2,BUILDING CLASS AT TIME OF SALE_W3,BUILDING CLASS AT TIME OF SALE_W4,BUILDING CLASS AT TIME OF SALE_W9,BUILDING CLASS AT TIME OF SALE_Y3,BUILDING CLASS AT TIME OF SALE_Z9,BOROUGH_NAME_Brooklyn,BOROUGH_NAME_Manhattan,BOROUGH_NAME_Queens,BOROUGH_NAME_Staten Island
0,5,0,5,1633,6440,122,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,10,0,10,2272,6794,109,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,6,0,6,2369,4615,122,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,8,0,8,1750,4226,102,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,24,0,24,4489,18523,102,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


There are too many one-hot encoded columns.
Many of them have too few entries to support any useful inference.
So drop the categorical columns that generate them and try again.

In [9]:
# Categorical columns whose one-hot encoding produced too many sparse dummies.
delete_columns = ["BUILDING CLASS AT TIME OF SALE", "Tax Block", "BUILDING CLASS CATEGORY", "NEIGHBORHOOD"]
# Despite the name, this list holds the target ("SALE PRICE") as well as the
# date-derived columns; none of them may appear among the predictors.
date_columns = ["SALE PRICE", "SALE DATE", "SALE_MONTH"]

y = clean.df_housing["SALE PRICE"]

# dtype=int keeps the dummies numeric on every pandas version: since pandas 2.0
# the default is bool, and a frame mixing bool and integer columns reaches
# statsmodels as an object array, which OLS refuses to fit.
X = pd.get_dummies(
    clean.df_housing.drop(columns=date_columns + delete_columns),
    drop_first=True,
    dtype=int,
)
X.head()

Unnamed: 0,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,AGE,TAX CLASS AT PRESENT_2,TAX CLASS AT PRESENT_2A,TAX CLASS AT PRESENT_2B,TAX CLASS AT PRESENT_2C,TAX CLASS AT PRESENT_4,TAX CLASS AT TIME OF SALE_2,TAX CLASS AT TIME OF SALE_4,BOROUGH_NAME_Brooklyn,BOROUGH_NAME_Manhattan,BOROUGH_NAME_Queens,BOROUGH_NAME_Staten Island
0,5,0,5,1633,6440,122,0,1,0,0,0,1,0,0,1,0,0
1,10,0,10,2272,6794,109,0,0,1,0,0,1,0,0,1,0,0
2,6,0,6,2369,4615,122,0,1,0,0,0,1,0,0,1,0,0
3,8,0,8,1750,4226,102,0,0,1,0,0,1,0,0,1,0,0
4,24,0,24,4489,18523,102,1,0,0,0,0,1,0,0,1,0,0


In [18]:
def backward_elimination(data, target, significance_level=0.05):
    """Select features by backward elimination on OLS p-values.

    Repeatedly fits ``sm.OLS`` on the remaining features plus an intercept and
    removes the feature with the largest p-value, until every remaining
    feature's p-value is below ``significance_level`` or no features are left.
    Each iteration prints the largest p-value and any feature it removes.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns. Values are cast to float before fitting,
        so every column must be numeric or boolean.
    target : array-like
        Response variable, passed unchanged to ``sm.OLS``.
    significance_level : float, default 0.05
        A feature is removed while its p-value is >= this threshold.

    Returns
    -------
    list
        Labels of the retained columns, in their original order.
    """
    # Cast once, outside the loop: boolean dummy columns mixed with integer
    # columns would otherwise be handed to statsmodels as an object array.
    numeric_data = data.astype(float)
    features = data.columns.tolist()

    while features:
        design = sm.add_constant(numeric_data[features])
        # Remove the intercept by label rather than by position. add_constant
        # does not add "const" when a constant column is already present, so
        # slicing with [1:] would silently hide the first real feature's
        # p-value and make that feature impossible to eliminate.
        p_values = sm.OLS(target, design).fit().pvalues.drop("const", errors="ignore")
        max_p_value = p_values.max()
        print(f"--------------\nmax p value: {max_p_value}")
        if max_p_value >= significance_level:
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
            print(f"Removing feature: {excluded_feature} with p value {max_p_value}")
        else:
            # Every remaining feature is significant at the requested level.
            break

    print(f"The following features have been selected: {features}")
    return features

In [19]:
p_value_threshold = 0.05
# Keep the returned list so later cells can reuse the selection instead of
# copying it by hand from the printed log.
selected_features = backward_elimination(X, y, p_value_threshold)
print("** Completed **")


--------------
max p value: 0.8810923310809354
Removing feature: TAX CLASS AT PRESENT_2C with p value 0.8810923310809354
--------------
max p value: 0.6090645611497741
Removing feature: BOROUGH_NAME_Staten Island with p value 0.6090645611497741
--------------
max p value: 0.6925360041758055
Removing feature: AGE with p value 0.6925360041758055
--------------
max p value: 0.5567525712862234
Removing feature: COMMERCIAL UNITS with p value 0.5567525712862234
--------------
max p value: 0.06045980339198455
Removing feature: TAX CLASS AT PRESENT_4 with p value 0.06045980339198455
--------------
max p value: 0.04358833056177884
The following features have been selected: ['RESIDENTIAL UNITS', 'TOTAL UNITS', 'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'TAX CLASS AT PRESENT_2', 'TAX CLASS AT PRESENT_2A', 'TAX CLASS AT PRESENT_2B', 'TAX CLASS AT TIME OF SALE_2', 'TAX CLASS AT TIME OF SALE_4', 'BOROUGH_NAME_Brooklyn', 'BOROUGH_NAME_Manhattan', 'BOROUGH_NAME_Queens']
** Completed **


This ends the backward elimination.
All the remaining columns have p-values below 0.05.
They are the following:
['RESIDENTIAL UNITS', 'TOTAL UNITS', 'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'TAX CLASS AT PRESENT_2', 'TAX CLASS AT PRESENT_2A', 'TAX CLASS AT PRESENT_2B', 'TAX CLASS AT TIME OF SALE_2', 'TAX CLASS AT TIME OF SALE_4', 'BOROUGH_NAME_Brooklyn', 'BOROUGH_NAME_Manhattan', 'BOROUGH_NAME_Queens']