In [12]:
from pandas import read_csv
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator
# from pandas.plotting import scatter_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.model_selection import train_test_split # splitting data
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import make_pipeline           # for making pipelines
# from sklearn.neighbors import KNeighborsRegressor    # regressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import AdaBoostRegressor
import warnings
warnings.filterwarnings('ignore')


In [67]:
#read in the en csv files and clean up the column names
all_data = read_csv('data/data.csv')
all_data.isnull().sum()
for col_name in all_data.columns:
    new_col_name = col_name.split(None, 1)[1:]
    new_col_name = ''.join(new_col_name)
    all_data.rename(columns = {col_name:new_col_name[:-33]}, inplace = True)
all_data.columns = all_data.columns.str.replace('-', ' ')
all_data.columns = all_data.columns.str.replace(' ', '_')

all_data = all_data[pd.notnull(all_data['Median_Listing_Price___All_Homes'])]
##### renaming outcome and splitting feature & outcome since we do not want to scale outcomes
all_data = all_data.rename(index=str, columns={"Median_Listing_Price___All_Homes": "outcome"})
outcomes = pd.DataFrame(all_data.outcome)
all_data.drop('outcome', axis=1, inplace=True)
#all_data.dtypes

In [68]:
# Add two new columns that individually record year and month of that row
month = []
year = []
for date in all_data['']:
    month.append(date[0])
    year.append(date[-4:])

all_data['month'] = month
all_data['year'] = year
all_data = all_data.drop('', axis=1)
#all_data = all_data.drop('Zillow_Home_Value_Index_-_Top_Tier_-_Year-Over...', axis=1)

all_data['month'] = all_data['month'].astype('int64')
all_data['year']=all_data['year'].astype('int64')

col_name = all_data.columns

In [69]:
# Handling missing values
from sklearn.impute import SimpleImputer
impu = SimpleImputer()
all_data = pd.DataFrame(impu.fit_transform(all_data))
all_data.columns = col_name
#outcomes.shape
all_data.shape

(109, 88)

In [66]:
############### We don't want to scale the outcome variable, only features
############### Also make sure which features to standardize or normalize
# #standardized

# from sklearn import preprocessing
# # Get column names first
# names = all_data.columns
# # Create the Scaler object
# std_scaler = preprocessing.StandardScaler()
# # Fit your data on the scaler object
# scaled_data = std_scaler.fit_transform(all_data)
# scaled_data = pd.DataFrame(scaled_data, columns=names)

#normalize
from sklearn import preprocessing
# Normalize total_bedrooms column
names = all_data.columns
scaled_data = preprocessing.normalize(all_data)
scaled_data = pd.DataFrame(scaled_data, columns=names)
scaled_data = scaled_data[(scaled_data > 0).all(1)] ############ What is this for? removed some rows, breaks train split
scaled_data.shape

(76, 88)

In [46]:
####### checking variance, for variance selector we need to specify a threshold (e.g. 0.01) to actually remove features
####### or use along with select percentile
scaled_data.var()

Home_Sold_As_Foreclosure___Ratio___All_Homes         1.702905e-12
Inventory_Measure_(Public)                           2.940434e-08
Inventory_Measure___SSA_(Public)                     2.352524e-08
Listings_Price_Cuts_(SA)_All_Homes                   2.483968e-12
Listings_Price_Cuts_(SA)_Condominiums                3.487139e-12
Listings_Price_Cuts_(SA)_Single_Family_Residence     2.259996e-12
Median_Listing_Price___Condo/Co_op                   2.233643e-05
Median_Listing_Price___Duplex/Triplex                9.804376e-04
Median_Listing_Price___Five_Or_More_Bedrooms         1.520013e-03
Median_Listing_Price___Four_Bedrooms                 3.016368e-04
Median_Listing_Price___One_Bedroom                   6.589856e-05
Median_Listing_Price___Single_Family_Residence       3.647415e-05
Median_Listing_Price___Three_Bedrooms                1.762007e-05
Median_Listing_Price___Two_Bedrooms                  2.515699e-05
Median_Listing_Price_Per_Square_Foot___All_Homes     7.148689e-12
Median_Lis

In [8]:
#Removing features with low variance
def variance_threshold_selector(data):
    sel = VarianceThreshold() ###### need to specify threshold value or it won't remove
    sel.fit(data)
    return data[data.columns[sel.get_support(indices=True)]]


var_thrhold = variance_threshold_selector(scaled_data) 


scaled_data.dtypes

Home_Sold_As_Foreclosure___Ratio___All_Homes         float64
Inventory_Measure_(Public)                           float64
Inventory_Measure___SSA_(Public)                     float64
Listings_Price_Cuts_(SA)_All_Homes                   float64
Listings_Price_Cuts_(SA)_Condominiums                float64
Listings_Price_Cuts_(SA)_Single_Family_Residence     float64
Median_Listing_Price___All_Homes                     float64
Median_Listing_Price___Condo/Co_op                   float64
Median_Listing_Price___Duplex/Triplex                float64
Median_Listing_Price___Five_Or_More_Bedrooms         float64
Median_Listing_Price___Four_Bedrooms                 float64
Median_Listing_Price___One_Bedroom                   float64
Median_Listing_Price___Single_Family_Residence       float64
Median_Listing_Price___Three_Bedrooms                float64
Median_Listing_Price___Two_Bedrooms                  float64
Median_Listing_Price_Per_Square_Foot___All_Homes     float64
Median_Listing_Price_Per

In [70]:
#Univariate feature selection
#def univariate_feature_selector(data, k=7):
#    uni_select = SelectKBest(chi2, k)
#    uni_select.fit_transform(data,data.Median_Listing_Price___All_Homes)
#    return data[data.columns[uni_select.get_support(indices=True)]]

#house_uni = univariate_feature_selector(scaled_data, k=7)

#Feature importance based off of gbr
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor

gbrfit = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls').fit(train_x, train_y)
model = SelectFromModel(gbrfit, prefit=True)
#train_x = model.transform(train_x)
print(model.feature_importances_)

NameError: name 'train_x' is not defined

In [65]:
# Split data into test and training data with a test size of 30% (.3)
from sklearn.model_selection import train_test_split # typically done at the start
train_x, test_x, train_y, test_y = train_test_split(
    scaled_data, # features
    outcomes, # outcome
    random_state = 11,
    test_size=0.3 # percentage of data to use as the test set
)

ValueError: Found input variables with inconsistent numbers of samples: [76, 109]