In [1]:
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.3f}'.format
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.formula.api import ols
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

import model
import split_scale

In [2]:
zillow = pd.read_csv('zillow_FE.csv')
zillow = zillow.drop(columns=['county', 'state'])

In [3]:
train, test = split_scale.split_my_data(zillow, 0.8)
train.head()

Unnamed: 0.1,Unnamed: 0,index,parcelid,neighborhood,value,baths,beds,home_sf,is_extra,tax_rate,size_ratio,factor,sq_feet_proxy6
88,88,137,17129832,0,425115,3.0,6,2901,0,0.011,0.395,309.836,9.363
4837,4837,7581,17227544,0,614179,2.0,4,2402,1,0.012,0.293,309.836,7.752
3411,3411,5359,11346241,0,466643,3.0,4,2557,1,0.015,0.414,309.836,8.253
4426,4426,6933,11216731,0,202444,2.0,3,1503,0,0.021,0.193,309.836,4.851
3969,3969,6233,13063534,0,179911,1.0,3,1092,0,0.013,0.181,309.836,3.524


In [4]:
# Try out each of the scale functions, find the one that works best
std_scaler, train_scaled_std, test_scaled_std = split_scale.standard_scaler(train, test)
unf_scaler, train_scaled_unf, test_scaled_unf = split_scale.uniform_scaler(train, test)
gs_scaler, train_scaled_gs, test_scaled_gs = split_scale.gaussian_scaler(train, test)
mm_scaler, train_scaled_mm, test_scaled_mm = split_scale.min_max_scaler(train, test)
iqr_scaler, train_scaled_iqr, test_scaled_iqr = split_scale.iqr_robust_scaler(train, test)

In [5]:
# set up train features
y = train[['value']]
X_std = train_scaled_std[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]
X_unf = train_scaled_unf[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]
X_gs = train_scaled_gs[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]
X_mm = train_scaled_mm[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]
X_iqr = train_scaled_iqr[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]

In [6]:
# Create predictions dataframe
predictions_std = pd.DataFrame({
    'actual': train.value,
    'baseline': train.value.mean()
})
predictions_unf = pd.DataFrame({
    'actual': train.value,
    'baseline': train.value.mean()
})
predictions_gs = pd.DataFrame({
    'actual': train.value,
    'baseline': train.value.mean()
})
predictions_mm = pd.DataFrame({
    'actual': train.value,
    'baseline': train.value.mean()
})
predictions_iqr = pd.DataFrame({
    'actual': train.value,
    'baseline': train.value.mean()
})

In [7]:
# MVP model for comparison
X = train[['baths', 'beds', 'home_sf']]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
lm_poly = LinearRegression(normalize=False).fit(X_poly, y)

predictions_std['MVP'] = lm_poly.predict(X_poly)
predictions_unf['MVP'] = lm_poly.predict(X_poly)
predictions_gs['MVP'] = lm_poly.predict(X_poly)
predictions_mm['MVP'] = lm_poly.predict(X_poly)
predictions_iqr['MVP'] = lm_poly.predict(X_poly)

In [8]:
# Compare all features
poly = PolynomialFeatures(degree=6)

X_poly_std = poly.fit_transform(X_std)
lm_all_std = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_std, y)
predictions_std['all'] = lm_all_std.predict(X_std)

X_poly_unf = poly.fit_transform(X_unf)
lm_all_unf = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_unf, y)
predictions_unf['all'] = lm_all_unf.predict(X_unf)

X_poly_gs = poly.fit_transform(X_gs)
lm_all_gs = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_gs, y)
predictions_gs['all'] = lm_all_gs.predict(X_gs)

X_poly_mm = poly.fit_transform(X_mm)
lm_all_mm = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_mm, y)
predictions_mm['all'] = lm_all_mm.predict(X_mm)

X_poly_iqr = poly.fit_transform(X_iqr)
lm_all_iqr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_iqr, y)
predictions_iqr['all'] = lm_all_iqr.predict(X_iqr)

In [9]:
# use RFE to select top 2
model.select_rfe(X_std, y, 2), model.select_rfe(X_unf, y, 2), model.select_rfe(X_gs, y, 2), model.select_rfe(X_mm, y, 2), model.select_rfe(X_iqr, y, 2)

['beds', 'home_sf']
['baths', 'home_sf']
['home_sf', 'is_extra']
['beds', 'home_sf']
['beds', 'home_sf']


(None, None, None, None, None)

In [10]:
X_rfe2_std = X_std[['beds', 'home_sf']]
X_rfe2_unf = X_unf[['baths', 'home_sf']]
X_rfe2_gs = X_gs[['home_sf', 'is_extra']]
X_rfe2_mm = X_mm[['beds', 'home_sf']]
X_rfe2_iqr = X_iqr[['beds', 'home_sf']]

In [11]:
# Train rfe2 models
poly = PolynomialFeatures(degree=2)

X_poly_std = poly.fit_transform(X_rfe2_std)
lm_rfe_std = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_std, y)
predictions_std['rfe2'] = lm_rfe_std.predict(X_poly_std)

X_poly_unf = poly.fit_transform(X_rfe2_unf)
lm_rfe_unf = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_unf, y)
predictions_unf['rfe2'] = lm_rfe_unf.predict(X_poly_unf)

X_poly_gs = poly.fit_transform(X_rfe2_gs)
lm_rfe_gs = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_gs, y)
predictions_gs['rfe2'] = lm_rfe_gs.predict(X_poly_gs)

X_poly_mm = poly.fit_transform(X_rfe2_mm)
lm_rfe_mm = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_mm, y)
predictions_mm['rfe2'] = lm_rfe_mm.predict(X_poly_mm)

X_poly_iqr = poly.fit_transform(X_rfe2_iqr)
lm_rfe_iqr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_iqr, y)
predictions_iqr['rfe2'] = lm_rfe_iqr.predict(X_poly_iqr)

In [12]:
# use RFE to select top 3
model.select_rfe(X_std, y, 3), model.select_rfe(X_unf, y, 3), model.select_rfe(X_gs, y, 3), model.select_rfe(X_mm, y, 3), model.select_rfe(X_iqr, y, 3)

['beds', 'home_sf', 'sq_feet_proxy6']
['baths', 'home_sf', 'sq_feet_proxy6']
['home_sf', 'is_extra', 'size_ratio']
['baths', 'beds', 'home_sf']
['beds', 'home_sf', 'sq_feet_proxy6']


(None, None, None, None, None)

In [13]:
X_rfe3_std = X_std[['beds', 'home_sf', 'sq_feet_proxy6']]
X_rfe3_unf = X_unf[['baths', 'home_sf', 'sq_feet_proxy6']]
X_rfe3_gs = X_gs[['home_sf', 'is_extra', 'size_ratio']]
X_rfe3_mm = X_mm[['baths', 'beds', 'home_sf']]
X_rfe3_iqr = X_iqr[['beds', 'home_sf', 'sq_feet_proxy6']]

In [14]:
# Train rfe3 models
poly = PolynomialFeatures(degree=3)

X_poly_std = poly.fit_transform(X_rfe3_std)
lm_rfe_std = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_std, y)
predictions_std['rfe3'] = lm_rfe_std.predict(X_poly_std)

X_poly_unf = poly.fit_transform(X_rfe3_unf)
lm_rfe_unf = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_unf, y)
predictions_unf['rfe3'] = lm_rfe_unf.predict(X_poly_unf)

X_poly_gs = poly.fit_transform(X_rfe3_gs)
lm_rfe_gs = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_gs, y)
predictions_gs['rfe3'] = lm_rfe_gs.predict(X_poly_gs)

X_poly_mm = poly.fit_transform(X_rfe3_mm)
lm_rfe_mm = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_mm, y)
predictions_mm['rfe3'] = lm_rfe_mm.predict(X_poly_mm)

X_poly_iqr = poly.fit_transform(X_rfe3_iqr)
lm_rfe_iqr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_iqr, y)
predictions_iqr['rfe3'] = lm_rfe_iqr.predict(X_poly_iqr)

In [15]:
# use RFE to select top 4
model.select_rfe(X_std, y, 4), model.select_rfe(X_unf, y, 4), model.select_rfe(X_gs, y, 4), model.select_rfe(X_mm, y, 4), model.select_rfe(X_iqr, y, 4)

['baths', 'beds', 'home_sf', 'sq_feet_proxy6']
['baths', 'beds', 'home_sf', 'sq_feet_proxy6']
['baths', 'home_sf', 'is_extra', 'size_ratio']
['baths', 'beds', 'home_sf', 'sq_feet_proxy6']
['baths', 'beds', 'home_sf', 'sq_feet_proxy6']


(None, None, None, None, None)

In [16]:
X_rfe4_std = X_std[['baths', 'beds', 'home_sf', 'sq_feet_proxy6']]
X_rfe4_unf = X_unf[['baths', 'beds', 'home_sf', 'sq_feet_proxy6']]
X_rfe4_gs = X_gs[['baths', 'home_sf', 'is_extra', 'size_ratio']]
X_rfe4_mm = X_mm[['baths', 'beds', 'home_sf', 'sq_feet_proxy6']]
X_rfe4_iqr = X_iqr[['baths', 'beds', 'home_sf', 'sq_feet_proxy6']]

In [17]:
# Train rfe4 models
poly = PolynomialFeatures(degree=4)

X_poly_std = poly.fit_transform(X_rfe4_std)
lm_rfe_std = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_std, y)
predictions_std['rfe4'] = lm_rfe_std.predict(X_poly_std)

X_poly_unf = poly.fit_transform(X_rfe4_unf)
lm_rfe_unf = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_unf, y)
predictions_unf['rfe4'] = lm_rfe_unf.predict(X_poly_unf)

X_poly_gs = poly.fit_transform(X_rfe4_gs)
lm_rfe_gs = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_gs, y)
predictions_gs['rfe4'] = lm_rfe_gs.predict(X_poly_gs)

X_poly_mm = poly.fit_transform(X_rfe4_mm)
lm_rfe_mm = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_mm, y)
predictions_mm['rfe4'] = lm_rfe_mm.predict(X_poly_mm)

X_poly_iqr = poly.fit_transform(X_rfe4_iqr)
lm_rfe_iqr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_iqr, y)
predictions_iqr['rfe4'] = lm_rfe_iqr.predict(X_poly_iqr)

In [18]:
# use RFE to select top 5
model.select_rfe(X_std, y, 5), model.select_rfe(X_unf, y, 5), model.select_rfe(X_gs, y, 5), model.select_rfe(X_mm, y, 5), model.select_rfe(X_iqr, y, 5)

['baths', 'beds', 'home_sf', 'size_ratio', 'sq_feet_proxy6']
['baths', 'beds', 'home_sf', 'is_extra', 'sq_feet_proxy6']
['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio']
['baths', 'beds', 'home_sf', 'size_ratio', 'sq_feet_proxy6']
['baths', 'beds', 'home_sf', 'size_ratio', 'sq_feet_proxy6']


(None, None, None, None, None)

In [19]:
X_rfe5_std = X_std[['baths', 'beds', 'home_sf', 'size_ratio', 'sq_feet_proxy6']]
X_rfe5_unf = X_unf[['baths', 'beds', 'home_sf', 'is_extra', 'sq_feet_proxy6']]
X_rfe5_gs = X_gs[['baths', 'beds', 'home_sf', 'is_extra', 'sq_feet_proxy6']]
X_rfe5_mm = X_mm[['baths', 'beds', 'home_sf', 'size_ratio', 'sq_feet_proxy6']]
X_rfe5_iqr = X_iqr[['baths', 'beds', 'home_sf', 'size_ratio', 'sq_feet_proxy6']]

In [20]:
# Train rfe5 models
poly = PolynomialFeatures(degree=5)

X_poly_std = poly.fit_transform(X_rfe5_std)
lm_rfe_std = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_std, y)
predictions_std['rfe5'] = lm_rfe_std.predict(X_poly_std)

X_poly_unf = poly.fit_transform(X_rfe5_unf)
lm_rfe_unf = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_unf, y)
predictions_unf['rfe5'] = lm_rfe_unf.predict(X_poly_unf)

X_poly_gs = poly.fit_transform(X_rfe5_gs)
lm_rfe_gs = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_gs, y)
predictions_gs['rfe5'] = lm_rfe_gs.predict(X_poly_gs)

X_poly_mm = poly.fit_transform(X_rfe5_mm)
lm_rfe_mm = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_mm, y)
predictions_mm['rfe5'] = lm_rfe_mm.predict(X_poly_mm)

X_poly_iqr = poly.fit_transform(X_rfe5_iqr)
lm_rfe_iqr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly_iqr, y)
predictions_iqr['rfe5'] = lm_rfe_iqr.predict(X_poly_iqr)

In [21]:
predictions_std.head()

Unnamed: 0,actual,baseline,MVP,all,rfe2,rfe3,rfe4,rfe5
88,425115,540205.597,493163.828,605164.197,690028.941,607784.171,400785.869,443579.525
4837,614179,540205.597,509488.192,607075.428,602324.61,514800.622,483635.352,425428.708
3411,466643,540205.597,588720.871,737558.46,671226.254,572495.509,596245.0,597636.937
4426,202444,540205.597,338692.11,327237.914,361889.221,344604.237,361512.483,292341.513
3969,179911,540205.597,282465.205,80547.351,247167.925,248081.207,252283.934,248067.038


In [22]:
# now set up the evaluation functions
# I want the RMSE of each model

In [34]:
# std predictions
RMSE_bl_std = model.RMSE(predictions_std.actual, predictions_std.baseline)
RMSE_MVP_std = model.RMSE(predictions_std.actual, predictions_std.MVP)
RMSE_all_std = model.RMSE(predictions_std.actual, predictions_std['all'])
RMSE_rfe2_std = model.RMSE(predictions_std.actual, predictions_std.rfe2)
RMSE_rfe3_std = model.RMSE(predictions_std.actual, predictions_std.rfe3)
RMSE_rfe4_std = model.RMSE(predictions_std.actual, predictions_std.rfe4)
RMSE_rfe5_std = model.RMSE(predictions_std.actual, predictions_std.rfe5)

# evaluation df
eval_std = pd.DataFrame(np.array(['baseline', 'MVP', 'all', 'rfe2', 'rfe3', 'rfe4', 'rfe5']), columns=['models'])
eval_std['model_errors'] = np.array([RMSE_bl_std, RMSE_MVP_std, RMSE_all_std, RMSE_rfe2_std, RMSE_rfe3_std, RMSE_rfe4_std, RMSE_rfe5_std])

In [35]:
# unf predictions
RMSE_bl_unf = model.RMSE(predictions_unf.actual, predictions_unf.baseline)
RMSE_MVP_unf = model.RMSE(predictions_unf.actual, predictions_unf.MVP)
RMSE_all_unf = model.RMSE(predictions_unf.actual, predictions_unf['all'])
RMSE_rfe2_unf = model.RMSE(predictions_unf.actual, predictions_unf.rfe2)
RMSE_rfe3_unf = model.RMSE(predictions_unf.actual, predictions_unf.rfe3)
RMSE_rfe4_unf = model.RMSE(predictions_unf.actual, predictions_unf.rfe4)
RMSE_rfe5_unf = model.RMSE(predictions_unf.actual, predictions_unf.rfe5)

# evaluation df
eval_unf = pd.DataFrame(np.array(['baseline', 'MVP', 'all', 'rfe2', 'rfe3', 'rfe4', 'rfe5']), columns=['models'])
eval_unf['model_errors'] = np.array([RMSE_bl_unf, RMSE_MVP_unf, RMSE_all_unf, RMSE_rfe2_unf, RMSE_rfe3_unf, RMSE_rfe4_unf, RMSE_rfe5_unf])

In [36]:
# gs predictions
RMSE_bl_gs = model.RMSE(predictions_gs.actual, predictions_gs.baseline)
RMSE_MVP_gs = model.RMSE(predictions_gs.actual, predictions_gs.MVP)
RMSE_all_gs = model.RMSE(predictions_gs.actual, predictions_gs['all'])
RMSE_rfe2_gs = model.RMSE(predictions_gs.actual, predictions_gs.rfe2)
RMSE_rfe3_gs = model.RMSE(predictions_gs.actual, predictions_gs.rfe3)
RMSE_rfe4_gs = model.RMSE(predictions_gs.actual, predictions_gs.rfe4)
RMSE_rfe5_gs = model.RMSE(predictions_gs.actual, predictions_gs.rfe5)

# evaluation df
eval_gs = pd.DataFrame(np.array(['baseline', 'MVP', 'all', 'rfe2', 'rfe3', 'rfe4', 'rfe5']), columns=['models'])
eval_gs['model_errors'] = np.array([RMSE_bl_gs, RMSE_MVP_gs, RMSE_all_gs, RMSE_rfe2_gs, RMSE_rfe3_gs, RMSE_rfe4_gs, RMSE_rfe5_gs])

In [37]:
# mm predictions
RMSE_bl_mm = model.RMSE(predictions_mm.actual, predictions_mm.baseline)
RMSE_MVP_mm = model.RMSE(predictions_mm.actual, predictions_mm.MVP)
RMSE_all_mm = model.RMSE(predictions_mm.actual, predictions_mm['all'])
RMSE_rfe2_mm = model.RMSE(predictions_mm.actual, predictions_mm.rfe2)
RMSE_rfe3_mm = model.RMSE(predictions_mm.actual, predictions_mm.rfe3)
RMSE_rfe4_mm = model.RMSE(predictions_mm.actual, predictions_mm.rfe4)
RMSE_rfe5_mm = model.RMSE(predictions_mm.actual, predictions_mm.rfe5)

# evaluation df
eval_mm = pd.DataFrame(np.array(['baseline', 'MVP', 'all', 'rfe2', 'rfe3', 'rfe4', 'rfe5']), columns=['models'])
eval_mm['model_errors'] = np.array([RMSE_bl_mm, RMSE_MVP_mm, RMSE_all_mm, RMSE_rfe2_mm, RMSE_rfe3_mm, RMSE_rfe4_mm, RMSE_rfe5_mm])

In [38]:
# iqr predictions
RMSE_bl_iqr = model.RMSE(predictions_iqr.actual, predictions_iqr.baseline)
RMSE_MVP_iqr = model.RMSE(predictions_iqr.actual, predictions_iqr.MVP)
RMSE_all_iqr = model.RMSE(predictions_iqr.actual, predictions_iqr['all'])
RMSE_rfe2_iqr = model.RMSE(predictions_iqr.actual, predictions_iqr.rfe2)
RMSE_rfe3_iqr = model.RMSE(predictions_iqr.actual, predictions_iqr.rfe3)
RMSE_rfe4_iqr = model.RMSE(predictions_iqr.actual, predictions_iqr.rfe4)
RMSE_rfe5_iqr = model.RMSE(predictions_iqr.actual, predictions_iqr.rfe5)

# evaluation df
eval_iqr = pd.DataFrame(np.array(['baseline', 'MVP', 'all', 'rfe2', 'rfe3', 'rfe4', 'rfe5']), columns=['models'])
eval_iqr['model_errors'] = np.array([RMSE_bl_iqr, RMSE_MVP_iqr, RMSE_all_iqr, RMSE_rfe2_iqr, RMSE_rfe3_iqr, RMSE_rfe4_iqr, RMSE_rfe5_iqr])

In [39]:
eval_std.sort_values(by='model_errors').head(1)

Unnamed: 0,models,model_errors
6,rfe5,437567.212


In [40]:
eval_unf.sort_values(by='model_errors').head(1)

Unnamed: 0,models,model_errors
1,MVP,549375.106


In [41]:
eval_gs.sort_values(by='model_errors').head(1)

Unnamed: 0,models,model_errors
6,rfe5,500617.814


In [42]:
eval_mm.sort_values(by='model_errors').head(1)

Unnamed: 0,models,model_errors
6,rfe5,437567.214


In [43]:
eval_iqr.sort_values(by='model_errors').head(1)

Unnamed: 0,models,model_errors
6,rfe5,437571.974


In [33]:
# The best performing scaled method is std & mm