In [1]:
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.3f}'.format
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.formula.api import ols
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

import model
import split_scale

In [2]:
zillow = pd.read_csv('zillow_FE.csv')

In [3]:
train, test = split_scale.split_my_data(zillow, 0.8)
train.head()

Unnamed: 0.1,Unnamed: 0,index,parcelid,neighborhood,value,baths,beds,home_sf,is_extra,county,state,tax_rate,size_ratio,factor,sq_feet_proxy6
88,88,137,17129832,0,425115,3.0,6,2901,0,Ventura,CA,0.011,0.395,309.836,9.363
4837,4837,7581,17227544,0,614179,2.0,4,2402,1,Ventura,CA,0.012,0.293,309.836,7.752
3411,3411,5359,11346241,0,466643,3.0,4,2557,1,Los_Angeles,CA,0.015,0.414,309.836,8.253
4426,4426,6933,11216731,0,202444,2.0,3,1503,0,Los_Angeles,CA,0.021,0.193,309.836,4.851
3969,3969,6233,13063534,0,179911,1.0,3,1092,0,Los_Angeles,CA,0.013,0.181,309.836,3.524


In [4]:
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11929 entries, 88 to 13786
Data columns (total 15 columns):
Unnamed: 0        11929 non-null int64
index             11929 non-null int64
parcelid          11929 non-null int64
neighborhood      11929 non-null int64
value             11929 non-null int64
baths             11929 non-null float64
beds              11929 non-null int64
home_sf           11929 non-null int64
is_extra          11929 non-null int64
county            11929 non-null object
state             11929 non-null object
tax_rate          11929 non-null float64
size_ratio        11929 non-null float64
factor            11929 non-null float64
sq_feet_proxy6    11929 non-null float64
dtypes: float64(5), int64(8), object(2)
memory usage: 1.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2983 entries, 2145 to 3683
Data columns (total 15 columns):
Unnamed: 0        2983 non-null int64
index             2983 non-null int64
parcelid          2983 non-null int64
neighbo

(None, None)

In [5]:
# set up train features
y = train[['value']]
X = train[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]

In [6]:
# Create predictions dataframe
predictions = pd.DataFrame({
    'actual': train.value,
    'baseline': train.value.mean()
})
predictions

Unnamed: 0,actual,baseline
88,425115,540205.597
4837,614179,540205.597
3411,466643,540205.597
4426,202444,540205.597
3969,179911,540205.597
...,...,...
14448,360751,540205.597
6289,343834,540205.597
9638,431817,540205.597
3021,431481,540205.597


In [38]:
# MVP model for comparison
X = train[['baths', 'beds', 'home_sf']]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
lm_poly = LinearRegression(normalize=False).fit(X_poly, y)
predictions['poly'] = lm_poly.predict(X_poly)

In [42]:
X = train[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]

poly = PolynomialFeatures(degree=6)
X_poly = poly.fit_transform(X)
lm_all = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X, y)
predictions['all'] = lm_all.predict(X)

In [43]:
# use RFE to select top 2
model.select_rfe(X, y, 2)

['baths', 'is_extra']


In [44]:
# use RFE to select top 3
model.select_rfe(X, y, 3)

['baths', 'beds', 'is_extra']


In [45]:
# use RFE to select top 4
model.select_rfe(X, y, 4)

['baths', 'beds', 'is_extra', 'size_ratio']


In [46]:
# use RFE to select top 5
model.select_rfe(X, y, 5)

['baths', 'beds', 'is_extra', 'size_ratio', 'sq_feet_proxy6']


In [47]:
X_rfe2 = train[['baths', 'is_extra']]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_rfe2)
lm_rfe = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly, y)
predictions['rfe2'] = lm_rfe.predict(X_poly)

In [48]:
X_rfe3 = train[['baths', 'beds', 'is_extra']]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_rfe3)
lm_rfe3 = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly, y)
predictions['rfe3'] = lm_rfe3.predict(X_poly)

In [49]:
X_rfe4 = train[['baths', 'beds', 'is_extra', 'size_ratio']]

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X_rfe4)
lm_rfe4 = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly, y)
predictions['rfe4'] = lm_rfe4.predict(X_poly)

In [50]:
X_rfe5 = train[['baths', 'beds', 'is_extra', 'size_ratio', 'sq_feet_proxy6']]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X_rfe5)
lm_rfe5 = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False).fit(X_poly, y)
predictions['rfe5'] = lm_rfe5.predict(X_poly)

In [51]:
predictions.head()

Unnamed: 0,actual,baseline,poly,all,rfe2,rfe3,rfe4,rfe5
88,425115,540205.597,493163.828,605164.197,601731.611,549947.186,446874.107,408103.994
4837,614179,540205.597,509488.192,607075.428,450213.369,378425.019,469910.494,442547.578
3411,466643,540205.597,588720.871,737558.46,727880.78,677751.23,683197.728,720154.274
4426,202444,540205.597,338692.11,327237.914,346348.376,360534.776,321575.049,302047.82
3969,179911,540205.597,282465.205,80547.351,300728.754,304313.877,297100.785,244705.65


In [52]:
# now set up the evaluation functions
# I want the RMSE of each model

In [53]:
RMSE_bl = model.RMSE(predictions.actual, predictions.baseline)
RMSE_poly = model.RMSE(predictions.actual, predictions.poly)
RMSE_all = model.RMSE(predictions.actual, predictions['all'])
RMSE_rfe2 = model.RMSE(predictions.actual, predictions.rfe2)
RMSE_rfe3 = model.RMSE(predictions.actual, predictions.rfe3)
RMSE_rfe4 = model.RMSE(predictions.actual, predictions.rfe4)
RMSE_rfe5 = model.RMSE(predictions.actual, predictions.rfe5)

In [54]:
eval = pd.DataFrame(np.array(['baseline', 'poly', 'all', 'rfe2', 'rfe3', 'rfe4', 'rfe5']), columns=['models'])
eval['model_errors'] = np.array([RMSE_bl, RMSE_poly, RMSE_all, RMSE_rfe2, RMSE_rfe3, RMSE_rfe4, RMSE_rfe5, ])

eval.sort_values(by='model_errors')

Unnamed: 0,models,model_errors
6,rfe5,494902.821
1,poly,549375.106
5,rfe4,577801.509
2,all,588134.613
4,rfe3,594801.105
3,rfe2,602534.384
0,baseline,745471.923


In [None]:
# This means that the model made using 5 features performs the best
# Also, all models performed better than the baseline