In [1]:
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.3f}'.format
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from statsmodels.formula.api import ols
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

import model
import split_scale

In [2]:
# Load the feature-engineered Zillow data set from the working directory.
# NOTE(review): the loaded frame carries an 'Unnamed: 0' column (see the
# isna() output further down) -- presumably an index that was written out
# with the CSV; confirm, and consider index_col=0 or dropping it.
zillow = pd.read_csv('zillow_FE.csv')

In [None]:
# Disabled clean-up step, kept for reference only: with infinities treated
# as NA, it would drop the rows whose home_sf is missing.
# NOTE(review): this cell never ran (no execution count); delete it if the
# data no longer needs this filter.
#with pd.option_context('mode.use_inf_as_na', True):
#    zillow = zillow.dropna(subset=['home_sf'], how='all')

In [3]:
# Split the full data set into train and test frames with the project helper.
# Assumes the second argument is the share of rows assigned to train --
# TODO confirm against split_scale.split_my_data.
# NOTE(review): no seed is passed here; verify the helper fixes its own
# random state so the split is reproducible.
TRAIN_FRACTION = 0.8
train, test = split_scale.split_my_data(zillow, TRAIN_FRACTION)
train.head()

Unnamed: 0.1,Unnamed: 0,parcelid,neighborhood,value,baths,beds,home_sf,is_extra,county,state,tax_rate,size_ratio
88,88,14195073,0,921394,3.0,4,3099,1,Orange,CA,0.012,0.344
4837,4871,11272452,0,156553,2.0,3,1880,0,Los_Angeles,CA,0.02,0.265
3411,3436,12938788,0,57109,1.0,3,1056,0,Los_Angeles,CA,0.016,0.142
4426,4459,14051836,0,665750,3.0,4,1883,0,Orange,CA,0.011,0.308
3969,3999,10993868,27484,750269,3.0,4,3551,0,Los_Angeles,CA,0.012,0.165


In [4]:
# Count the missing values in every column of the full data set.
missing_per_column = zillow.isna().sum()
missing_per_column

Unnamed: 0      0
parcelid        0
neighborhood    0
value           0
baths           0
beds            0
home_sf         0
is_extra        0
county          0
state           0
tax_rate        0
size_ratio      0
dtype: int64

In [5]:
# Print dtypes and non-null counts for the training split before modelling.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11929 entries, 88 to 13786
Data columns (total 12 columns):
Unnamed: 0      11929 non-null int64
parcelid        11929 non-null int64
neighborhood    11929 non-null int64
value           11929 non-null int64
baths           11929 non-null float64
beds            11929 non-null int64
home_sf         11929 non-null int64
is_extra        11929 non-null int64
county          11929 non-null object
state           11929 non-null object
tax_rate        11929 non-null float64
size_ratio      11929 non-null float64
dtypes: float64(3), int64(7), object(2)
memory usage: 1.2+ MB


In [6]:
# Target and candidate features for the models below.
# The target is the `value` column, kept as a one-column DataFrame.
# The candidate set is wider than baths / beds / square footage: it also
# holds neighborhood, is_extra and size_ratio.
target_col = ['value']
feature_cols = [
    'neighborhood',
    'baths',
    'beds',
    'home_sf',
    'is_extra',
    'size_ratio',
]

y = train[target_col]
X = train[feature_cols]

In [9]:
# Predictions table: one row per training observation.
# 'actual' holds the observed value; 'baseline' is the naive model that
# predicts the mean training value for every row.
predictions = pd.DataFrame({'actual': train.value})
predictions['baseline'] = train.value.mean()
predictions

Unnamed: 0,actual,baseline
88,921394,538422.475
4837,156553,538422.475
3411,57109,538422.475
4426,665750,538422.475
3969,750269,538422.475
...,...,...
14448,322000,538422.475
6289,544000,538422.475
9638,2191923,538422.475
3021,105193,538422.475


In [10]:
# MVP model for comparison: ordinary linear regression on the three core
# features (baths, beds, home_sf). Despite the 'poly' name, no polynomial
# terms are generated here; the name is kept because later cells read
# predictions.poly.
X_poly = train[['baths', 'beds', 'home_sf']]

# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. False was its default, so omitting it fits
# the same model on every version.
lm_poly = LinearRegression().fit(X_poly, y)
predictions['poly'] = lm_poly.predict(X_poly)

In [11]:
# Linear regression on the full candidate feature set held in X.
# `normalize` is omitted: it was removed in scikit-learn 1.2 (TypeError if
# passed) and False was its default, so the fitted model is unchanged.
lm_poly2 = LinearRegression().fit(X, y)
predictions['poly2'] = lm_poly2.predict(X)

In [12]:
# Use RFE, through the project helper model.select_rfe, to pick the top 2
# features of X for predicting y.
# NOTE(review): the estimator and ranking rule live in model.py and are not
# visible here -- confirm before relying on the selection.
model.select_rfe(X, y, 2)

['baths', 'is_extra']


In [14]:
# Use RFE, through the project helper model.select_rfe, to pick the top 3
# features of X for predicting y.
# NOTE(review): the estimator and ranking rule live in model.py and are not
# visible here -- confirm before relying on the selection.
model.select_rfe(X, y, 3)

['baths', 'is_extra', 'size_ratio']


In [15]:
# Use RFE, through the project helper model.select_rfe, to pick the top 4
# features of X for predicting y.
# NOTE(review): the estimator and ranking rule live in model.py and are not
# visible here -- confirm before relying on the selection.
model.select_rfe(X, y, 4)

['baths', 'beds', 'is_extra', 'size_ratio']


In [16]:
# Use RFE, through the project helper model.select_rfe, to pick the top 5
# features of X for predicting y.
# NOTE(review): the estimator and ranking rule live in model.py and are not
# visible here -- confirm before relying on the selection.
model.select_rfe(X, y, 5)

['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio']


In [17]:
# Two-feature model built from the RFE top-2 selection (baths, is_extra).
X_rfe2 = train[['baths', 'is_extra']]

# `normalize` is omitted: it was removed in scikit-learn 1.2 (TypeError if
# passed) and False was its default, so the fitted model is unchanged.
lm_rfe = LinearRegression().fit(X_rfe2, y)
predictions['rfe2'] = lm_rfe.predict(X_rfe2)

In [18]:
# Three-feature model built from the RFE top-3 selection
# (baths, is_extra, size_ratio).
X_rfe3 = train[['baths', 'is_extra', 'size_ratio']]

# `normalize` is omitted: it was removed in scikit-learn 1.2 (TypeError if
# passed) and False was its default, so the fitted model is unchanged.
lm_rfe3 = LinearRegression().fit(X_rfe3, y)
predictions['rfe3'] = lm_rfe3.predict(X_rfe3)

In [None]:
# Four-feature model built from the RFE top-4 selection
# (baths, beds, is_extra, size_ratio).
X_rfe4 = train[['baths', 'beds', 'is_extra', 'size_ratio']]

# `normalize` is omitted: it was removed in scikit-learn 1.2 (TypeError if
# passed) and False was its default, so the fitted model is unchanged.
lm_rfe4 = LinearRegression().fit(X_rfe4, y)
predictions['rfe4'] = lm_rfe4.predict(X_rfe4)

In [None]:
# Five-feature model built from the RFE top-5 selection
# (baths, beds, home_sf, is_extra, size_ratio). The four-feature model is
# already fitted in the preceding cell, so this cell covers the remaining
# RFE selection instead of repeating that fit.
X_rfe5 = train[['baths', 'beds', 'home_sf', 'is_extra', 'size_ratio']]

# `normalize` is omitted: it was removed in scikit-learn 1.2 (TypeError if
# passed) and False was its default.
lm_rfe5 = LinearRegression().fit(X_rfe5, y)
predictions['rfe5'] = lm_rfe5.predict(X_rfe5)

In [None]:
# Correlation-driven alternative: square footage looks more informative than
# the bedroom count, so pair it with the bathroom count.
# The columns in `train` are named 'baths' and 'home_sf'; selecting
# 'bathrooms' / 'square_feet' raises KeyError.
X_corr = train[['baths', 'home_sf']]

# `normalize` is omitted: it was removed in scikit-learn 1.2 (TypeError if
# passed) and False was its default.
lm_corr = LinearRegression().fit(X_corr, y)
predictions['corr'] = lm_corr.predict(X_corr)

In [None]:
# Fit a statsmodels OLS model on the three core features.
# The formula must use the column names that exist in `train`
# (baths, beds, home_sf); the spelled-out names bathrooms / bedrooms /
# square_feet are not columns of this frame and make the formula fail.
ols_model = ols('value ~ baths + beds + home_sf', data=train).fit()

# X holds every column named in the formula, so it can be passed directly.
predictions['ols'] = ols_model.predict(X)

In [None]:
# Display the predictions table with the model columns added so far.
predictions

In [None]:
# now set up the evaluation functions
# I want the RMSE of each model

In [None]:
# Score every prediction column against the observed values with the
# project's RMSE helper. Iterating over the columns that are actually in
# `predictions` keeps this cell in step with the model cells: it never
# reads a column that was not created and never skips one that was.
model_names = [col for col in predictions.columns if col != 'actual']
model_errors = [
    model.RMSE(predictions.actual, predictions[name])
    for name in model_names
]

# One row per model, best (lowest error) first. The table is deliberately
# not called `eval`, which would shadow the Python builtin.
rmse_table = pd.DataFrame({
    'models': model_names,
    'model_errors': model_errors,
})

rmse_table.sort_values(by='model_errors')

In [None]:
# This means that the model made using all 3 values performs the best
# Also, all models performed better than the baseline

In [None]:
# Residual plots for the RFE-selected models (rfe2, rfe3, rfe4), which are
# prediction columns created by the model cells; bath_lm / bed_lm / sf_lm
# are never added to `predictions`.
# NOTE(review): sns.residplot regresses y on x and plots the residuals of
# that fit, which is not the same as (actual - predicted) -- confirm this
# is the intended view.
fig, ax = plt.subplots(figsize=(9, 9))

rfe_series = [('rfe2', 'pink'), ('rfe3', 'yellow'), ('rfe4', 'red')]
for column, color in rfe_series:
    # x / y are passed by keyword: seaborn >= 0.12 rejects positional data.
    sns.residplot(
        x=predictions.actual,
        y=predictions[column],
        color=color,
        label=column,
        ax=ax,
    )

ax.set(title='Residuals: RFE-selected models', xlabel='actual value', ylabel='residual')
ax.legend();

In [None]:
# Residual plots for the multi-feature models: poly, poly2, corr and ols.
# `predictions` has no 'rfe' column (the RFE models are stored as rfe2,
# rfe3 and rfe4), so the full-feature model poly2 takes that slot here.
# NOTE(review): sns.residplot regresses y on x and plots the residuals of
# that fit, which is not the same as (actual - predicted) -- confirm this
# is the intended view.
fig, ax = plt.subplots(figsize=(9, 9))

model_series = [
    ('poly', 'blue'),
    ('poly2', 'green'),
    ('corr', 'purple'),
    ('ols', 'orange'),
]
for column, color in model_series:
    # x / y are passed by keyword: seaborn >= 0.12 rejects positional data.
    sns.residplot(
        x=predictions.actual,
        y=predictions[column],
        color=color,
        label=column,
        ax=ax,
    )

ax.set(title='Residuals: multi-feature models', xlabel='actual value', ylabel='residual')
ax.legend();