# Redfin Model Prediction

In [90]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

import numpy as np
import pandas as pd
import sklearn

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingRegressor
from pyearth import Earth

import gc; gc.enable()

import warnings
warnings.filterwarnings("ignore")

Importing currently listed Redfin homes (unsold)

In [91]:
df = pd.read_csv('Redfin_KC_listings.csv')

In [92]:
df.describe()

Unnamed: 0.1,Unnamed: 0,zipcode,price,bedrooms,bathrooms,sqft_living,yr_built,lat,long
count,350.0,350.0,350.0,350.0,309.0,311.0,311.0,350.0,350.0
mean,174.5,98080.645714,726484.3,2.997143,2.313107,2101.21865,1987.990354,47.520318,-122.226675
std,101.180532,107.563003,567981.6,1.676568,0.901648,1023.091424,30.276626,0.14703,0.133173
min,0.0,98001.0,35000.0,0.0,0.75,366.0,1904.0,47.192778,-122.515085
25%,87.25,98030.0,438237.5,2.0,1.75,1254.5,1968.0,47.385974,-122.322955
50%,174.5,98065.0,607474.5,3.0,2.25,2030.0,1994.0,47.541891,-122.265448
75%,261.75,98117.75,799987.5,4.0,2.75,2792.0,2019.0,47.636744,-122.158429
max,349.0,99816.0,4498000.0,13.0,6.0,7435.0,2021.0,47.778571,-121.735239


In [93]:
df.isna().sum()

Unnamed: 0                                                                                     0
ADDRESS                                                                                        0
zipcode                                                                                        0
price                                                                                          0
bedrooms                                                                                       0
bathrooms                                                                                     41
sqft_living                                                                                   39
yr_built                                                                                      39
URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)     0
lat                                                                                            0
long                          

In [94]:
df.dropna(inplace=True)

In [95]:
df['year'] = 2019

In [96]:
df = df[(df['bedrooms'] <= 5) & (df['bedrooms'] >= 1)]

In [97]:
df = df[df['zipcode'] <= 98199.0]

In [98]:
target = 'price'


In [99]:
real_cols = ['sqft_living', 'lat', 'long', 'year']
cat_cols = ['bedrooms', 'bathrooms', 'zipcode']
used_cols =  real_cols + cat_cols

In [100]:
X, y = df[used_cols], df[target]

In [101]:
from pygam.terms import s as spline
from pygam.terms import f as factor
from pygam.terms import te as tensor


# DEFINE TERMS:
#   * splines - for continuous
#   * factors - for categorical/discrete 
#        (assumption: label encoded w/ 0 to level_size-1)
#   * tensors - for any interactions

# create the term list
term_list = []
for i, col in enumerate(used_cols):
    if col in real_cols:
        term_list.append(spline(i))

# add the x1 and x2 interaction term
term_list.append(tensor(1, 2))
term_list.append(tensor(0, 1, 2))
term_list.append(tensor(4, 5))
term_list.append(tensor(1, 2, 6))



# term_list.append(tensor(7, 8, 10))
# term_list.append(tensor(0, 4))
# term_list.append(tensor(0, 1, 2, 3))




# factors
for i, col in enumerate(used_cols):
    if col in cat_cols:
        term_list.append(factor(i))


# create the terms and model
terms = np.sum(term_list)
print(terms)
# gam = LinearGAM(terms=terms).gridsearch(X.values, y.values)
# gam.summary()


s(0) + s(1) + s(2) + s(3) + te(1, 2) + te(0, 1, 2) + te(4, 5) + te(1, 2, 6) + f(4) + f(5) + f(6)


In [102]:
from sklearn.externals import joblib

In [103]:
filename = 'low_col_gams.sav'
gam = joblib.load(filename)
y_preds = gam.predict(X)


In [104]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

print("mean absolute error test: " , mean_absolute_error(y_preds, df[target]))
print("r2 predict test: " , r2_score(y_preds, df[target]))

mean absolute error test:  138176.49551521073
r2 predict test:  0.8514537117667731


In [105]:
df['predict'] = y_preds

In [106]:
df['predicton_diff'] = df['price'] - df['predict']

In [111]:
df2 = df[df['predicton_diff'] < 0].sort_values(by='predicton_diff', ascending=False)

In [82]:
df.iloc[66]

Unnamed: 0        7.100000e+01
zipcode           9.807500e+04
price             1.550000e+06
bedrooms          5.000000e+00
bathrooms         4.000000e+00
sqft_living       4.006000e+03
yr_built          2.020000e+03
lat               4.760035e+01
long             -1.219883e+02
year              2.019000e+03
predict           1.365825e+06
predicton_diff    1.841755e+05
Name: 71, dtype: float64

In [113]:
# df2.to_csv('price_diff.csv')