In [2]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

import numpy as np
import pandas as pd
import sklearn

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingRegressor
from pyearth import Earth

import gc; gc.enable()

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('Cleaned_Housing_Data_vDL.csv')

In [4]:
target = 'price'


In [5]:
real_cols = ['sqft_living', 'lat', 'long', 'year']
cat_cols = ['bedrooms', 'bathrooms', 'zipcode']
used_cols =  real_cols + cat_cols

In [6]:
X, y = df[used_cols], df[target]

In [7]:
from pygam.terms import s as spline
from pygam.terms import f as factor
from pygam.terms import te as tensor


# DEFINE TERMS:
#   * splines - for continuous
#   * factors - for categorical/discrete 
#        (assumption: label encoded w/ 0 to level_size-1)
#   * tensors - for any interactions

# create the term list
term_list = []
for i, col in enumerate(used_cols):
    if col in real_cols:
        term_list.append(spline(i))

# add the x1 and x2 interaction term
term_list.append(tensor(1, 2))
term_list.append(tensor(0, 1, 2))
term_list.append(tensor(4, 5))
term_list.append(tensor(1, 2, 6))



# term_list.append(tensor(7, 8, 10))
# term_list.append(tensor(0, 4))
# term_list.append(tensor(0, 1, 2, 3))




# factors
for i, col in enumerate(used_cols):
    if col in cat_cols:
        term_list.append(factor(i))


# create the terms and model
terms = np.sum(term_list)
print(terms)
# gam = LinearGAM(terms=terms).gridsearch(X.values, y.values)
# gam.summary()

s(0) + s(1) + s(2) + s(3) + te(1, 2) + te(0, 1, 2) + te(4, 5) + te(1, 2, 6) + f(4) + f(5) + f(6)


In [8]:
# Playing with distribution assumptions and link functions
from pygam import GAM


gam = GAM(terms=terms, distribution='gamma', link='log')
gam.gridsearch(X.values, y.values)
gam.summary()

100% (11 of 11) |########################| Elapsed Time: 0:20:28 Time:  0:20:28


GAM                                                                                                       
Distribution:                         GammaDist Effective DoF:                                    416.2996
Link Function:                          LogLink Log Likelihood:                               -272396.8467
Number of Samples:                        21263 AIC:                                           545628.2925
                                                AICc:                                          545645.0408
                                                GCV:                                                0.0386
                                                Scale:                                              0.0397
                                                Pseudo R-Squared:                                   0.8773
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [0.

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0, train_size = .7)
# Define model
# Fit model
# gam.fit(train_X, train_y)
#get predicted prices on validation data
val_predictions = gam.predict(val_X)
train_predictions = gam.predict(train_X)
print("mean absolute error train: " , mean_absolute_error(train_y, train_predictions))
print("mean absolute error test: " , mean_absolute_error(val_y, val_predictions))
print("r2 predict train: " , r2_score(train_y, train_predictions))
print("r2 predict test: " , r2_score(val_y, val_predictions))
#print(cross_val_score(gam, train_X, train_y, cv=3, scoring = 'r2'))


mean absolute error train:  77393.24177901633
mean absolute error test:  78735.50474902311
r2 predict train:  0.8450948318151117
r2 predict test:  0.8452833620283181


In [10]:
df = pd.read_csv('Zillow_KC_listings.csv')

In [11]:
df['year'] = 2019

In [12]:
X, y = df[used_cols], df[target]

In [13]:
y_preds = gam.predict(X)

In [14]:
print("mean absolute error test: " , mean_absolute_error(y_preds, df[target]))
print("r2 predict test: " , r2_score(y_preds, df[target])

mean absolute error test:  75861.96433087518
r2 predict test:  0.8995228089915434


In [16]:
from sklearn.externals import joblib

In [17]:
filename = 'low_col_gams.sav'
joblib.dump(gam, filename)

['low_col_gams.sav']