In [1]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

import numpy as np
import pandas as pd
import sklearn

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingRegressor
from pyearth import Earth

import gc; gc.enable()

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('Cleaned_Housing_Data_vDL.csv')

In [3]:
target = 'price'


In [4]:
real_cols = ['sqft_living', 'lat', 'long',  'sqft_lot', 'year']
cat_cols = ['bedrooms', 'bathrooms', 'grade', 'view', 'condition', 'month', 'yr_built', 'zipcode', 'yr_renovated']
used_cols =  real_cols + cat_cols
# used_cols = [c for c in df.columns.tolist() if c not in [target]]

In [5]:
X, y = df[used_cols], df[target]

In [6]:
df.columns

Index(['Unnamed: 0', 'id', 'date', 'price', 'bedrooms', 'bathrooms',
       'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition',
       'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'year',
       'month'],
      dtype='object')

In [7]:
from pygam.terms import s as spline
from pygam.terms import f as factor
from pygam.terms import te as tensor


# DEFINE TERMS:
#   * splines - for continuous
#   * factors - for categorical/discrete 
#        (assumption: label encoded w/ 0 to level_size-1)
#   * tensors - for any interactions

# create the term list
term_list = []
for i, col in enumerate(used_cols):
    if col in real_cols:
        term_list.append(spline(i))

# add the x1 and x2 interaction term
term_list.append(tensor(1, 2))
term_list.append(tensor(0, 1, 2))
term_list.append(tensor(5, 6))
term_list.append(tensor(1, 2, 12))
# term_list.append(tensor(7, 8, 10))
# term_list.append(tensor(0, 4))
# term_list.append(tensor(0, 1, 2, 3))




# factors
for i, col in enumerate(used_cols):
    if col in cat_cols:
        term_list.append(factor(i))


# create the terms and model
terms = np.sum(term_list)
print(terms)
gc.collect
# gam = LinearGAM(terms=terms).gridsearch(X.values, y.values)
# gam.summary()

s(0) + s(1) + s(2) + s(3) + s(4) + te(1, 2) + te(0, 1, 2) + te(5, 6) + te(1, 2, 12) + f(5) + f(6) + f(7) + f(8) + f(9) + f(10) + f(11) + f(12) + f(13)


<function gc.collect>

In [None]:
# Playing with distribution assumptions and link functions
from pygam import GAM


gam = GAM(terms=terms, distribution='gamma', link='log')
gam.gridsearch(X.values, y.values)
gam.summary()

 90% (10 of 11) |#####################   | Elapsed Time: 0:10:41 ETA:   0:01:05

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0, train_size = .7)
# Define model
# Fit model
# gam.fit(train_X, train_y)
#get predicted prices on validation data
val_predictions = gam.predict(val_X)
train_predictions = gam.predict(train_X)
print("mean absolute error train: " , mean_absolute_error(train_y, train_predictions))
print("mean absolute error test: " , mean_absolute_error(val_y, val_predictions))
print("r2 predict train: " , r2_score(train_y, train_predictions))
print("r2 predict test: " , r2_score(val_y, val_predictions))
#print(cross_val_score(gam, train_X, train_y, cv=3, scoring = 'r2'))


mean absolute error train:  67701.13973314365
mean absolute error test:  68310.64716546224
r2 predict train:  0.8818973591337486
r2 predict test:  0.8788013603039673


In [17]:
from sklearn.externals import joblib

In [18]:
filename = 'all_col_gams.sav'
joblib.dump(gam, filename)

['all_col_gams.sav']