# Creating a Regression Model for the Zillow Data
Given a set of housing-related features and a matching set of training data containing the log error of the Zestimate (the `logerror` column, i.e. log(Zestimate) − log(sale price)), predict the log error for each property.


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Shell out to `ls` and show the raw listing of the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.


In [1]:
# Import Data
# Only the first 100,000 rows of each file are read.
features = pd.read_csv("../input/properties_2016.csv", nrows=100000)
# parse_dates=True would only try to parse the index, so the date column is
# named explicitly to get a real datetime column.
labels = pd.read_csv("../input/train_2016_v2.csv", nrows=100000,
                     parse_dates=["transactiondate"])

# Drop properties where all features are NaNs.
# dropna() returns a new frame, so the result has to be assigned. The join
# key "parcelid" is left out of the all-NaN test; otherwise a populated key
# would keep every row and nothing would ever be dropped.
non_key_columns = [col for col in features.columns if col != "parcelid"]
features = features.dropna(axis='index', how='all', subset=non_key_columns)

# Join Data (inner join: only parcels present in both frames are kept)
df = features.merge(labels, on="parcelid")

# Drop features where all properties are NaNs (again assigning the result).
df = df.dropna(axis='columns', how='all')

# Only the last expression of a cell is displayed, so preview the final frame.
df.head()


## Convert Months to Dummy Variables
Convert the month of each transaction date (a discrete variable) into dummy variables, so that it can be used as a feature and so that results can later be broken down by month.


In [1]:
# Add discrete variables
from datetime import date

# Parse the transaction date and derive its calendar month
transaction_dates = pd.to_datetime(df["transactiondate"], format='%Y-%m-%d')
df["transactiondate"] = transaction_dates
df["month"] = transaction_dates.dt.month

# One indicator column ("month_<n>") per month value present in the data,
# attached to the frame by matching on the row index
month_dummy_variables = pd.get_dummies(df["month"], prefix="month")
df = df.merge(month_dummy_variables, left_index=True, right_index=True)
df.head()


In [1]:
# Convert to Numpy Array
# Columns fed to the model, in the order they appear in the feature matrix
# ("month" is therefore column index 17).
feature_columns = [
    "basementsqft",
    "bathroomcnt",
    "bedroomcnt",
    "lotsizesquarefeet",
    "calculatedfinishedsquarefeet",
    "fireplacecnt",
    "garagecarcnt",
    "garagetotalsqft",
    "poolcnt",
    "poolsizesum",
    "unitcnt",
    "numberofstories",
    "structuretaxvaluedollarcnt",
    "landtaxvaluedollarcnt",
    "taxamount",
    "latitude",
    "longitude",
    "month",
]

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# selecting the columns and taking .values works on old and new versions.
# reindex() yields an all-NaN column for any name missing from df, which the
# nan_to_num() call below then turns into zeros along with every other NaN.
features = df.reindex(columns=feature_columns).values
features = np.nan_to_num(features)

# Double brackets keep the target 2-D, shape (n_samples, 1), as before.
labels = df[["logerror"]].values


In [1]:
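# Split data into training and testing sets. The active code below splits all rows together;
# the per-month split (10/2016, 11/2016, 12/2016) is only sketched in the commented-out code.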
from sklearn.model_selection import train_test_split

# 10/2016
#features_102016 = []
#labels_102016 = []
#count = 0
#for row in features:
#    if row[17] == 10:
#        features_102016.append(row)
#        labels_102016.append(labels[count])
#    count += 1
#features_102016_train, features_102016_test, labels_102016_train, labels_102016_test = \
#    train_test_split(features_102016, labels_102016, test_size=0.30, random_state=42)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(features, labels, test_size=0.30, random_state=42)
features_train, features_test, labels_train, labels_test = split_parts

# Both training arrays should report the same number of rows.
for training_part in (features_train, labels_train):
    print(len(training_part))


In [1]:
# First look
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Ordinary least squares baseline; fit() returns the estimator itself,
# so construction and training can be chained.
regr = LinearRegression().fit(features_train, labels_train)


In [1]:
# First look
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# The fit that used to be here repeated the previous cell line for line.
# `regr` from that cell is already trained on features_train / labels_train,
# so it is reused as-is instead of being re-created and refit.


## Archived Code


In [1]:
# Evaluation Metrics
# Predict once and reuse the result for both the error metric and the residuals.
predictions = regr.predict(features_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
mse = mean_squared_error(labels_test, predictions)
print("model coefficients: ", regr.coef_)
print("mean square error: ",mse)
# LinearRegression.score() reports R^2 (coefficient of determination).
print("Variance score: ",regr.score(features_test,labels_test))

# Residuals: predicted minus actual logerror on the held-out rows.
sd = np.subtract(predictions, labels_test)
plt.hist(np.ravel(sd), bins=100)
plt.title("Residuals on the test set")
plt.xlabel("predicted - actual logerror")
plt.ylabel("count")

# Plot outputs
# Left disabled: features_test has 18 columns, so it cannot be used directly
# as the x-axis of a scatter/line plot.
#plt.scatter(features_test, labels_test,  color='black')
#plt.plot(features_test, regr.predict(features_test), color='blue',
#         linewidth=3)

plt.show()


In [1]:
# Create pipeline for analysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
#from sklearn.feature_selection import SelectKBest
#from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import (LinearRegression, TheilSenRegressor, 
                                  RANSACRegressor, HuberRegressor)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

# Imported here because the cell's import list only provides the stratified variant.
from sklearn.model_selection import ShuffleSplit

# Candidate regressors, including outlier-robust ones. They are defined for
# later comparison; the pipeline below currently uses plain least squares.
estimators = [('OLS', LinearRegression()),
              ('Theil-Sen', TheilSenRegressor(random_state=42)),
              ('RANSAC', RANSACRegressor(random_state=42)),
              ('HuberRegressor', HuberRegressor())]

# Polynomial feature expansion followed by a linear fit.
pipe = make_pipeline(PolynomialFeatures(3), LinearRegression())
#print(pipe.steps[1])

# make_pipeline names each step after its lower-cased class, hence the
# "polynomialfeatures__" prefix. range(3, 4) searches degree 3 only.
params = dict(polynomialfeatures__degree = range(3, 4))

# The target (logerror) is continuous, so the classification-only tools used
# before cannot work here: StratifiedShuffleSplit needs class labels to
# stratify on, and 'f1' is a classification metric. Use a plain shuffled
# split and a regression score (negated MSE, so that higher is better).
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
gs = GridSearchCV(pipe, param_grid=params, cv=cv,
                  scoring='neg_mean_squared_error')
