In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show what is inside the input directory so the available data files are
# visible in the cell output.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.


In [1]:
# Import Data
# Only the first 100,000 rows of each file are read.
features = pd.read_csv("../input/properties_2016.csv", nrows=100000)
labels = pd.read_csv("../input/train_2016_v2.csv", nrows=100000)

# Drop properties where all features are NaNs.
# dropna() returns a new frame instead of modifying its input, so the result
# must be assigned back for the filter to take effect. The join key is left
# out of the check: it identifies a property rather than describing it, and a
# populated key would otherwise keep a feature-less row from being dropped.
non_key_columns = [col for col in features.columns if col != "parcelid"]
features = features.dropna(axis="index", how="all", subset=non_key_columns)

# Join Data: keep the properties that have a label (merge defaults to an
# inner join on the shared "parcelid" column).
df = features.merge(labels, on="parcelid")

# Preview the joined data without the features that are NaN for every property.
# The result is not assigned back to `df`: the conversion cell that follows
# selects a fixed list of columns by name, and a sparse column that happens to
# be entirely NaN in this sample would then be missing from the frame.
df.dropna(axis="columns", how="all").head()


In [1]:
# Convert to Numpy Array
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# Selecting the columns and reading .values yields the same array and works
# on both old and current pandas releases.
model_columns = [
    "basementsqft",
    "bathroomcnt",
    "bedroomcnt",
    "lotsizesquarefeet",
    "calculatedfinishedsquarefeet",
    "fireplacecnt",
    "garagecarcnt",
    "garagetotalsqft",
    "poolcnt",
    "poolsizesum",
    "unitcnt",
    "numberofstories",
    "structuretaxvaluedollarcnt",
    "landtaxvaluedollarcnt",
    "taxamount",
]

# nan_to_num replaces every missing feature value with 0.
features = np.nan_to_num(df[model_columns].values)

# Double brackets keep the target as a single-column 2-D array, the same
# shape the original as_matrix(columns=["logerror"]) call produced.
labels = df[["logerror"]].values


In [1]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(features, labels, test_size=0.30, random_state=42)
features_train, features_test, labels_train, labels_test = split_parts

# Number of rows that ended up in the training split.
print(len(labels_train))


In [1]:
# First look: ordinary least squares on the raw features as a baseline.
from sklearn.linear_model import LinearRegression
# mean_squared_error is imported here because this cell runs before the
# pipeline cell that also imports it; without this line a fresh
# "Restart & Run All" stops with a NameError.
from sklearn.metrics import mean_squared_error

regr = LinearRegression()
regr = regr.fit(features_train, labels_train)

# Predict once and reuse the result for both the metric and the plot.
predictions_test = regr.predict(features_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
mse = mean_squared_error(labels_test, predictions_test)
print("model ceofficients: ", regr.coef_)
print("mean square error: ", mse)
# LinearRegression.score returns the coefficient of determination (R^2).
print("Variance score: ", regr.score(features_test, labels_test))

# Plot outputs
# Compare predictions with the true targets. Plotting the full multi-column
# feature matrix against the predictions draws one tangled line per feature
# and is not readable.
fig, ax = plt.subplots()
ax.scatter(labels_test, predictions_test, color='blue', s=5)
ax.set(xlabel="actual logerror",
       ylabel="predicted logerror",
       title="Linear regression: predicted vs. actual (test split)")

plt.show()


In [1]:
# Create pipeline for analysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import (LinearRegression, TheilSenRegressor,
                                  RANSACRegressor, HuberRegressor)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

# Candidate regressors. The grid search below does not reference this list;
# presumably it is meant for a later comparison -- TODO confirm.
estimators = [('OLS', LinearRegression()),
              ('Theil-Sen', TheilSenRegressor(random_state=42)),
              ('RANSAC', RANSACRegressor(random_state=42)),
              ('HuberRegressor', HuberRegressor())]

# Polynomial feature expansion followed by ordinary least squares.
pipe = make_pipeline(PolynomialFeatures(3), LinearRegression())

# make_pipeline names each step after its lower-cased class name, hence the
# "polynomialfeatures__" prefix. range(3, 4) contains only degree 3.
params = dict(polynomialfeatures__degree=range(3, 4))

# The target (logerror) is continuous, so the cross-validation must be set up
# for regression: stratified splitting needs discrete class labels and 'f1'
# is a classification metric, and both fail on a continuous target. Use plain
# shuffled splits and score by negated mean squared error (higher is better).
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
gs = GridSearchCV(pipe, param_grid=params, cv=cv,
                  scoring='neg_mean_squared_error')


In [1]:
# What do the labels look like? They look roughly normally distributed,
# with maybe one outlier.
fig, ax = plt.subplots()
# Flatten the single-column target so it is drawn as one data series.
ax.hist(np.ravel(labels_train), bins=100)
ax.set(xlabel="logerror",
       ylabel="number of properties",
       title="Distribution of training labels")
# plt.show() renders the figure without echoing the (counts, bins, patches)
# tuple that a bare hist() call leaves as the cell output.
plt.show()


In [1]:
# Run the grid search on the training split, then keep the best pipeline it
# found. Despite the name `clf`, this is a regression pipeline.
gs = gs.fit(
    features_train,
    labels_train,
)
best_pipeline = gs.best_estimator_
clf = best_pipeline
