In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import seaborn as sns
from collections import defaultdict
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
## Read the Key, logerror, transaction date set
df_parcel = pd.read_csv("c:/Users/ibipul/Documents/Python Scripts/datasets/zillow_data/train_2016_v2.csv")

In [3]:
## Read the attribute set
df_properties = pd.read_csv("c:/Users/ibipul/Documents/Python Scripts/datasets/zillow_data/properties_2016.csv",low_memory=False)

In [4]:
# Merging df_parcel, with df
dfmain = pd.merge(df_parcel,df_properties,on='parcelid')

In [5]:
dfmain.shape

(90275, 60)

In [6]:
dfmain.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016-01-01,1.0,,,2.0,3.0,,4.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016-01-01,,,,3.5,4.0,,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016-01-01,1.0,,,3.0,2.0,,4.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016-01-02,1.0,,,2.0,2.0,,4.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016-01-02,,,,2.5,4.0,,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [7]:
def get_month(transaction_date):
    s=transaction_date.split('-')
    month=int(s[1])
    return month

In [8]:
df=dfmain.copy()

In [9]:
# Adding a new column as month of transaction
df['month'] = df['transactiondate'].apply(lambda x: get_month(x))

In [10]:
## Selecting the interesting variables
data = df[['logerror','month','bathroomcnt','bedroomcnt','roomcnt','fullbathcnt','calculatedfinishedsquarefeet',
          'structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt']]

In [11]:
data.dtypes

logerror                        float64
month                             int64
bathroomcnt                     float64
bedroomcnt                      float64
roomcnt                         float64
fullbathcnt                     float64
calculatedfinishedsquarefeet    float64
structuretaxvaluedollarcnt      float64
taxvaluedollarcnt               float64
landtaxvaluedollarcnt           float64
dtype: object

In [12]:
#Checking for missing data
for i in data.columns:
    print(i, sum(data[i].notnull()*1))

logerror 90275
month 90275
bathroomcnt 90275
bedroomcnt 90275
roomcnt 90275
fullbathcnt 89093
calculatedfinishedsquarefeet 89614
structuretaxvaluedollarcnt 89895
taxvaluedollarcnt 90274
landtaxvaluedollarcnt 90274


In [13]:
#Creating an imputation dictionary
imputation_dict = defaultdict()
for i in data.columns:
    x = sum(data[i].notnull()*1)
    #print(i, x)
    if (90275 -x) !=0:
        imputation_dict[i]=dfmain[i].median()

In [14]:
imputation_dict

defaultdict(None,
            {'calculatedfinishedsquarefeet': 1540.0,
             'fullbathcnt': 2.0,
             'landtaxvaluedollarcnt': 192970.0,
             'structuretaxvaluedollarcnt': 132000.0,
             'taxvaluedollarcnt': 342872.0})

In [15]:
# Imputing missing data with median values of columns
for i in data.columns:
    x = sum(data[i].isnull()*1)
    if x>0:
        data[i].replace(np.nan,imputation_dict[i],inplace=True)
        print("imputation happened for: ", i," with ",imputation_dict[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


imputation happened for:  fullbathcnt  with  2.0
imputation happened for:  calculatedfinishedsquarefeet  with  1540.0
imputation happened for:  structuretaxvaluedollarcnt  with  132000.0
imputation happened for:  taxvaluedollarcnt  with  342872.0
imputation happened for:  landtaxvaluedollarcnt  with  192970.0


In [16]:
## Creating a test-train split
train=data.sample(frac=0.8,random_state=200)
test=data.drop(train.index)

In [17]:
train.shape

(72220, 10)

In [18]:
test.shape

(18055, 10)

In [19]:
# Separating predictors and response variables
# Train set
train_Y = train['logerror']
train_X = train[['month','bathroomcnt','bedroomcnt','roomcnt','calculatedfinishedsquarefeet',
          'structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt']]
#Test set
test_Y = test['logerror']
test_X = test[['month','bathroomcnt','bedroomcnt','roomcnt','calculatedfinishedsquarefeet',
          'structuretaxvaluedollarcnt','taxvaluedollarcnt','landtaxvaluedollarcnt']]

In [20]:
#Create a Linear regression model
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(train_X, train_Y)
# Make predictions using the testing set
logerror_prediction_y = regr.predict(test_X)


In [22]:
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %1.4f"
      % mean_squared_error(test_Y, logerror_prediction_y))
# Explained variance score: 1 is perfect prediction
print('Variance score: %1.4f' % r2_score(test_Y, logerror_prediction_y))



Coefficients: 
 [  4.39884418e-04  -1.87103774e-03  -1.03123479e-04  -1.38306998e-04
   9.28060440e-06  -4.25143140e-07   4.30888364e-07  -4.40738298e-07]
Mean squared error: 0.0261
Variance score: 0.0026
