In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

import os, sys

import gzip
import cPickle as pickle

import calendar

import matplotlib
% matplotlib inline

In [2]:
in_dir = "/home/data/kaggle-zillow/"
out_dir= in_dir + "processed"

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Read in and format data for analysis

In [3]:
# read in raw data

print( "\nReading data from disk ...")
prop = pd.read_csv(in_dir + 'properties_2016.csv')
train = pd.read_csv(in_dir + "train_2016_v2.csv", \
                    parse_dates=["transactiondate"])
sampl = pd.read_csv(in_dir + "sample_submission.csv").rename(
            columns={"ParcelId":"parcelid"})


Reading data from disk ...


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
train.shape, prop.shape, sampl.shape

((90275, 3), (2985217, 58), (2985217, 7))

The only variation in the columns of the test data is across months. So it would make sense to make 6 separate predictions, in which all that changes in the feature matrix is the month column. 

In [5]:
print "\n Reading in variable info ..."

xl = pd.ExcelFile(in_dir + 'zillow_data_dictionary.xlsx')
xl.sheet_names  # see all sheet names

vars_dict = {}

for sheet in xl.sheet_names:
    df = xl.parse(sheet)
    vars_dict[sheet.lower()] = dict(zip(df.iloc[:,0], df.iloc[:,1]))
    
# save dictionary to disc
with gzip.open(out_dir + "/vars_dict.pickle.gz", "w") as f:
    pickle.dump(vars_dict, f)


 Reading in variable info ...


In [10]:
for c in ['propertyzoningdesc', 'propertycountylandusecode', 
                 'censustractandblock', 'rawcensustractandblock']:
    print c, prop[c].nunique(), prop[c].dtype

 propertyzoningdesc 5638 object
propertycountylandusecode 240 object
censustractandblock 96771 float64
rawcensustractandblock 99393 float64


In [38]:
def format_data(df, retain_cols=None):
    train_df = df.copy()
    if retain_cols is not None:
        train_df = train_df[[c for c in train_df.columns if c in retain_cols]]

    # remove variables with too many categories
    # TODO: aggregate each variable into a small number of "clusters" 
    # wrt value of target variable (logerror)
    drop_cols = ['censustractandblock', 'rawcensustractandblock']
    train_df.drop([c for c in train_df.columns if c in drop_cols], 
                  axis=1, inplace=True)            

    # remove variables with no variation
    tr_std = train_df.std()
    train_df.drop(tr_std[tr_std==0].index, axis=1, inplace=True)

    # format categorical variables

    train_df['taxdelinquencyflag'] = train_df['taxdelinquencyflag'].apply(lambda x: 1 if x=='Y' else 0)
    train_df['fireplaceflag'] = train_df['fireplaceflag'].apply(lambda x: 1 if x==True else 0)
    train_df['hashottuborspa'] = train_df['hashottuborspa'].apply(lambda x: 1 if x==True else 0)

    categ_cols = ['parcelid'] + [c for c in train_df.columns 
                      if c in vars_dict.keys() or train_df[c].dtype==np.dtype('O') ]
    
    # some categorical variables are actually codes (FIPS, ZIP, Census Blocks, etc)
    # the problem is that some of these codes are very numerous!
    # perhaps just remove them?
    categ_cols += ['propertyzoningdesc', 'propertycountylandusecode',
                   'fips', 'regionidcity', 'regionidcounty',
                   'regionidneighborhood', 'regionidzip']

    # format date fields
    if 'transactiondate' in train_df.columns:
        train_df['transactionmonth'] = train_df['transactiondate'].apply(\
                                                        lambda x: x.month)
        categ_cols += ['transactionmonth']
        train_df.drop("transactiondate", axis=1, inplace=True)
    categ_cols = list(set(categ_cols))
    
    for c in categ_cols:
        if train_df[c].dtype != np.dtype('O'):
            train_df[c] = train_df[c].apply(lambda x: 0 if np.isnan(x) else int(x)).astype(str)
        else:
            levels = train_df[c].unique()
            levels_dict = dict(zip(levels, range(len(levels))))
            train_df[c] = train_df[c].apply(lambda x: 0 if pd.isnull(x) else levels_dict[x]).astype(str)
        
    # fill missing values for some variables with 0
    # it makes more sense that for attributes like pools and fireplaces,
    # which may not be populated because they don't exist

    vars_fill_zero = ['poolsizesum', 'basementsqft', 'fireplacecnt', 
                      'fullbathcnt', 'garagecarcnt', 'garagetotalsqft',
                      'calculatedbathnbr', 'finishedfloor1squarefeet',
                      'finishedsquarefeet13', 'finishedsquarefeet15', 
                      'finishedsquarefeet50', 'finishedsquarefeet6', 
                      'threequarterbathnbr', 'yardbuildingsqft17',
                      'yardbuildingsqft26'
                      ]
    for c in vars_fill_zero:
        train_df[c] = train_df[c].fillna(0)

    # replace missing values for other variables with column medians
    median_values = train_df.median(axis=0)
    train_df = train_df.fillna(median_values, inplace=True)

    # years from 2016 of property being tax delinquent 
    train_df['taxdelinquencyyear'] = train_df['taxdelinquencyyear'].apply(\
                                        lambda x: 2016-(2000+x if x < 16 else 1900+x))
    
    return train_df

In [48]:
# join in property and score data
train_df = pd.merge(train, prop, on='parcelid', how='left')

# format training data
train_df = format_data(train_df)

train_df.to_csv(out_dir + "/train_formatted.csv", index=False)

# train - eval split
y = train_df['logerror'].values
train_df.drop(["logerror", "parcelid"], axis=1, inplace=True)
features= train_df.columns.values.tolist()
categ_vars = train_df.columns[train_df.dtypes == object].values.tolist()
X = train_df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                        test_size=0.01, random_state=42)

with gzip.open(out_dir + "/train_formatted.pickle.gz", "w") as f:
    pickle.dump({'X_train' : X_train,
                 'X_test'  : X_test,
                 'y_train' : y_train,
                 'y_test'  : y_test,
                 'features': features,
                 'categ_vars': categ_vars},
               f)

In [41]:
for c in train_df.columns:
    if train_df[c].dtype == object:
        print c, train_df[c].nunique()

airconditioningtypeid 7
architecturalstyletypeid 7
fips 3
heatingorsystemtypeid 13
propertycountylandusecode 77
propertylandusetypeid 14
propertyzoningdesc 1996
regionidcity 178
regionidcounty 3
regionidneighborhood 495
regionidzip 389
typeconstructiontypeid 4
transactionmonth 12


In [44]:
# format test data
smpl_cols = ["201610", "201611", "201612", "201710", "201711", "201712"]
test_df = pd.merge(sampl, prop, on='parcelid', how='left').drop(smpl_cols, axis=1)

test_df = format_data(test_df, retain_cols=['parcelid'] + features)

test_df.to_csv(out_dir + "/sample_formatted.csv", index=False)

In [35]:
parcelids = test_df['parcelid'].values
test_df.drop(["parcelid"], axis=1, inplace=True)
features= test_df.columns.values.tolist()
categ_vars = test_df.columns[test_df.dtypes == object].values.tolist()
X = test_df.values

with gzip.open(out_dir + "/sample_formatted.pickle.gz", "w") as f:
    pickle.dump({'X_submit' : X,
                 'features': features,
                 'parcelid': parcelids,
                 'categ_vars': categ_vars},
               f)

KeyboardInterrupt: 

In [None]:
X_train.shape

In [None]:
X.shape

In [None]:
len(features)

In [None]:
set(train_df.columns) - set(features)