In [None]:
def convert_type(df):
    """Rescale coordinates, add the tax rate, drop sparse columns, booleanize flags.

    The following steps are applied:

    1. ``latitude`` and ``longitude`` are divided by 1e6.
    2. ``tax_rate`` is added as ``taxamount / taxvaluedollarcnt``.
    3. Every column named in the module-level ``missing_data_features_set``
       is dropped.
    4. House properties include yes/no flags such as "hashottuborspa"
       (whether the house has a hot tub or spa). Each column named in the
       module-level ``flag_features`` is converted to boolean: missing
       values become False, every other value goes through ``bool()``.

    The frame passed in is not modified, so calling this function again on
    the same input gives the same result instead of rescaling the
    coordinates a second time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude``, ``longitude``, ``taxamount``,
        ``taxvaluedollarcnt``, the columns in ``missing_data_features_set``
        and the columns in ``flag_features``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``df``.
    """
    # Compute the derived columns from the caller's frame up front: this
    # reads the source columns before any of them can be dropped and keeps
    # the caller's frame untouched.
    derived = {
        'latitude': df['latitude'] / 1e6,
        'longitude': df['longitude'] / 1e6,
        'tax_rate': df['taxamount'] / df['taxvaluedollarcnt'],
    }

    # ``drop`` returns a new frame (also for an empty list), so from here on
    # ``df`` is a private object.
    df = df.drop(columns=list(missing_data_features_set))

    for name, values in derived.items():
        # A derived column that is itself scheduled for removal stays out.
        if name not in missing_data_features_set:
            df[name] = values

    for col in flag_features:
        df[col] = df[col].fillna(value=False).apply(bool)

    return df

In [None]:
# Feature frames: the labelled splits without the target (`logerror`) and
# without the raw transaction date.
X_labeled_train = df_labeled_train.drop(columns=['logerror', 'transactiondate'])
X_labeled_val = df_labeled_val.drop(columns=['logerror', 'transactiondate'])

In [None]:
# Labels of the categorical columns, as a list usable for column selection.
_categorical_cols = list(categorical_features_names)

# --- training split ---
X_labeled_train = convert_type(X_labeled_train)
X_no_hash_labeled_train = X_labeled_train.copy()
# For CatBoost: cast the categorical columns to str, which turns NaN into a
# category of its own.
X_no_hash_labeled_train[_categorical_cols] = (
    X_no_hash_labeled_train[_categorical_cols].astype(str)
)
X_no_hash_labeled_train.to_pickle(working_dir + 'X_no_hash_labeled_train.pkl')

# --- validation split (same treatment) ---
X_labeled_val = convert_type(X_labeled_val)
X_no_hash_labeled_val = X_labeled_val.copy()
X_no_hash_labeled_val[_categorical_cols] = (
    X_no_hash_labeled_val[_categorical_cols].astype(str)
)
X_no_hash_labeled_val.to_pickle(working_dir + 'X_no_hash_labeled_val.pkl')

del _categorical_cols

# For feature engineering: put the y variables back next to the cleaned
# training features and save the combined frame.
(
    pd.concat(
        [X_no_hash_labeled_train, df_labeled_train[['logerror', 'transactiondate']]],
        axis=1,
    )
    .to_pickle(working_dir + 'df_no_hash_labeled_clean_train.pkl')
)

## Hash Categorical Features ##
The data dictionary clarifies which features are categorical. Now process the training data with the hashing trick. The goal of this block of code is to output a sparse matrix of the transformed features and save it to disk. 

In [None]:
def hash_features(df):
    """Feature-hash the categorical columns and stack them after the continuous ones.

    The columns named in the module-level ``categorical_features_names`` are
    hashed into ``2**20`` sparse columns with ``FeatureHasher``. The columns
    named in the module-level ``continuous_features`` are taken as a dense
    float matrix. Both parts are stacked horizontally, continuous columns
    first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding (a subset of) the categorical and continuous columns;
        ``DataFrame.filter`` silently ignores names that are absent.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per row of ``df``.
    """
    # `import scipy` alone does not guarantee that the `sparse` submodule is
    # loaded, so import it explicitly here.
    import scipy.sparse

    n_features = 2**20

    # FeatureHasher only treats *string* values as categories (it hashes
    # "name=value" with weight 1). A numeric value is used as the magnitude
    # of a single feature "name", which would turn e.g. a zip-code id into
    # one numeric column and let NaN leak into the matrix. Casting to str
    # gives every distinct value, including missing ("nan"), its own
    # hashed category.
    records = (
        df.filter(items=categorical_features_names)
        .astype(str)
        .to_dict(orient='records')
    )
    hashed = FeatureHasher(n_features=n_features).transform(records)

    # The list of per-row dicts is large; release it before building the
    # dense block.
    del records
    gc.collect()

    dense = df.filter(items=continuous_features).values.astype('float')
    # hstack returns COO unless a format is requested; ask for CSR so the
    # result matches the documented return type.
    return scipy.sparse.hstack((dense, hashed), format='csr')

# Hash each labelled split and write the resulting sparse matrix to disk.
for _npz_name, _split in (
    ("X_hashed_remove_miss_labeled_train.npz", X_labeled_train),
    ("X_hashed_remove_miss_labeled_val.npz", X_labeled_val),
):
    scipy.sparse.save_npz(working_dir + _npz_name, hash_features(_split))
del _npz_name, _split

# Everything needed from these frames has been saved; release the memory.
del X_labeled_train, X_labeled_val
del X_no_hash_labeled_train, X_no_hash_labeled_val
gc.collect()

## Apply Process to Test Data ##

For the submission we are asked to predict the target (`logerror`) for every property at six time points (October, November and December of 2016 and of 2017). Each time point is processed separately, using the same processing steps that were applied to the training data.

In [None]:
def make_test_matrix(target_df, year, month, add_hash_features=False):
    """Build and save the test features for one (year, month) period.

    Takes the rows of the module-level frame ``df_properties_<year>`` whose
    index appears in ``target_df.index``, sets ``transaction_year`` and
    ``transaction_month`` on them, runs ``convert_type`` and pickles the
    result to ``working_dir + 'X_no_hash_test_<year><month>.pkl'``.

    When ``add_hash_features`` is true the converted frame is also passed
    through ``hash_features`` and the sparse matrix is saved to
    ``working_dir + 'X_test_all_<year><month>_<subset_num>.npz'``.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Only its index is used, to select the properties to keep.
    year : str
        For example ``'2016'``; used both in names and, via ``int()``, as
        the ``transaction_year`` value.
    month : str
        For example ``'10'``; used both in file names and, via ``int()``,
        as the ``transaction_month`` value.
    add_hash_features : bool, optional
        Also produce and save the hashed sparse matrix. Defaults to False.

    Returns
    -------
    None
        All results are written to disk.

    Raises
    ------
    NameError
        If no module-level ``df_properties_<year>`` exists.
    """
    # Look the yearly properties frame up by name instead of eval()-ing a
    # string of code.
    properties_name = "df_properties_" + year
    try:
        df_properties = globals()[properties_name]
    except KeyError:
        raise NameError("name '" + properties_name + "' is not defined")

    dft = df_properties[df_properties.index.isin(target_df.index)].copy()
    gc.collect()

    dft['transaction_year'] = int(year)
    dft['transaction_month'] = int(month)

    df_no_hash = convert_type(dft)
    # Only the converted frame is needed from here on.
    del dft
    df_no_hash.to_pickle(working_dir + 'X_no_hash_test_' + year + month + '.pkl')

    if add_hash_features:
        features_mat = hash_features(df_no_hash)
        # NOTE(review): `subset_num` is read from module scope and is not
        # defined in the visible part of this file -- confirm it is set
        # before calling with add_hash_features=True.
        scipy.sparse.save_npz(
            working_dir + 'X_test_all_' + year + month + '_' + str(subset_num) + '.npz',
            features_mat,
        )
        del features_mat

    del df_no_hash
    gc.collect()

### Load Test Data ###

In [None]:
# Sample submission file; the first row is the header and the first column
# becomes the index.
sample_submission = pd.read_csv(
    working_dir + 'sample_submission.csv',
    header=0,
    index_col=0,
)

### Apply Data Manipulation without Hashing to Test Data ###

In [None]:
# One un-hashed test frame per prediction period: every combination of the
# two years and the three months, year-major order.
for year, month in [(y, m) for y in ('2016', '2017') for m in ('10', '11', '12')]:
    make_test_matrix(sample_submission, year, month, add_hash_features=False)

### Apply Hashing to Test Data ###

## Load Data ##

In [None]:
# Directory the input CSV files are read from and all outputs are written to.
working_dir = "/home/lee/Documents/Datasets for GitHub/kaggle_zillow_home_value_prediction/"

# Options shared by every read below: first row is the header, first column
# becomes the index, and the file is parsed in one pass (low_memory=False).
_csv_options = dict(header=0, index_col=0, low_memory=False)

# 2016 properties and transactions
df_properties_2016 = pd.read_csv(working_dir + 'properties_2016.csv', **_csv_options)
df_transaction_2016 = pd.read_csv(working_dir + 'train_2016_v2.csv', **_csv_options)

# 2017 properties and transactions
df_properties_2017 = pd.read_csv(working_dir + 'properties_2017.csv', **_csv_options)
df_transaction_2017 = pd.read_csv(working_dir + 'train_2017.csv', **_csv_options)

del _csv_options
gc.collect()

The output of `look_at_dataset` is very long, so the calls below are commented out; uncomment one to inspect a dataset, then comment it out again.

In [None]:
def look_at_dataset(df):
    """Print the shape, a preview and summary statistics of ``df``.

    The preview shows the first five rows and the summary is the output of
    ``describe()``. Both are printed eight columns at a time so that wide
    frames stay readable.

    Returns None; everything goes to standard output.
    """
    chunk_width = 8
    # Start positions of the eight-column groups; a range can be iterated
    # more than once, so it serves both passes below.
    chunk_starts = range(0, len(df.columns), chunk_width)

    print("dataframe shape: {}".format(df.shape))
    print("\n")

    print("preview data: \n")
    for start in chunk_starts:
        stop = start + chunk_width
        print(df.iloc[0:5, start:stop])
    print("\n")

    print("summarize data: \n")
    for start in chunk_starts:
        stop = start + chunk_width
        print(df.iloc[:, start:stop].describe())

# look_at_dataset(df_properties_2016)
# look_at_dataset(df_transaction_2016)
# look_at_dataset(df_properties_2017)
# look_at_dataset(df_transaction_2017)

## Merge Dataframes ##

In [None]:
def join_create_date(year):
    """Join one year's transactions with its properties and add date parts.

    The module-level frames ``df_transaction_<year>`` and
    ``df_properties_<year>`` are left-joined on their index, keeping every
    transaction row. The ``transactiondate`` column (text in
    ``YYYY-MM-DD`` form) is converted to datetime, and two columns are
    derived from it: ``transaction_year`` and ``transaction_month``.

    Parameters
    ----------
    year : str
        For example ``'2016'``; selects which pair of frames to join.

    Returns
    -------
    pandas.DataFrame
        The joined frame with the converted and derived date columns.

    Raises
    ------
    NameError
        If either module-level frame for ``year`` does not exist.
    """
    # Look the yearly frames up by name instead of eval()-ing a string of
    # code.
    frames = []
    for prefix in ("df_transaction_", "df_properties_"):
        frame_name = prefix + year
        try:
            frames.append(globals()[frame_name])
        except KeyError:
            raise NameError("name '" + frame_name + "' is not defined")
    transactions, properties = frames

    df = transactions.join(properties, how='left')

    # Vectorized parse of the whole column rather than strptime per row.
    df['transactiondate'] = pd.to_datetime(df['transactiondate'], format='%Y-%m-%d')
    df['transaction_year'] = df['transactiondate'].dt.year
    df['transaction_month'] = df['transactiondate'].dt.month

    return df

# Joined, date-annotated training frame for each year.
df_train_2016 = join_create_date(year='2016')
df_train_2017 = join_create_date(year='2017')

In [None]:
# The raw transaction tables are no longer needed; release them.
del df_transaction_2016
del df_transaction_2017
gc.collect()

In [None]:
# Stack the yearly training frames row-wise: 2016 rows first, then 2017.
df_train = pd.concat([df_train_2016, df_train_2017], axis=0)
# df_train.to_pickle(working_dir+'df_train_raw_all.pkl')

# The per-year frames are now redundant.
del df_train_2016
del df_train_2017
gc.collect()

## Save Raw Data ##

In [None]:
# Hold out 33% of the labelled rows for validation; the fixed random_state
# makes the split reproducible.
df_labeled_train, df_labeled_val = train_test_split(
    df_train,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Persist both raw labelled splits next to the input data.
for _frame, _pickle_name in (
    (df_labeled_train, 'df_labeled_raw_train.pkl'),
    (df_labeled_val, 'df_labeled_raw_val.pkl'),
):
    _frame.to_pickle(working_dir + _pickle_name)
del _frame, _pickle_name

## Set Target Aside ##

In [None]:
# Save the target column of each split on its own. Nothing is kept in
# memory afterwards, so the series are written straight from the frames.

# training target
df_labeled_train['logerror'].to_pickle(working_dir + 'y_labeled_train.pkl')
gc.collect()

# validation target
df_labeled_val['logerror'].to_pickle(working_dir + 'y_labeled_val.pkl')
gc.collect()

## Drop Columns with High % of Missing ##

In [None]:
missing_threshold = 0.8

# Share of null entries per column of the training split.
_null_fraction = df_labeled_train.isnull().mean()

# Names of all columns whose null share exceeds the threshold.
missing_data_features_set = set(
    _null_fraction.index[_null_fraction > missing_threshold]
)

del missing_threshold, _null_fraction

In [None]:
# all column names, including target ##
columns_all = tuple(df_labeled_train.columns)

# binary column names, convert to boolean in next step
flag_features_set = set(('fireplaceflag', 'hashottuborspa', 'pooltypeid10', 'pooltypeid2', \
                         'pooltypeid7', 'taxdelinquencyflag')) - missing_data_features_set
flag_features = tuple(col for col in columns_all \
                            if (col in flag_features_set) == True)

# categorical 
categorical_features_set = set(('airconditioningtypeid', 'architecturalstyletypeid', \
                                'buildingclasstypeid', 'decktypeid', 'fips', 'heatingorsystemtypeid', \
                                'propertycountylandusecode', 'propertylandusetypeid', \
                                'propertyzoningdesc', 'rawcensustractandblock', 'censustractandblock', \
                                'regionidcounty', 'regionidcity', 'regionidzip', \
                                'regionidneighborhood', 'typeconstructiontypeid', 'assessmentyear', \
                                'taxdelinquencyyear', 'transaction_year', 'transaction_month'))\
                           - missing_data_features_set 
categorical_features_index = list(icol for icol, col in enumerate(columns_all) \
                                  if (col in categorical_features_set) == True)
categorical_features_names = tuple(col for col in columns_all \
                                   if (col in categorical_features_set) == True)

# numercial
continuous_features_set = set(columns_all) - categorical_features_set\
                          - set(['logerror', 'transactiondate']) - missing_data_features_set
continuous_features = tuple(col for col in columns_all \
                            if (col in continuous_features_set) == True)

del flag_features_set, continuous_features_set, categorical_features_set 

# Zillow's Home Value Prediction (Zestimate) #

## Load Packages ##

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy

import gc

from datetime import datetime 

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction import FeatureHasher
               
# Seed NumPy's global random number generator so that code drawing from it
# produces the same values on every run.
np.random.seed(0)