In [3]:
# standard
import pandas as pd
import numpy as np
import random

# tf and keras
import tensorflow as tf
#import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# plots
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

# xgboost
import xgboost as xgb
import optuna
import optuna.visualization as vis
from functools import partial

random.seed(42)

In [17]:
# Read and clean dataframe

def create_dataframe(properties, train_2016, train_2017):

    # Get properties data
    df_properties = pd.read_csv(properties)

    # Get logerrors of transactions and merge
    df_2016 = pd.read_csv(train_2016, parse_dates=["transactiondate"])
    df_2017 = pd.read_csv(train_2017, parse_dates=["transactiondate"])
    df_logs = pd.concat([df_2016, df_2017])

    # Inner join transactions with the dataset of all properties
    df_all = pd.merge(df_logs, df_properties, on='parcelid', how='inner')

    return df_all


def clean_augment(df):

    # Remove some rows missing important data
    df = df.dropna(subset=['regionidcounty','landtaxvaluedollarcnt', 'taxamount', 
                               'regionidzip', 'structuretaxvaluedollarcnt'])

    # Now we read in the housing index for LA County, and add the index for each month
    dfh = pd.read_csv("https://docs.google.com/spreadsheets/d/1mk0WRuUrXjD9VlJa9EMavjDuWVfwPSeJJ22JUBoJaLU/pub?gid=0&single=true&output=csv")
    dfh.DATE = pd.to_datetime(dfh.DATE)
    df.transactiondate = pd.to_datetime(df.transactiondate)
    df['transactionmonth'] =  df['transactiondate'].dt.strftime('%Y%m')
    dfh.DATE = dfh['DATE'].dt.strftime('%Y%m')
    df = pd.merge(df, dfh, left_on='transactionmonth', right_on='DATE', how='left')
    df = df.drop(columns='DATE')

    # Get month, year, weekday
    df['month'] = df.transactionmonth.str[4:]
    df['year'] = df.transactionmonth.str[:-2]
    df['weekday'] = df.transactiondate.dt.day_of_week

    return df

def create_test_val(df):

    # Select only certain features from full dataset
    X = df[['bedroomcnt','roomcnt','bathroomcnt','taxamount','landtaxvaluedollarcnt',
            'taxvaluedollarcnt','structuretaxvaluedollarcnt', 'latitude', 'longitude', 
            'LXXRNSA', 'month', 'year', 'weekday', 'lotsizesquarefeet', 
            'calculatedfinishedsquarefeet', 'yearbuilt',
            ]]
    Y = df.logerror

    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=1234)

    # Applying standardization to inputs

    numeric_columns = ['bedroomcnt', 'roomcnt', 'bathroomcnt', 'taxamount',
        'landtaxvaluedollarcnt', 'taxvaluedollarcnt',
        'structuretaxvaluedollarcnt', 'latitude', 'longitude', 'LXXRNSA', 
            'lotsizesquarefeet', 'calculatedfinishedsquarefeet', 'yearbuilt',
            ]

    # Standardize numeric columns
    sc_x = StandardScaler()
    X_train_std = X_train.copy()
    X_val_std = X_val.copy()

    X_train_std[numeric_columns] = sc_x.fit(X_train[numeric_columns]).transform(X_train[numeric_columns])
    X_val_std[numeric_columns] = sc_x.fit(X_train[numeric_columns]).transform(X_val[numeric_columns])

    # Applying standardization to outputs
    Y_train_std = (Y_train - Y_train.mean())/Y_train.std()
    Y_val_std = (Y_val - Y_train.mean())/Y_train.std()

    # Mask missing data in columns - helps a tiny bit
    mask_value = -999
    X_train_std = X_train_std.fillna(mask_value)
    X_val_std = X_val_std.fillna(mask_value)

    y_mean = Y_train.mean()
    y_std = Y_train.std()

    return X_train_std, X_val_std, Y_train_std, Y_val_std, y_mean, y_std, X_train


def get_loss(y_true, y_pred):
    """This function provides mean absolute error"""

    return tf.keras.losses.MAE(y_true, y_pred).numpy()


def build_model(lr, resolution_in_degrees):
    """Use Keras functional API to create neural network model"""
    
    tf.keras.backend.clear_session()
    tf.random.set_seed(1234)
    random.seed(42)
    mask_value = -999
    

    bedroomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='bedroomcnt')
    roomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='roomcnt')
    bathroomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='bathroomcnt')
    taxamount = layers.Input(shape=(1,), dtype=tf.float32, name='taxamount')
    landtaxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='landtaxvaluedollarcnt')
    taxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='taxvaluedollarcnt')
    structuretaxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='structuretaxvaluedollarcnt')
    latitude = layers.Input(shape=(1,), dtype=tf.float32, name='latitude')
    longitude = layers.Input(shape=(1,), dtype=tf.float32, name='longitude')
    lxxrnsa = layers.Input(shape=(1,), dtype=tf.float32, name='lxxrnsa')
    month = layers.Input(shape=(1,), dtype=tf.string, name='month')
    year = layers.Input(shape=(1,), dtype=tf.string, name='year')
    weekday = layers.Input(shape=(1,), dtype=tf.int64, name='weekday')

    
    lotsizesquarefeet = layers.Input(shape=(1,), dtype=tf.float32, name='lotsizesquarefeet')
    lotsizemask = layers.Masking(mask_value=mask_value)(lotsizesquarefeet)

    calculatedfinishedsquarefeet = layers.Input(shape=(1,), dtype=tf.float32, name='calculatedfinishedsquarefeet')
    finishedsqftmask = layers.Masking(mask_value=mask_value)(calculatedfinishedsquarefeet)

    yearbuilt = layers.Input(shape=(1,), dtype=tf.float32, name='yearbuilt')
    yearblt = layers.Masking(mask_value=mask_value)(yearbuilt)

    bedroomcnt_masked = layers.Masking(mask_value=mask_value)(bedroomcnt)
    roomcnt_masked = layers.Masking(mask_value=mask_value)(roomcnt)
    bathroomcnt_masked = layers.Masking(mask_value=mask_value)(bathroomcnt)
    taxamount_masked = layers.Masking(mask_value=mask_value)(taxamount)
    landtaxvaluedollarcnt_masked = layers.Masking(mask_value=mask_value)(landtaxvaluedollarcnt)
    taxvaluedollarcnt_masked = layers.Masking(mask_value=mask_value)(taxvaluedollarcnt)
    structuretaxvaluedollarcnt_masked = layers.Masking(mask_value=mask_value)(structuretaxvaluedollarcnt)

    # One hot encode month, year and weekday
    month_id = tf.keras.layers.StringLookup(
      vocabulary=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], 
      output_mode='one_hot')(month)
    
    year_id = tf.keras.layers.StringLookup(
      vocabulary=['2016', '2017'], 
      output_mode='one_hot')(year)   

    weekday_id = tf.keras.layers.IntegerLookup(
      vocabulary=[0,1,2,3,4,5,6], 
      output_mode='one_hot')(weekday) 


    # Create a list of numbers representing the bucket boundaries for latitude.
    latitude_boundaries = list(np.arange(-3, 3 + resolution_in_degrees, resolution_in_degrees))

    # Create a Discretization layer to separate the latitude data into buckets.
    latitude_discretized = tf.keras.layers.Discretization(
        bin_boundaries=latitude_boundaries,
        name='discretization_latitude')(latitude)

    # Create a list of numbers representing the bucket boundaries for longitude.
    longitude_boundaries = list(np.arange(-3, 3 + resolution_in_degrees, resolution_in_degrees))

    # Create a Discretization layer to separate the longitude data into buckets.
    longitude_discretized = tf.keras.layers.Discretization(
        bin_boundaries=longitude_boundaries,
        name='discretization_longitude')(longitude)

    # Cross the latitude and longitude features into a single one-hot vector.
    feature_cross = tf.keras.layers.HashedCrossing(
        num_bins=len(latitude_boundaries) * len(longitude_boundaries),
        output_mode='one_hot',
        name='cross_latitude_longitude')([latitude_discretized, longitude_discretized])

    features = layers.Concatenate()([
                    bedroomcnt_masked, 
                    roomcnt_masked, 
                    bathroomcnt_masked, 
                    taxamount_masked, 
                    landtaxvaluedollarcnt_masked, 
                    taxvaluedollarcnt_masked, 
                    structuretaxvaluedollarcnt_masked,
                    feature_cross,
                    lxxrnsa,
                    month_id,
                    year_id,
                    weekday_id,
                    lotsizemask,
                    finishedsqftmask,
                    yearblt,
    ]) 

    x = layers.Dense(units=600, kernel_initializer='normal', activation='relu')(features)
    x = layers.Dropout(0.36)(x)
    x = layers.Dense(units=200, kernel_initializer='normal', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.6)(x)
    x = layers.Dense(1, kernel_initializer='normal')(x)
    x = layers.Dense(1, kernel_initializer='normal')(x)

    logerror = tf.keras.layers.Dense(
        units=1, activation='linear', name='logerror')(x)

    model = tf.keras.Model(inputs=[
        bedroomcnt, 
        roomcnt, 
        bathroomcnt, 
        taxamount, 
        landtaxvaluedollarcnt, 
        taxvaluedollarcnt, 
        structuretaxvaluedollarcnt,
        latitude,
        longitude,
        lxxrnsa,
        month,
        year,
        weekday,
        lotsizesquarefeet,
        calculatedfinishedsquarefeet,
        yearbuilt,
    ], outputs=logerror)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mae',
        metrics=['mae'])

    return model

# Define the objective function
def objective(trial, train_x, Y_train_std, val_x, Y_val_std):
    """Trial various hyperparameters for neural network model. Return validation MAE."""

    lr = trial.suggest_float("lr", 0.0005, 0.005)
    resolution_in_degrees = trial.suggest_float("resolution_in_degrees", 0.05, 0.7)
    epochs = trial.suggest_int("epochs", 5, 10)
    batch = trial.suggest_int("batch", 4000, 10000)

    random.seed(42)
    tf.random.set_seed(1234)    

    model = build_model(lr, resolution_in_degrees)
  
    # Train the model
    history = model.fit(
    x=train_x,
    y=Y_train_std,
    epochs= epochs,
    batch_size=batch,
    validation_data=(val_x, Y_val_std),
    verbose = 0,
)
    
    # Return the final validation MAE
    val_mae = history.history['val_mae'][-1]
    return val_mae




In [13]:
df = create_dataframe("./data/properties_2017.csv", "./data/train_2016_v2.csv", "./data/train_2017.csv")

df = clean_augment(df)

X_train_std, X_val_std, Y_train_std, Y_val_std, y_mean, y_std, X_train = create_test_val(df)


print(get_loss(Y_val_std, y_pred=np.zeros(len(Y_val_std))))

train_x = {
        'bedroomcnt': X_train_std[['bedroomcnt']],
        'roomcnt': X_train_std[['roomcnt']],
        'bathroomcnt': X_train_std[['bathroomcnt']],
        'taxamount': X_train_std[['taxamount']],
        'landtaxvaluedollarcnt': X_train_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_train_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_train_std[['structuretaxvaluedollarcnt']],
        'latitude': X_train_std[['latitude']],
        'longitude': X_train_std[['longitude']],
        'lxxrnsa': X_train_std[['LXXRNSA']],
        'year': X_train_std[['year']],  
        'month': X_train_std[['month']],  
        'weekday': X_train_std[['weekday']],
        'lotsizesquarefeet': X_train_std[['lotsizesquarefeet']],
        'calculatedfinishedsquarefeet': X_train_std[['calculatedfinishedsquarefeet']],
        'yearbuilt': X_train_std[['yearbuilt']],

    }

val_x = {
        'bedroomcnt': X_val_std[['bedroomcnt']],
        'roomcnt': X_val_std[['roomcnt']],
        'bathroomcnt': X_val_std[['bathroomcnt']],
        'taxamount': X_val_std[['taxamount']],
        'landtaxvaluedollarcnt': X_val_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_val_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_val_std[['structuretaxvaluedollarcnt']],
        'latitude': X_val_std[['latitude']],
        'longitude': X_val_std[['longitude']],
        'lxxrnsa': X_val_std[['LXXRNSA']],
        'year': X_val_std[['year']], 
        'month': X_val_std[['month']], 
        'weekday': X_val_std[['weekday']],
        'lotsizesquarefeet': X_val_std[['lotsizesquarefeet']],
        'calculatedfinishedsquarefeet': X_val_std[['calculatedfinishedsquarefeet']],
        'yearbuilt': X_val_std[['yearbuilt']],
        }

# Create a study object and optimize the objective function
objective_partial = partial(objective, train_x=train_x, Y_train_std=Y_train_std, val_x=val_x, Y_val_std=Y_val_std)
study = optuna.create_study(direction='minimize')
study.optimize(objective_partial, n_trials=100, timeout=900)

# Print the best parameters
print(f"Best Parameters: {study.best_params}")

# Extracting the best parameters from the study
best_params = study.best_params

# Unpacking the best parameters into individual variables
lr = best_params["lr"]
resolution_in_degrees = best_params["resolution_in_degrees"]
epochs = best_params["epochs"]
batch = best_params["batch"]

model = build_model(lr, resolution_in_degrees)
#model.summary()
#display(tf.keras.utils.plot_model(model))
random.seed(42)
tf.random.set_seed(1234)

# Fit model
history = model.fit(
    x=train_x,
    y=Y_train_std,
    epochs=epochs,
    batch_size=batch,
    validation_data=(val_x, Y_val_std)
)

  df_properties = pd.read_csv(properties)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.transactiondate = pd.to_datetime(df.transactiondate)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['transactionmonth'] =  df['transactiondate'].dt.strftime('%Y%m')
[I 2024-08-09 19:54:23,806] A new study created in memory with name: no-name-5274b549-50b8-446f-8dab-7b4d28488d26


0.4320137920376952



[I 2024-08-09 19:55:01,770] Trial 0 finished with value: 0.42657366394996643 and parameters: {'lr': 0.0006131942188513374, 'resolution_in_degrees': 0.41857442553536406, 'epochs': 6, 'batch': 4068}. Best is trial 0 with value: 0.42657366394996643.
[I 2024-08-09 19:59:36,957] Trial 1 finished with value: 0.4266696870326996 and parameters: {'lr': 0.0012454424441563708, 'resolution_in_degrees': 0.06939509839075729, 'epochs': 8, 'batch': 7550}. Best is trial 0 with value: 0.42657366394996643.
[I 2024-08-09 20:00:22,427] Trial 2 finished with value: 0.42691320180892944 and parameters: {'lr': 0.0012030960791913946, 'resolution_in_degrees': 0.6223482008844723, 'epochs': 10, 'batch': 9362}. Best is trial 0 with value: 0.42657366394996643.
[I 2024-08-09 20:01:31,108] Trial 3 finished with value: 0.42654189467430115 and parameters: {'lr': 0.0027568720769117973, 'resolution_in_degrees': 0.14623292035427857, 'epochs': 6, 'batch': 9846}. Best is trial 3 with value: 0.42654189467430115.
[I 2024-08-09

Best Parameters: {'lr': 0.0036659881923132654, 'resolution_in_degrees': 0.11062887130622294, 'epochs': 9, 'batch': 6579}
Epoch 1/9
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 645ms/step - loss: 0.4279 - mae: 0.4279 - val_loss: 0.4272 - val_mae: 0.4272
Epoch 2/9
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 560ms/step - loss: 0.4244 - mae: 0.4244 - val_loss: 0.4268 - val_mae: 0.4268
Epoch 3/9
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 675ms/step - loss: 0.4239 - mae: 0.4239 - val_loss: 0.4264 - val_mae: 0.4264
Epoch 4/9
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 617ms/step - loss: 0.4237 - mae: 0.4237 - val_loss: 0.4264 - val_mae: 0.4264
Epoch 5/9
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 769ms/step - loss: 0.4236 - mae: 0.4236 - val_loss: 0.4260 - val_mae: 0.4260
Epoch 6/9
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 786ms/step - loss: 0.4234 - mae: 0.4234 - va

In [21]:
val_preds = model.predict(val_x)

# Convert to regular scale from scaled-standardized scale and print loss
val_preds= (val_preds[:,0]*y_std) + y_mean
Y_val = (Y_val_std*y_std) + y_mean
print(get_loss(y_pred=val_preds, y_true=Y_val))

[1m1046/1046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step
0.068795204


### XGBoost

In [22]:
def xgboost_train(d_train, watchlist, best_opts):   

    clf = xgb.train(
    best_opts,
    d_train,
    num_boost_round=1500,  # Choose a suitable number of boosting rounds
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=10
)

    return clf

# Define the objective function
def objective_xg(trial, d_train, watchlist):
    """Tune hyperparameters for XGBoost model"""
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'eta': trial.suggest_float('eta', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
    }
        
    # Train the model
    model = xgb.train(params, d_train, num_boost_round=1000, evals=watchlist,
                      early_stopping_rounds=100, verbose_eval=False)
    
    # Return the best validation MAE
    val_mae = model.best_score
    return val_mae

### Hyperparameter Tuning and run XGBoost

In [24]:
# Get train and validation predictions from neural network as inputs for XGBoost

train_preds = model.predict(train_x)
val_preds = model.predict(val_x)

# Create XGBoost matrices
d_train = xgb.DMatrix(train_preds, label = Y_train_std)
d_valid = xgb.DMatrix(val_preds, label = Y_val_std)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Create a study object and optimize the objective function
objective_partial = partial(objective_xg, d_train=d_train, watchlist=watchlist)
study = optuna.create_study(direction='minimize')
study.optimize(objective_partial, n_trials=100, timeout=600)

# Print the best parameters
print(f"Best Parameters: {study.best_params}")

best_opts= study.best_params
best_opts['eval_metric'] = "mae"
clf = xgboost_train(d_train, watchlist, best_opts)

# Get final loss, it's near-identical
xgb_val = (clf.predict(d_valid)*y_std) + y_mean
get_loss(y_pred=xgb_val, y_true=Y_val)

[I 2024-08-09 20:15:19,150] A new study created in memory with name: no-name-919e00c6-22c5-478e-86da-c0220af5a46d
[I 2024-08-09 20:15:22,812] Trial 0 finished with value: 0.42938613166157763 and parameters: {'eta': 0.08453015953529507, 'max_depth': 14, 'subsample': 0.9413850995100834, 'colsample_bytree': 0.8049683030578855, 'lambda': 0.0009959786037431447, 'alpha': 0.00018946646180184604}. Best is trial 0 with value: 0.42938613166157763.
[I 2024-08-09 20:15:26,224] Trial 1 finished with value: 0.42926785434453424 and parameters: {'eta': 0.021589700559158577, 'max_depth': 11, 'subsample': 0.7479367992431482, 'colsample_bytree': 0.6685829909018006, 'lambda': 2.473902102439808e-08, 'alpha': 0.00013482539439948433}. Best is trial 1 with value: 0.42926785434453424.
[I 2024-08-09 20:15:28,600] Trial 2 finished with value: 0.429413282144301 and parameters: {'eta': 0.09876866734724224, 'max_depth': 15, 'subsample': 0.9645997677376954, 'colsample_bytree': 0.958866723807715, 'lambda': 0.00564492

Best Parameters: {'eta': 0.06213686026426219, 'max_depth': 1, 'subsample': 0.6009097688608647, 'colsample_bytree': 0.9705236380361846, 'lambda': 0.6830910014303269, 'alpha': 1.0951123996164474e-08}
[0]	train-mae:0.42567	valid-mae:0.43161
[10]	train-mae:0.42351	valid-mae:0.42957
[20]	train-mae:0.42263	valid-mae:0.42885
[30]	train-mae:0.42220	valid-mae:0.42852
[40]	train-mae:0.42198	valid-mae:0.42843
[50]	train-mae:0.42194	valid-mae:0.42850
[60]	train-mae:0.42197	valid-mae:0.42859
[70]	train-mae:0.42201	valid-mae:0.42869
[80]	train-mae:0.42205	valid-mae:0.42876
[90]	train-mae:0.42198	valid-mae:0.42876
[100]	train-mae:0.42214	valid-mae:0.42895
[110]	train-mae:0.42233	valid-mae:0.42918
[120]	train-mae:0.42232	valid-mae:0.42917
[130]	train-mae:0.42234	valid-mae:0.42919
[137]	train-mae:0.42224	valid-mae:0.42908


0.0693538

In [245]:
# Optimization history plot
opt_history_fig = vis.plot_optimization_history(study)
opt_history_fig.show()

In [None]:
# Parameter importance plot
param_importance_fig = vis.plot_param_importances(study)
param_importance_fig.show()

In [None]:
# Slice plot
slice_fig = vis.plot_slice(study)
slice_fig.show()

### Create test data

In [2]:
def read_test_data():
    sample = pd.read_csv('./data/sample_submission.csv')
    prop = pd.read_csv('./data/properties_2017.csv')
    sample['parcelid'] = sample['ParcelId']
    df_test = sample.merge(prop, on='parcelid', how = 'left')

    return df_test, sample

def create_clean_test(df_test, X_train):

    train_columns =  ['bedroomcnt', 'roomcnt', 'bathroomcnt', 'taxamount',
        'landtaxvaluedollarcnt', 'taxvaluedollarcnt',
        'structuretaxvaluedollarcnt', 'latitude', 'longitude',
        'lotsizesquarefeet', 'calculatedfinishedsquarefeet', 'yearbuilt',
        ]
    x_test = df_test[train_columns]

    # Set the transaction date dependent columns to constants
    x_test['LXXRNSA'] = 253.028
    x_test['month'] = "12"
    x_test['year'] = "2016"
    x_test['weekday'] = 4

    X_test_std = x_test.copy()

    # Scale-standardize features
    numeric_columns = ['bedroomcnt', 'roomcnt', 'bathroomcnt', 'taxamount',
        'landtaxvaluedollarcnt', 'taxvaluedollarcnt',
        'structuretaxvaluedollarcnt', 'latitude', 'longitude', 'LXXRNSA', 
            'lotsizesquarefeet', 'calculatedfinishedsquarefeet', 'yearbuilt',
            ]
    sc_x = StandardScaler()
    sc_x.fit(X_train[numeric_columns])

    # Transform the test data using the same fitted scaler
    X_test_std[numeric_columns] = sc_x.transform(X_test_std[numeric_columns])

    # For latitude and longitude, we can't mask, so fill NAs with zeros
    X_test_std.longitude = X_test_std.longitude.fillna(0)
    X_test_std.latitude = X_test_std.latitude.fillna(0)

    # Mask missing data in columns 
    mask_value = -999
    X_test_std = X_test_std.fillna(mask_value)

    return X_test_std

  prop = pd.read_csv('./data/properties_2017.csv')


In [117]:
df_test, sample = read_test_data()
X_test_std = create_clean_test(df_test, X_train)

# Generate neural network predictions for all parcel IDs
preds = model.predict({
        'bedroomcnt': X_test_std[['bedroomcnt']],
        'roomcnt': X_test_std[['roomcnt']],
        'bathroomcnt': X_test_std[['bathroomcnt']],
        'taxamount': X_test_std[['taxamount']],
        'landtaxvaluedollarcnt': X_test_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_test_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_test_std[['structuretaxvaluedollarcnt']],
        'latitude': X_test_std[['latitude']],
        'longitude': X_test_std[['longitude']],
        'lxxrnsa': X_test_std[['LXXRNSA']],
        'year': X_test_std[['year']],  
        'month': X_test_std[['month']],  
        'weekday': X_test_std[['weekday']],
        'lotsizesquarefeet': X_test_std[['lotsizesquarefeet']],
        'calculatedfinishedsquarefeet': X_test_std[['calculatedfinishedsquarefeet']],
        'yearbuilt': X_test_std[['yearbuilt']],
    })

[1m93289/93289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2894s[0m 31ms/step


In [58]:
for c in sample.columns[sample.columns != 'ParcelId']:
    sample[c] = ((preds[:,0]*y_std) + y_mean)


In [59]:
sample

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.005542,0.005542,0.005542,0.005542,0.005542,0.005542
1,10759547,0.005886,0.005886,0.005886,0.005886,0.005886,0.005886
2,10843547,0.563296,0.563296,0.563296,0.563296,0.563296,0.563296
3,10859147,0.016259,0.016259,0.016259,0.016259,0.016259,0.016259
4,10879947,0.006889,0.006889,0.006889,0.006889,0.006889,0.006889
...,...,...,...,...,...,...,...
2985212,168176230,0.010419,0.010419,0.010419,0.010419,0.010419,0.010419
2985213,14273630,0.011146,0.011146,0.011146,0.011146,0.011146,0.011146
2985214,168040630,0.009144,0.009144,0.009144,0.009144,0.009144,0.009144
2985215,168040830,0.007757,0.007757,0.007757,0.007757,0.007757,0.007757


In [60]:
# Save neural network predictions to csv
sample.to_csv('nn1.csv', index=False, float_format='%.4f')

In [None]:
# Create XGBoost predictions from neural network predictions
d_test_1 = xgb.DMatrix(preds)
xg_preds_1 = (clf.predict(d_test_1)*y_std) + y_mean

In [174]:
for c in sample.columns[sample.columns != 'ParcelId']:
    sample[c] = xg_preds_1

In [175]:
sample

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.009251,0.009251,0.009251,0.009251,0.009251,0.009251
1,10759547,0.009251,0.009251,0.009251,0.009251,0.009251,0.009251
2,10843547,0.073047,0.073047,0.073047,0.073047,0.073047,0.073047
3,10859147,0.021976,0.021976,0.021976,0.021976,0.021976,0.021976
4,10879947,0.009251,0.009251,0.009251,0.009251,0.009251,0.009251
...,...,...,...,...,...,...,...
2985212,168176230,0.014024,0.014024,0.014024,0.014024,0.014024,0.014024
2985213,14273630,0.014236,0.014236,0.014236,0.014236,0.014236,0.014236
2985214,168040630,0.013153,0.013153,0.013153,0.013153,0.013153,0.013153
2985215,168040830,0.009730,0.009730,0.009730,0.009730,0.009730,0.009730


In [176]:
# Save combination model outputs to csv
sample.to_csv('nn1+xg.csv', index=False, float_format='%.4f')
