In [1]:
# standard
import pandas as pd
import numpy as np
import random

# tf and keras
import tensorflow as tf
import keras
from keras import layers
from sklearn.model_selection import train_test_split


random.seed(42)

In [2]:
# This is reading the cleaned dataframe from the cleaning notebook
df = pd.read_csv('./data/nn.csv')
#pd.options.display.max_columns = None
df = df[~df.landtaxvaluedollarcnt.isnull()]
df = df[~df.taxamount.isnull()]
df = df[~df.regionidzip.isnull()]
df = df[~df.structuretaxvaluedollarcnt.isnull()]

In [3]:
X = df[['bedroomcnt','roomcnt','bathroomcnt','taxamount','landtaxvaluedollarcnt','taxvaluedollarcnt','structuretaxvaluedollarcnt',
        'latitude', 'longitude']]
Y = df.logerror

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=1234)
# Applying standardization to inputs
X_train_std = (X_train - X_train.mean())/X_train.std()
X_val_std = (X_val - X_train.mean())/X_train.std()
X_test_std = (X_test - X_train.mean())/X_train.std()

# Applying standardization to outputs
Y_train_std = (Y_train - Y_train.mean())/Y_train.std()
Y_val_std = (Y_val - Y_train.mean())/Y_train.std()
Y_test_std = (Y_test - Y_train.mean())/Y_train.std()

In [21]:

def build_model(lr):
    # Clear session and remove randomness.
    tf.keras.backend.clear_session()
    tf.random.set_seed(1234)
    np.random.seed(42)

    # Define input layers
    bedroomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='bedroomcnt')
    roomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='roomcnt')
    bathroomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='bathroomcnt')
    taxamount = layers.Input(shape=(1,), dtype=tf.float32, name='taxamount')
    landtaxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='landtaxvaluedollarcnt')
    taxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='taxvaluedollarcnt')
    structuretaxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='structuretaxvaluedollarcnt')
    latitude = layers.Input(shape=(1,), dtype=tf.float32, name='latitude')
    longitude = layers.Input(shape=(1,), dtype=tf.float32, name='longitude')

    resolution_in_degrees = 0.4

    # Create a list of numbers representing the bucket boundaries for latitude.
    latitude_boundaries = list(np.arange(-3, 3 + resolution_in_degrees, resolution_in_degrees))

    # Create a Discretization layer to separate the latitude data into buckets.
    latitude_discretized = tf.keras.layers.Discretization(
        bin_boundaries=latitude_boundaries,
        name='discretization_latitude')(latitude)

    # Create a list of numbers representing the bucket boundaries for longitude.
    longitude_boundaries = list(np.arange(-3, 3 + resolution_in_degrees, resolution_in_degrees))

    # Create a Discretization layer to separate the longitude data into buckets.
    longitude_discretized = tf.keras.layers.Discretization(
        bin_boundaries=longitude_boundaries,
        name='discretization_longitude')(longitude)

    # Cross the latitude and longitude features into a single one-hot vector.
    feature_cross = tf.keras.layers.HashedCrossing(
        num_bins=len(latitude_boundaries) * len(longitude_boundaries),
        output_mode='one_hot',
        name='cross_latitude_longitude')([latitude_discretized, longitude_discretized])

    # Concatenate features
    concatenated_features = layers.Concatenate()([
        bedroomcnt, 
        roomcnt, 
        bathroomcnt, 
        taxamount, 
        landtaxvaluedollarcnt, 
        taxvaluedollarcnt, 
        structuretaxvaluedollarcnt,
        feature_cross
    ])

    # Add hidden layers
    x = layers.Dense(units=400, kernel_initializer='normal', activation='relu')(concatenated_features)
    x = layers.Dropout(0.36)(x)
    x = layers.Dense(units=160, kernel_initializer='normal', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.6)(x)
    x = layers.Dense(1, kernel_initializer='normal')(x)

    # Output layer
    logerror = layers.Dense(
        units=1, activation='linear', name='logerror')(x)

    # Build the model
    model = tf.keras.Model(inputs=[
        bedroomcnt, 
        roomcnt, 
        bathroomcnt, 
        taxamount, 
        landtaxvaluedollarcnt, 
        taxvaluedollarcnt, 
        structuretaxvaluedollarcnt,
        latitude,
        longitude
    ], outputs=logerror)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mae',
        metrics=['mae']
    )

    return model


In [22]:
model = build_model(lr=0.001)
model.summary()
#display(tf.keras.utils.plot_model(model))
random.seed(42)
tf.random.set_seed(1234)

history = model.fit(
    x={
        'bedroomcnt': X_train_std[['bedroomcnt']],
        'roomcnt': X_train_std[['roomcnt']],
        'bathroomcnt': X_train_std[['bathroomcnt']],
        'taxamount': X_train_std[['taxamount']],
        'landtaxvaluedollarcnt': X_train_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_train_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_train_std[['structuretaxvaluedollarcnt']],
        'latitude': X_train_std[['latitude']],
        'longitude': X_train_std[['longitude']],
    },
    y=Y_train_std,
    epochs=5,
    batch_size=128,
    validation_data=(
        {
            'bedroomcnt': X_val_std[['bedroomcnt']],
            'roomcnt': X_val_std[['roomcnt']],
            'bathroomcnt': X_val_std[['bathroomcnt']],
            'taxamount': X_val_std[['taxamount']],
            'landtaxvaluedollarcnt': X_val_std[['landtaxvaluedollarcnt']],
            'taxvaluedollarcnt': X_val_std[['taxvaluedollarcnt']],
            'structuretaxvaluedollarcnt': X_val_std[['structuretaxvaluedollarcnt']],
            'latitude': X_val_std[['latitude']],
            'longitude': X_val_std[['longitude']],
        },
        Y_val_std
    )
)


#show_history(history)

Epoch 1/5
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - loss: 0.5090 - mae: 0.5090 - val_loss: 0.4212 - val_mae: 0.4212
Epoch 2/5
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.4238 - mae: 0.4238 - val_loss: 0.4209 - val_mae: 0.4209
Epoch 3/5
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 0.4229 - mae: 0.4229 - val_loss: 0.4208 - val_mae: 0.4208
Epoch 4/5
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - loss: 0.4227 - mae: 0.4227 - val_loss: 0.4207 - val_mae: 0.4207
Epoch 5/5
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - loss: 0.4224 - mae: 0.4224 - val_loss: 0.4204 - val_mae: 0.4204


In [23]:
avg_error = Y_val_std.mean()
#y_true = np.array(Y)
#y_pred = np.ones(len(Y_val_std)) * avg_error
y_pred_main = np.zeros(len(Y_val_std)) * avg_error

def get_loss(y_true=Y_val_std, y_pred=y_pred_main):
    return tf.keras.losses.MAE(
    y_true, y_pred).numpy()


val_preds = model.predict({
        'bedroomcnt': X_val_std[['bedroomcnt']],
        'roomcnt': X_val_std[['roomcnt']],
        'bathroomcnt': X_val_std[['bathroomcnt']],
        'taxamount': X_val_std[['taxamount']],
        'landtaxvaluedollarcnt': X_val_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_val_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_val_std[['structuretaxvaluedollarcnt']],
        'latitude': X_val_std[['latitude']],
        'longitude': X_val_std[['longitude']],
    })

val_preds= (val_preds[:,0]*Y_train.std()) + Y_train.mean()

print(get_loss(y_true=Y_val))

print(get_loss(y_pred=val_preds, y_true=Y_val))

[1m1046/1046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step
0.06898731205788679
0.06841155
