## Import sklearn and the yellow toolbox

In [1]:
yellow_follow = 'C:/Users/caspe/Desktop/yellow/lib'
# Local path, change this.
import sys; sys.path.append(yellow_follow) 
import sqlite3
import pandas as pd
import ml_utils
import numpy as np
from math import floor
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Import TensorFlow

In [2]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_probability as tfp
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

## Load datasets and scale them

In [3]:
# Local folder holding the buildings database; adjust for your machine.
folder = "C:/Users/caspe/Desktop/Paper_2_StructuralVolume/buildings/"
in_path = f"{folder}buildings.sqlite"
db_cnx = sqlite3.connect(in_path)

# Only rows with area_vol_ratio >= 1 and vol_sum > 1 are loaded, and
# ORDER BY RANDOM() returns them in a shuffled order.
valid_rows_sql = "SELECT * FROM 'buildings' WHERE buildings.area_vol_ratio >= 1 AND buildings.vol_sum > 1 ORDER BY RANDOM();"
df = pd.read_sql_query(valid_rows_sql, db_cnx)

# Start with an all-NaN 'pred_vol' column.
df['pred_vol'] = np.nan

# Feature columns and the scaler used to standardise them.
cols = ['area', 'perimeter', 'ipq']
scaler = StandardScaler()

In [4]:
# Ready the training data: standardised features (x) and the volume target (y).
df_scaled = scaler.fit_transform(df[cols])
x = df_scaled
# Double brackets keep y two-dimensional, one column wide.
y = df.loc[:, ['vol_sum']].values

In [5]:
# Create a balance mask, to ensure the classes are balanced.
# `labels` holds the volume bin edges used to turn the target into classes.
labels = [50, 150, 300, 500, 800]

# y is a single-column 2-D array, so take that column to get one class per row.
truth_labels = np.digitize(y, labels)[:, 0]

# The notebook output shows freq as rows of [class, count]; the smallest count
# is handed to minority_class_mask (semantics of that helper live in ml_utils).
freq = ml_utils.count_freq(truth_labels)
minority = freq[:, 1].min()
balance_mask = ml_utils.minority_class_mask(truth_labels, minority)

## Investigate the classes

In [6]:
freq

array([[     0, 402273],
       [     1, 383556],
       [     2, 203936],
       [     3, 212904],
       [     4, 237773],
       [     5, 233828]], dtype=int64)

In [7]:
# Apply the same balance mask to the features, the targets and the class labels.
X, Y, Y_class = (values[balance_mask] for values in (x, y, truth_labels))

# Deep Learning step

In [8]:
# Define model
def define_model(shape, name):
    """Build the (uncompiled) fully connected regression network.

    The network stacks four hidden Dense layers of 100, 50, 20 and 5 units
    (mish activation, "he_normal" initialisation) followed by a single-unit
    ReLU output layer computed in float32.

    Args:
        shape: Forwarded to ``Input(shape=...)``.
        name: Currently unused; the input layer is always named "input".

    Returns:
        A Keras ``Model`` mapping the input layer to the one-unit prediction.
    """
    inputs = Input(shape=shape, name="input")

    hidden = inputs
    for units in (100, 50, 20, 5):
        hidden = Dense(
            units,
            activation=tfa.activations.mish,
            kernel_initializer="he_normal",
        )(hidden)

    outputs = Dense(1, activation="relu", dtype="float32")(hidden)

    return Model(inputs=[inputs], outputs=outputs)

In [9]:
# Define Optimizer
def define_optimizer():
    """Create a fresh optimizer: Adam wrapped in Lookahead.

    Adam's learning rate follows a triangular cyclical schedule between
    1e-4 and 1e-2.

    Returns:
        A new ``tfa.optimizers.Lookahead`` instance.
    """
    # NOTE(review): step_size=6 looks small if it counts optimizer steps
    # rather than epochs -- confirm against the tensorflow_addons docs.
    cyclical_lr = tfa.optimizers.TriangularCyclicalLearningRate(
        initial_learning_rate=1e-4,
        maximal_learning_rate=1e-2,
        step_size=6,
        scale_mode='cycle',
        name='TriangularCyclicalLearningRate',
    )
    adam = Adam(learning_rate=cyclical_lr, name="Adam")
    return tfa.optimizers.Lookahead(adam)

In [10]:
# Metrics for testing model accuracy
def median_error(y_actual, y_pred):
    """Return the 50th percentile of the absolute error |y_actual - y_pred|."""
    abs_residuals = tf.math.abs(y_actual - y_pred)
    return tfp.stats.percentile(abs_residuals, 50.0)

def abs_percentage(y_actual, y_pred):
    """Return the 50th percentile of the absolute error relative to y_actual.

    The result is a fraction, not a percentage; callers in this notebook
    multiply it by 100 before reporting.
    """
    abs_residuals = tf.abs(tf.subtract(y_actual, y_pred))
    # The small constant keeps the division defined when y_actual is zero.
    relative_error = tf.divide(abs_residuals, (y_actual + 1e-10))
    return tfp.stats.percentile(relative_error, 50.0)

# Printing visuals
def pad(s, dl, dr):
    """Align a number string so decimal points line up when printed in a column.

    Args:
        s: String form of a number, e.g. ``str(round(value, 3))``.
        dl: Minimum width of the part before the decimal point; shorter
            parts are left-padded with spaces.
        dr: Minimum width of the part after the decimal point; shorter
            parts are right-padded with zeros.

    Returns:
        The padded string. Parts already wider than requested are never
        truncated. A plain integer string (no decimal point) gets an
        all-zero fractional part. Any other string without a decimal point
        (such as "nan", "inf" or "1e+16") is right-justified to the full
        field width ``dl + 1 + dr`` instead of being given a bogus fraction.
    """
    whole, *rest = s.split('.')

    # Previously a missing '.' raised IndexError; handle it explicitly.
    if not rest and not whole.lstrip('+-').isdigit():
        return s.rjust(dl + 1 + dr)

    # Only the first fractional segment is used, as before.
    fraction = rest[0] if rest else ''

    return whole.rjust(dl) + '.' + fraction.ljust(dr, '0')

In [11]:
# Scores collected over five stratified shuffle splits (50% train / 10% test
# each, stratified on the volume class of every row).
scores = {
    metric: []
    for metric in ("mean_absolute_error", "median_absolute_error", "absolute_percentage_error")
}
skf = StratifiedShuffleSplit(n_splits=5, test_size=0.1, train_size=0.5, random_state=42)

for train_index, test_index in skf.split(x, truth_labels):
    X_train, y_train = x[train_index], y[train_index]
    X_test, y_test = x[test_index], y[test_index]

    # A fresh, untrained network and optimizer for every split.
    shape = X_train.shape[1]
    model = define_model(shape, "input")
    model.compile(
        optimizer=define_optimizer(),
        loss='mean_absolute_error',
        metrics=["mean_absolute_error", median_error, abs_percentage],
    )

    # Stop once val_loss has not improved by at least 1.0 for 12 epochs and
    # roll back to the best weights seen.
    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=12,
        min_delta=1.0,
        restore_best_weights=True,
    )
    model.fit(
        x=X_train,
        y=y_train,
        epochs=100,
        verbose=1,
        batch_size=1024,
        validation_split=0.2,
        callbacks=[early_stopping],
    )

    # evaluate() returns the loss followed by the metrics in compile order.
    loss, mean_absolute_error, median_absolute_error, absolute_percentage_error = model.evaluate(X_test, y_test, verbose=1)
    mean_absolute_error = round(mean_absolute_error, 5)
    median_absolute_error = round(median_absolute_error, 5)
    # abs_percentage yields a fraction; store it as a percentage.
    absolute_percentage_error = round(absolute_percentage_error * 100, 5)

    split_result = {
        "mean_absolute_error": mean_absolute_error,
        "median_absolute_error": median_absolute_error,
        "absolute_percentage_error": absolute_percentage_error,
    }
    for metric, value in split_result.items():
        scores[metric].append(value)

ean_absolute_error: 132.1161 - val_median_error: 30.7455 - val_abs_percentage: 0.2110
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [12]:
scores

{'mean_absolute_error': [127.90305, 130.25229, 127.31625, 129.29553, 132.1622],
 'median_absolute_error': [28.9619, 29.35582, 29.69861, 29.51954, 30.0962],
 'absolute_percentage_error': [17.62165,
  17.9152,
  17.96013,
  18.37344,
  17.94345]}

In [13]:
# Mean and standard deviation of every metric across the splits.
mean_err_mean = np.mean(scores['mean_absolute_error'])
mean_err_std = np.std(scores['mean_absolute_error'])

median_err_mean = np.mean(scores['median_absolute_error'])
median_err_std = np.std(scores['median_absolute_error'])

percentage_err_mean = np.mean(scores['absolute_percentage_error'])
percentage_err_std = np.std(scores['absolute_percentage_error'])

# Round to three decimals and align: 3 integer digits for the mean,
# 2 for the standard deviation.
mean_text = pad(str(round(mean_err_mean, 3)), 3, 3)
mean_std_text = pad(str(round(mean_err_std, 3)), 2, 3)
median_text = pad(str(round(median_err_mean, 3)), 3, 3)
median_std_text = pad(str(round(median_err_std, 3)), 2, 3)
percentage_text = pad(str(round(percentage_err_mean, 3)), 3, 3)
percentage_std_text = pad(str(round(percentage_err_std, 3)), 2, 3)

# NOTE(review): the second label reuses the abbreviation "MAE" although the
# value is the median absolute error; kept as-is to preserve the output.
print("Test accuracy:")
print(f"Mean Absolute Error (MAE):          {mean_text} ({mean_std_text} σ)")
print(f"Median Absolute Error (MAE):        {median_text} ({median_std_text} σ)")
print(f"Absolute Percentage Error (MAPE):   {percentage_text} ({percentage_std_text} σ)")

Test accuracy:
Mean Absolute Error (MAE):          129.386 ( 1.729 σ)
Median Absolute Error (MAE):         29.526 ( 0.375 σ)
Absolute Percentage Error (MAPE):    17.963 ( 0.240 σ)


In [14]:
# Train the final model on the class-balanced subset (X, Y), holding out a
# stratified 20% for testing.
# NOTE(review): no random_state is passed here, so this split differs per run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y_class)

shape = X_train.shape[1]
model = define_model(shape, "input")

# Same loss and metrics as the cross-validation runs.
model.compile(
    optimizer=define_optimizer(),
    loss='mean_absolute_error',
    metrics=["mean_absolute_error", median_error, abs_percentage],
)

# Stop once val_loss has not improved by at least 1.0 for 12 epochs and roll
# back to the best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=12,
    min_delta=1.0,
    restore_best_weights=True,
)

model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    verbose=1,
    batch_size=1024,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


<tensorflow.python.keras.callbacks.History at 0x1cbb66025b0>

In [15]:
# Evaluate the final model on the held-out test split; evaluate() returns the
# loss followed by the metrics in compile order.
loss, mean_absolute_error, median_absolute_error, absolute_percentage_error = model.evaluate(X_test, y_test, verbose=1)

# Round to three decimals and align on the decimal point.
mae_text = pad(str(round(mean_absolute_error, 3)), 3, 3)
median_text = pad(str(round(median_absolute_error, 3)), 3, 3)
# The percentage metric is a fraction, hence the factor 100.
percentage_text = pad(str(round(absolute_percentage_error * 100, 3)), 3, 3)

print("Test accuracy:")
print(f"Mean Absolute Error (MAE):        {mae_text}")
print(f"Median Absolute Error (MAE):      {median_text}")
print(f"Absolute Percentage Error (MAPE): {percentage_text}")

Test accuracy:
Mean Absolute Error (MAE):        153.482
Median Absolute Error (MAE):       45.542
Absolute Percentage Error (MAPE):  18.359


In [16]:
# Let's add it back to the original data and visually inspect the results.
# This reloads the complete, unfiltered table (including the rows that were
# excluded from training).
df = pd.read_sql_query("SELECT * FROM 'buildings' ORDER BY RANDOM();", db_cnx)

cols = ['area', 'perimeter', 'ipq']

# Reuse the scaler that was fitted on the training data. Fitting a new
# StandardScaler on the unfiltered table would standardise the features with
# a different mean/variance than the model was trained on, feeding it shifted
# inputs at prediction time.
x = df_scaled = scaler.transform(df[cols])

pred = model.predict(x)

In [17]:
# Attach the predictions and their absolute error relative to the measured volume.
df['pred_vol'] = pred
df['abs_p_err'] = (df['vol_sum'] - df['pred_vol']).abs().div(df['vol_sum'])

# Use the measured volume where it is usable, otherwise the inferred volume.
# NOTE(review): training kept vol_sum > 1 while this mask uses vol_sum < 1,
# so rows with vol_sum exactly 1 are neither trained on nor replaced.
needs_inferred = (df['area_vol_ratio'] < 1) | (df['vol_sum'] < 1)
df['use_vol'] = df['vol_sum'].where(~needs_inferred, df['pred_vol'])

In [18]:
# Rows whose volume was inferred have no meaningful error: store the sentinel -1.
inferred_rows = df['area_vol_ratio'].lt(1) | df['vol_sum'].lt(1)
df.loc[inferred_rows, 'abs_p_err'] = -1

In [19]:
from sqlalchemy import create_engine

# Write the table with predictions to a separate SQLite file.
engine = create_engine('sqlite:///C:/Users/caspe/Desktop/Paper_2_StructuralVolume/buildings/buildings_pred.sqlite', echo=True)

# The context manager closes the connection even when to_sql raises, which it
# does by design (if_exists='fail') when 'buildings_pred' already exists,
# e.g. on a second run of this cell.
with engine.connect() as sqlite_connection:
    df.to_sql('buildings_pred', sqlite_connection, if_exists='fail')

2021-01-25 12:25:48,865 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-01-25 12:25:48,866 INFO sqlalchemy.engine.base.Engine ()
2021-01-25 12:25:48,867 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-01-25 12:25:48,869 INFO sqlalchemy.engine.base.Engine ()
2021-01-25 12:25:48,873 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("buildings_pred")
2021-01-25 12:25:48,875 INFO sqlalchemy.engine.base.Engine ()
2021-01-25 12:25:48,877 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("buildings_pred")
2021-01-25 12:25:48,878 INFO sqlalchemy.engine.base.Engine ()
2021-01-25 12:25:48,880 INFO sqlalchemy.engine.base.Engine 
CREATE TABLE buildings_pred (
	"index" BIGINT, 
	ogc_fid BIGINT, 
	fid BIGINT, 
	hot_mean FLOAT, 
	vol_sum FLOAT, 
	area FLOAT, 
	perimeter FLOAT, 
	ipq FLOAT, 
	area_vol_ratio FLOAT, 
	pred_vol FLOAT, 
	abs_p_err FLOAT, 
	use_vol FLOAT
)


2021-01-25 12