## Read sklearn and the yellow toolbox

In [1]:
yellow_follow = 'C:/Users/caspe/Desktop/yellow/lib'

import sys; sys.path.append(yellow_follow) 
import sqlite3
import pandas as pd
import ml_utils
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

In [2]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

## Load datasets and scale them

In [3]:
# Local folder
folder = "C:/Users/caspe/Desktop/Paper_2_StructuralVolume/raw/"

in_path = folder + "buildings.sqlite"

db_cnx = sqlite3.connect(in_path)

df = pd.read_sql_query("SELECT * FROM 'buildings' WHERE buildings.area_vol_ratio >= 1 AND buildings.vol_sum > 1 ORDER BY RANDOM();", db_cnx)
df['pred_vol'] = np.nan

scaler = StandardScaler()
cols = ['area', 'perimeter', 'ipq']

In [4]:
# What is the mean building size?
df['vol_sum'].mean()

522.8423209885112

In [5]:
# What is the mean building size?
df['vol_sum'].median()

178.02704305648808

In [9]:
# How many are taken out for reevaluation?
df_total = pd.read_sql_query("SELECT * FROM 'buildings' WHERE buildings.area_vol_ratio < 1 OR buildings.vol_sum <= 1;", db_cnx)
print(f"Total out out sync: {len(df_total)}")
print(f"Total: {len(df)}")
print(f"Percent: {round((len(df_total) / len(df)) * 100, 2)}")
del df_total

Total out out sync: 125547
Total: 1674270
Percent: 7.5


In [10]:
# Ready the traning data
x = df_scaled = scaler.fit_transform(df[cols])
y = df[['vol_sum']].values

In [11]:
# Create a balance mask, to ensure the classes are balanced
labels = [50, 150, 300, 500, 800]
truth_labels = np.rot90(np.digitize(y, labels))[0]
freq = ml_utils.count_freq(truth_labels)
minority = freq.min(axis=0)[1]
balance_mask = ml_utils.minority_class_mask(truth_labels, minority)

## Investigate the classes

In [12]:
freq

array([[     0, 402273],
       [     1, 383556],
       [     2, 203936],
       [     3, 212904],
       [     4, 237773],
       [     5, 233828]], dtype=int64)

In [13]:
X = x[balance_mask]
Y = y[balance_mask]
Y_class = truth_labels[balance_mask]

# Deep Learning step

In [14]:
# Define model
def define_model(shape, name):
    model_input = Input(shape=shape, name="input")
    model = Dense(128, activation=tfa.activations.mish, kernel_initializer="he_normal")(model_input)
    model = Dense(64, activation=tfa.activations.mish, kernel_initializer="he_normal")(model)
    model = Dense(16, activation=tfa.activations.mish, kernel_initializer="he_normal")(model)
    model = Dense(8, activation=tfa.activations.mish, kernel_initializer="he_normal")(model)

    predictions = Dense(1, activation="relu", dtype="float32")(model) # Relu because we know volume will always be positive

    return Model(inputs=[model_input], outputs=predictions)

In [15]:
# Define Optimizer
def define_optimizer():
    return tfa.optimizers.Lookahead(
        Adam(
            learning_rate=tfa.optimizers.TriangularCyclicalLearningRate(
                initial_learning_rate=1e-5,
                maximal_learning_rate=1e-2,
                step_size=9,
                scale_mode='cycle',
                name='TriangularCyclicalLearningRate',
            ),
            name="Adam",
        )
    )

In [16]:
# Scores for the kfolds
scores =  {
    "mean_absolute_error": [],
    "mean_absolute_percentage_error": [],
    "median_absolute_error": [],
    "median_absolute_percentage_error": [],
}

skf = StratifiedShuffleSplit(n_splits=5, test_size=0.1, train_size=0.5, random_state=42)

for train_index, test_index in skf.split(x, truth_labels):
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    shape = X_train.shape[1]
    model = define_model(shape, "input")

    # Compile and test model
    model.compile(
        optimizer=define_optimizer(),
        loss='mean_absolute_error',
        metrics=[
            "mean_absolute_error",
            "mean_absolute_percentage_error",
            ml_utils.median_absolute_error,
            ml_utils.median_absolute_percentage_error,
        ])

    model.fit(
        x=X_train,
        y=y_train,
        epochs=100,
        verbose=1,
        batch_size=1024,
        validation_split=0.2,
        callbacks=[
            EarlyStopping(
                monitor="val_loss",
                patience=9,
                min_delta=1.0,
                restore_best_weights=True,
            ),
        ]
    )

    # Evaluate model
    loss, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error, median_absolute_percentage_error = model.evaluate(X_test, y_test, verbose=1)

    scores["mean_absolute_error"].append(mean_absolute_error)
    scores["mean_absolute_percentage_error"].append(mean_absolute_percentage_error)
    scores["median_absolute_error"].append(median_absolute_error)
    scores["median_absolute_percentage_error"].append(median_absolute_percentage_error)

rror: 18.1589 - val_loss: 178.0352 - val_mean_absolute_error: 178.0352 - val_mean_absolute_percentage_error: 28.2013 - val_median_absolute_error: 35.3929 - val_median_absolute_percentage_error: 23.9182
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


In [21]:
# scores
mae_mean = np.array(scores['mean_absolute_error']).mean()
mae_std = np.array(scores['mean_absolute_error']).std()

mape_mean = np.array(scores['mean_absolute_percentage_error']).mean()
mape_std = np.array(scores['mean_absolute_percentage_error']).std()

meae_mean = np.array(scores['median_absolute_error']).mean()
meae_std = np.array(scores['median_absolute_error']).std()

meape_mean = np.array(scores['median_absolute_percentage_error']).mean()
meapee_std = np.array(scores['median_absolute_percentage_error']).std()


print(f"    Combined Score:")
print(f"    Mean Absolute Error (MAE):               {ml_utils.pad(str(round(mae_mean, 3)), 5, 3)} ({ml_utils.pad(str(round(mae_std, 3)), 5, 3)} stdev)")
print(f"    Mean Absolute Percentage Error (MAPE):   {ml_utils.pad(str(round(mape_mean, 3)), 5, 3)} ({ml_utils.pad(str(round(mape_std, 3)), 5, 3)} stdev)")
print(f"    Median Absolute Error (MeAE):            {ml_utils.pad(str(round(meae_mean, 3)), 5, 3)} ({ml_utils.pad(str(round(meae_std, 3)), 5, 3)} stdev)")
print(f"    Median Absolute Percentage Error (MAPE): {ml_utils.pad(str(round(meape_mean, 3)), 5, 3)} ({ml_utils.pad(str(round(meapee_std, 3)), 5, 3)} stdev)")
print("")

    Combined Score:
    Mean Absolute Error (MAE):                 128.722 (    1.369 stdev)
    Mean Absolute Percentage Error (MAPE):      25.092 (    0.368 stdev)
    Median Absolute Error (MeAE):               29.269 (    0.618 stdev)
    Median Absolute Percentage Error (MAPE):    17.772 (    0.111 stdev)



In [23]:
# Run model on whole set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y_class)

shape = X_train.shape[1]
model = define_model(shape, "input")

# Compile and test model
model.compile(
    optimizer=define_optimizer(),
    loss='mean_absolute_error',
    metrics=[
        "mean_absolute_error",
        "mean_absolute_percentage_error",
        ml_utils.median_absolute_error,
        ml_utils.median_absolute_percentage_error,
    ])

model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    verbose=1,
    batch_size=1024,
    validation_split=0.2,
    callbacks=[
        EarlyStopping(
            monitor="val_loss",
            patience=9,
            min_delta=1.0,
            restore_best_weights=True,
        ),
    ]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


<tensorflow.python.keras.callbacks.History at 0x1b136c46700>

In [25]:
# Evaluate model
loss, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error, median_absolute_percentage_error = model.evaluate(X_test, y_test, verbose=1)
print("Test accuracy:")

print(f"Mean Absolute Error (MAE):                 {ml_utils.pad(str(round(mean_absolute_error, 3)), 3, 3)}")
print(f"Mean Absolute Percentage Error (MAPE):     {ml_utils.pad(str(round(mean_absolute_percentage_error, 3)), 3, 3)}")
print(f"Median Absolute Error (MAE):               {ml_utils.pad(str(round(median_absolute_error, 3)), 3, 3)}")
print(f"Median Absolute Percentage Error (MeAPE):  {ml_utils.pad(str(round(median_absolute_percentage_error, 3)), 3, 3)}")

Test accuracy:
Mean Absolute Error (MAE):                 153.878
Mean Absolute Percentage Error (MAPE):      24.493
Median Absolute Error (MAE):                44.903
Median Absolute Percentage Error (MeAPE):   17.986


In [None]:
# Lets add it back to the original data and visually inspect the results
df = pd.read_sql_query("SELECT * FROM 'buildings' ORDER BY RANDOM();", db_cnx)

scaler = StandardScaler()
cols = ['area', 'perimeter', 'ipq']

x = df_scaled = scaler.fit_transform(df[cols])

pred = model.predict(x)

In [None]:
# Add the predictions back to the list
df['pred_vol'] = pred
df['abs_p_err'] = (abs(df['vol_sum'] - df['pred_vol'])) / df['vol_sum']

# The original volume if it works, otherwise the infered volume.
df['use_vol'] = df['vol_sum']
df.loc[(df['area_vol_ratio'] < 1) | (df['vol_sum'] < 1), 'use_vol'] = df['pred_vol']

In [None]:
# Set the error to -1 when it's an infered value
df.loc[(df['area_vol_ratio'] < 1) | (df['vol_sum'] < 1), 'abs_p_err'] = -1

In [None]:
from sqlalchemy import create_engine

engine = create_engine('sqlite:///C:/Users/caspe/Desktop/Paper_2_StructuralVolume/buildings/buildings_pred.sqlite', echo=True)
sqlite_connection = engine.connect()

df.to_sql('buildings_pred', sqlite_connection, if_exists='fail')
sqlite_connection.close()