# Summary Statistics Approach

In [11]:
# Local path, change this.
yellow_follow = 'C:/Users/caspe/Desktop/yellow/lib/'

import sys; sys.path.append(yellow_follow) 
import sqlite3
import pandas as pd
import ml_utils
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [12]:
# Tensorflow
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Load data

In [13]:
# Load the training grid from its SQLite file into a DataFrame.
folder = "C:/Users/caspe/Desktop/Paper_2_StructuralVolume/"
in_path = folder + "grid_train.sqlite"

db_cnx = sqlite3.connect(in_path)
try:
    df = pd.read_sql_query("SELECT * FROM 'grid_train';", db_cnx)
finally:
    # read_sql_query returns a fully materialised DataFrame, so the connection
    # is no longer needed; closing it here also releases it if the query raises.
    db_cnx.close()

In [14]:
# Load the test grid from its SQLite file and pull out the target column.
in_path_test = folder + "grid_test.sqlite"
db_cnx_test = sqlite3.connect(in_path_test)
try:
    dft = pd.read_sql_query("SELECT * FROM 'grid_test';", db_cnx_test)
finally:
    # The query result lives in memory, so release the connection right away,
    # including when the query fails.
    db_cnx_test.close()
y_test = dft['volume']

In [15]:
# Named groups of feature columns. Every layer contributes the same four
# summary-statistic columns, always in the order mean, stdev, min, max.
_SUMMARY_STATS = ('mean', 'stdev', 'min', 'max')


def _stat_columns(*layers):
    """Return the '<layer>_<stat>' column names, layer by layer."""
    return [f"{layer}_{stat}" for layer in layers for stat in _SUMMARY_STATS]


s2 = _stat_columns('b04', 'b08', 'b04t', 'b08t')

bs_asc = _stat_columns('bs_asc')
bs_desc = _stat_columns('bs_desc')
coh_asc = _stat_columns('coh_asc')
coh_desc = _stat_columns('coh_desc')

nl = _stat_columns('nl')

In [None]:
# Presumably the municipalities held out for testing (inferred from the name).
# NOTE(review): no other cell visible in this notebook references this list —
# confirm whether it is still needed.
test_municipalities = ['Skive', 'Silkborg', 'Aarhus']

# Balance the datasets (Optional)

In [None]:
# Drop rows whose volume lies more than three robust standard deviations from
# the median, where both statistics are computed on the non-zero volumes only.
y_start = df['volume'].values
y_nonzero = y_start[y_start > 0]

median = np.median(y_nonzero)
abs_deviation = np.abs(y_nonzero - median)
# 1.4826 scales the median absolute deviation to a standard-deviation estimate.
madstd = 1.4826 * np.median(abs_deviation)

spread = 3 * madstd
lower_bound = median - spread
upper_bound = median + spread

within_upper = y_start <= upper_bound
within_lower = y_start >= lower_bound

# Number of rows rejected by each bound; computed for inspection only.
top_diff = len(y_start) - np.count_nonzero(within_upper)
low_diff = len(y_start) - np.count_nonzero(within_lower)

outlier_mask = within_lower & within_upper
df = df[outlier_mask]

In [None]:
# Zero mask: keep every tile with a non-zero volume, plus a random sample of
# zero-volume tiles whose size is 10% of the number of non-zero tiles.
# NOTE(review): the sample is drawn without a random_state, so the selected
# subset differs from run to run.
df_without = df[df['volume'] != 0]
ten_percent = df[df['volume'] == 0].sample(int(len(df_without) * 0.10))

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same frame (non-zero tiles first, then the sampled zero tiles).
df = pd.concat([df_without, ten_percent])

In [None]:
# Assign every row a quartile class from its volume. The class array is what
# the cross-validation cell passes to the stratified splitter.
volume_series = df['volume']
q25, q50, q75 = (
    int(volume_series.quantile(0.25)),
    int(volume_series.median()),
    int(volume_series.quantile(0.75)),
)
labels = [q25, q50, q75]  # bin edges for np.digitize, truncated to integers

y_class = np.digitize(volume_series, labels)
freq = ml_utils.count_freq(y_class)

# Show the class frequencies as the cell output.
freq

# Define target values

In [16]:
# Regression target: the 'volume' column of the training frame as a NumPy array.
# The commented-out lines below select alternative target columns instead.
y = df['volume'].values
# y = df['people'].values
# y = df['area'].values

# Define the neural network

In [17]:
# Define model
def define_model(shape, name):
    """Build the fully connected regression network (uncompiled).

    Args:
        shape: Input shape forwarded to ``Input``; callers in this notebook
            pass the number of feature columns.
        name: Accepted for call compatibility; it is not used by the function.

    Returns:
        A Keras ``Model`` mapping the input features to a single output unit
        with a ReLU activation.
    """
    mish = tfa.activations.mish

    def hidden(units, tensor):
        # Every hidden layer shares the mish activation and He-normal init.
        return Dense(units, activation=mish, kernel_initializer="he_normal")(tensor)

    inputs = Input(shape=shape, name="input")

    net = hidden(1024, inputs)
    net = Dropout(0.2)(net)
    net = BatchNormalization()(net)
    net = hidden(256, net)
    net = BatchNormalization()(net)
    net = hidden(64, net)
    net = hidden(16, net)

    outputs = Dense(1, activation='relu')(net)

    return Model(inputs=[inputs], outputs=outputs)

In [18]:
# Define Optimizer
def define_optimizer():
    """Create a fresh optimizer instance for compiling a model.

    Returns:
        A ``Lookahead`` wrapper around ``Adam`` whose learning rate follows a
        triangular cyclical schedule between 1e-5 and 1e-2.
    """
    # NOTE(review): step_size is presumably counted in optimizer steps rather
    # than epochs — confirm that 9 is the intended half-cycle length.
    cyclical_lr = tfa.optimizers.TriangularCyclicalLearningRate(
        initial_learning_rate=1e-5,
        maximal_learning_rate=1e-2,
        step_size=9,
        scale_mode='cycle',
        name='TriangularCyclicalLearningRate',
    )
    base_optimizer = Adam(learning_rate=cyclical_lr, name="Adam")
    return tfa.optimizers.Lookahead(base_optimizer)

# Start analysis

In [19]:
# Feature sets to evaluate: each active entry is one list of column names,
# built by concatenating the feature groups. Uncomment an entry to include it.
# analysis[0] is also the feature set used by the final training cell, and the
# `names` list in the score-report cell is indexed in this same order.
analysis = [
    # nl,
    # s2,
    # bs_asc,
    # bs_desc,
    # bs_asc + bs_desc,
    # bs_asc + coh_asc,
    # bs_desc + coh_desc,
    # bs_asc + coh_asc + s2,
    # bs_asc + coh_asc + bs_desc + coh_desc,
    bs_asc + coh_asc + bs_desc + coh_desc + s2,
    # bs_asc + coh_asc + bs_desc + coh_desc + s2 + nl,
]

# Testing all combinations (Optional)

In [None]:
# For every feature set in `analysis`, train a fresh model on three stratified
# random subsets of the training data and score each one on the external test
# set. `all_scores` collects one dict of per-split metric lists per feature set.
all_scores = []

for a in analysis:
    x = df[a].values
    X_test = dft[a].values

    # Scores for the kfolds
    scores = { "mean_absolute_error": [], "median_absolute_error": [], "absolute_percentage_error": [] }
    skf = StratifiedShuffleSplit(n_splits=3, test_size=0.1, train_size=0.5, random_state=42)

    # Only the training indices are used: evaluation happens on X_test/y_test,
    # so the splitter's own held-out indices are deliberately ignored.
    for train_index, _holdout_index in skf.split(x, y_class):
        # A new model is built for every split; without this, Keras keeps the
        # global state of all earlier models and memory grows with each one.
        tf.keras.backend.clear_session()

        X_train = x[train_index]
        y_train = y[train_index]

        shape = X_train.shape[1]
        model = define_model(shape, "input")

        # Compile and test model
        model.compile(
            optimizer=define_optimizer(),
            loss='mean_absolute_error',
            metrics=[
                "mean_absolute_error",
                ml_utils.median_error,
                ml_utils.abs_percentage,
            ])

        model.fit(
            x=X_train,
            y=y_train,
            epochs=100,
            verbose=1,
            batch_size=512,
            validation_split=0.1,
            callbacks=[
                # Stop once val_loss has not improved by at least 5.0 for nine
                # epochs, and roll back to the best weights seen.
                EarlyStopping(
                    monitor="val_loss",
                    patience=9,
                    min_delta=5.0,
                    restore_best_weights=True,
                ),
            ]
        )

        # Evaluate model; the values come back as loss followed by the three
        # compiled metrics, in the order they were passed to compile().
        loss, mean_absolute_error, median_absolute_error, absolute_percentage_error = model.evaluate(X_test, y_test, verbose=1)
        mean_absolute_error = round(mean_absolute_error, 5)
        median_absolute_error = round(median_absolute_error, 5)
        # Scaled by 100 so the stored value is a percentage.
        absolute_percentage_error = round(absolute_percentage_error * 100, 5)

        scores["mean_absolute_error"].append(mean_absolute_error)
        scores["median_absolute_error"].append(median_absolute_error)
        scores["absolute_percentage_error"].append(absolute_percentage_error)

    all_scores.append(scores)

In [None]:
# Visualise the scores
# Display names for the feature sets. They are looked up by position, so the
# active entries must match the active entries of `analysis`, in the same order.
names = [
    # 'nl',
    # 's2',
    # 'bs_asc',
    # 'bs_desc',
    # 'bs_asc + bs_desc',
    # 'bs_asc + coh_asc',
    # 'bs_desc + coh_desc',
    # 'bs_asc + coh_asc + s2',
    # 'bs_asc + coh_asc + bs_desc + coh_desc',
    'bs_asc + coh_asc + bs_desc + coh_desc + s2',
    # 'bs_asc + coh_asc + bs_desc + coh_desc + s2 + nl',
]

# Print the mean and standard deviation of each metric across the splits.
for i, score in enumerate(all_scores):
    mean_err_mean = np.mean(score['mean_absolute_error'])
    mean_err_std = np.std(score['mean_absolute_error'])

    median_err_mean = np.mean(score['median_absolute_error'])
    median_err_std = np.std(score['median_absolute_error'])

    percentage_err_mean = np.mean(score['absolute_percentage_error'])
    percentage_err_std = np.std(score['absolute_percentage_error'])

    # `pad` lives in ml_utils and is not imported by name, so it has to be
    # called as ml_utils.pad; the bare pad(...) calls raised NameError.
    print(f"Test: {names[i]}")
    print(f"Mean Absolute Error (MAE):          {ml_utils.pad(str(round(mean_err_mean, 3)), 3, 3)} ({ml_utils.pad(str(round(mean_err_std, 3)), 2, 3)} σ)")
    print(f"Median Absolute Error (MAE):        {ml_utils.pad(str(round(median_err_mean, 3)), 3, 3)} ({ml_utils.pad(str(round(median_err_std, 3)), 2, 3)} σ)")
    print(f"Absolute Percentage Error (MAPE):   {ml_utils.pad(str(round(percentage_err_mean, 3)), 3, 3)} ({ml_utils.pad(str(round(percentage_err_std, 3)), 2, 3)} σ)")
    print('')

Test: nl
Mean Absolute Error (MAE):          1937.726 (19.739 σ)
Median Absolute Error (MAE):        1152.227 (42.230 σ)
Absolute Percentage Error (MAPE):   575139033600.000 (326613017813.593 σ)

Test: s2
Mean Absolute Error (MAE):          1360.342 ( 8.920 σ)
Median Absolute Error (MAE):        769.170 (32.236 σ)
Absolute Percentage Error (MAPE):    55.539 ( 0.280 σ)

Test: bs_asc
Mean Absolute Error (MAE):          1750.128 (75.549 σ)
Median Absolute Error (MAE):        852.467 (25.647 σ)
Absolute Percentage Error (MAPE):    85.540 ( 6.676 σ)

Test: bs_asc + coh_asc
Mean Absolute Error (MAE):          1569.794 ( 1.177 σ)
Median Absolute Error (MAE):        844.630 (22.201 σ)
Absolute Percentage Error (MAPE):    77.257 ( 2.631 σ)

Test: bs_desc + coh_desc
Mean Absolute Error (MAE):          1618.776 (32.459 σ)
Median Absolute Error (MAE):        853.437 (40.073 σ)
Absolute Percentage Error (MAPE):    76.257 ( 2.651 σ)

Test: bs_asc + coh_asc + s2
Mean Absolute Error (MAE):          1215.719 ( 3.477 σ)
Median Absolute Error (MAE):        675.802 (29.869 σ)
Absolute Percentage Error (MAPE):    50.311 ( 0.709 σ)

Test: bs_asc + coh_asc + bs_desc + coh_desc
Mean Absolute Error (MAE):          1423.093 (44.561 σ)
Median Absolute Error (MAE):        753.970 (11.702 σ)
Absolute Percentage Error (MAPE):    65.604 ( 0.135 σ)

Test: bs_asc + coh_asc + bs_desc + coh_desc + s2
Mean Absolute Error (MAE):          1157.710 (10.546 σ)
Median Absolute Error (MAE):        607.587 (13.968 σ)
Absolute Percentage Error (MAPE):    48.079 ( 1.347 σ)

Test: bs_asc + coh_asc + bs_desc + coh_desc + s2
Mean Absolute Error (MAE):          1152.406 ( 4.064 σ)
Median Absolute Error (MAE):        597.365 (17.253 σ)
Absolute Percentage Error (MAPE):    47.521 ( 1.166 σ)

Test: bs_asc + coh_asc + bs_desc + coh_desc + s2 + nl
Mean Absolute Error (MAE):          1160.467 (11.254 σ)
Median Absolute Error (MAE):        635.434 (22.695 σ)
Absolute Percentage Error (MAPE):    47.410 ( 0.671 σ)

# Evaluate the model on the two training municipalities

In [20]:
# Train one model on the full training frame using the first feature set in
# `analysis`, then score it on the external test set.
feature_columns = analysis[0]
xf = df[feature_columns].values
X_test = dft[feature_columns].values

# TRAIN ON ALL DATA
shape = xf.shape[1]
model = define_model(shape, "input")

# Compile and test model
tracked_metrics = [
    "mean_absolute_error",
    ml_utils.median_error,
    ml_utils.abs_percentage,
]
model.compile(
    optimizer=define_optimizer(),
    loss='mean_absolute_error',
    metrics=tracked_metrics,
)

# Stop once val_loss has not improved by at least 5.0 for nine epochs and
# restore the best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=9,
    min_delta=5.0,
    restore_best_weights=True,
)

model.fit(
    x=xf,
    y=y,
    epochs=100,
    verbose=1,
    batch_size=1024,
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Evaluate model: loss first, then the three metrics in compile() order.
loss, mean_absolute_error, median_absolute_error, absolute_percentage_error = model.evaluate(X_test, y_test, verbose=1)
mean_absolute_error = round(mean_absolute_error, 5)
median_absolute_error = round(median_absolute_error, 5)
absolute_percentage_error = round(absolute_percentage_error * 100, 5)  # as a percentage

# One report line per metric; the labels carry their own column padding.
for _label, _value in (
    ("Mean Absolute Error (MAE):          ", mean_absolute_error),
    ("Median Absolute Error (MAE):        ", median_absolute_error),
    ("Absolute Percentage Error (MAPE):   ", absolute_percentage_error),
):
    print(f"{_label}{ml_utils.pad(str(round(_value, 3)), 3, 3)}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Mean Absolute Error (MAE):          1346.121
Median Absolute Error (MAE):        290.775
Absolute Percentage Error (MAPE):     3.564


# Output the model predictions to a new sqlite file.

In [None]:
from sqlalchemy import create_engine

# Add the model's predictions for the test features as a new column on the
# test frame, then write the whole frame to a new SQLite table.
pred = model.predict(X_test)
dft['pred_vol'] = pred

engine = create_engine('sqlite:///C:/Users/caspe/Desktop/Paper_2_StructuralVolume/grid_test_pred_s1_s2.sqlite', echo=True)

# if_exists='fail' makes to_sql raise when the table already exists (for
# example when this cell is re-run). The context manager closes the connection
# in that case too, instead of leaving it open.
with engine.connect() as sqlite_connection:
    dft.to_sql('grid_test_pred_s1_s2', sqlite_connection, if_exists='fail')

S1 + S2
Mean Absolute Error (MAE):          1342.747
Median Absolute Error (MAE):        287.852
Absolute Percentage Error (MAPE):     3.392

S2
Mean Absolute Error (MAE):          1676.217
Median Absolute Error (MAE):        372.632
Absolute Percentage Error (MAPE):     4.936

S2 - No textures
Mean Absolute Error (MAE):          2350.090
Median Absolute Error (MAE):        604.787
Absolute Percentage Error (MAPE):     9.112

S1 (COH)
Mean Absolute Error (MAE):          2024.537
Median Absolute Error (MAE):        525.089
Absolute Percentage Error (MAPE):     6.573

S1
Mean Absolute Error (MAE):          2177.647
Median Absolute Error (MAE):        589.839
Absolute Percentage Error (MAPE):     7.302

ASC (COH)
Mean Absolute Error (MAE):          2376.465
Median Absolute Error (MAE):        642.327
Absolute Percentage Error (MAPE):     8.725

ASC
Mean Absolute Error (MAE):          2588.544
Median Absolute Error (MAE):        727.163
Absolute Percentage Error (MAPE):     9.234

DESC (COH)
Mean Absolute Error (MAE):          2459.001
Median Absolute Error (MAE):        677.623
Absolute Percentage Error (MAPE):     8.805

DESC
Mean Absolute Error (MAE):          2639.927
Median Absolute Error (MAE):        783.926
Absolute Percentage Error (MAPE):     9.956