In [1]:
# Identifier of the model under evaluation; used to build input/output paths
# and stored in the `model` column of the metrics table.
model_name = "CVAE_mse"

In [2]:
!pip install scipy==1.11.1



In [3]:
from keras.layers import Input, Dense, Conv2D, Flatten, Reshape, Conv2DTranspose
from keras.models import Model
from keras.datasets import mnist
from keras.utils import to_categorical
from keras import backend as K
from keras.callbacks import Callback
import tensorflow as tf

from scipy.stats import entropy
from scipy.linalg import sqrtm
from scipy.stats import pearsonr

from keras.applications.inception_v3 import InceptionV3

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.models import load_model

import cv2

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import entropy
from scipy.spatial.distance import cosine

import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3
from scipy.linalg import sqrtm

In [4]:
import random

seed = 42

# Python's built-in random library
random.seed(seed)

# scikit-learn has no global seed of its own. The previous bare
# `check_random_state(seed)` call only built a RandomState object and threw it
# away, so it seeded nothing and has been removed. When an estimator is given
# `random_state=None`, scikit-learn draws from NumPy's global RNG, which is
# seeded by `np.random.seed` below.
from sklearn.utils import check_random_state  # import kept so the name stays available to other cells

# NumPy and TensorFlow global generators
np.random.seed(seed)
tf.random.set_seed(seed)
# Keras convenience call; invoked last, as in the original cell
tf.keras.utils.set_random_seed(seed)

In [5]:
# Height, width and channel count of a single input image
img_rows, img_cols = 168, 168
channels = 1

# Input image dimensions as (rows, cols, channels)
img_shape = (img_rows, img_cols, channels)

In [6]:
# Set path for data source
# Colab-only setup: mount Google Drive at /content/gdrive, then make the
# dataset folder the working directory so that the relative paths used by the
# following cells (e.g. "./preprocessed_data_168x168_2years") resolve inside it.
import os
from google.colab import drive
drive.mount('/content/gdrive')
# Must run after the mount above: the target folder lives on the mounted drive.
os.chdir("/content/gdrive/My Drive/Load Diffusion/dataset/")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
# Folder (relative to the current working directory) that holds the saved files
load_path = "./preprocessed_data_168x168_2years"


def _in_load_dir(file_name):
    """Return the path of `file_name` inside `load_path`."""
    return os.path.join(load_path, file_name)


# Saved NumPy arrays
X_train = np.load(_in_load_dir('X_train.npy'))
X_test = np.load(_in_load_dir('X_test.npy'))
y_train = np.load(_in_load_dir('y_train.npy'))
y_test = np.load(_in_load_dir('y_test.npy'))

# Saved pandas DataFrames.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
train_data = pd.read_pickle(_in_load_dir('train_data.pkl'))
test_data = pd.read_pickle(_in_load_dir('test_data.pkl'))
metadata = pd.read_pickle(_in_load_dir('metadata.pkl'))
metadata_original = pd.read_pickle(_in_load_dir('metadata_original.pkl'))

In [8]:
def merge_dummified_columns(df, dummified_columns, new_column_name):
    """
    Merge one-hot ("dummified") columns into a single label column.

    For every row, the new column holds the name of the dummified column with
    the largest value; the dummified columns themselves are removed. The input
    DataFrame is NOT modified: a new DataFrame is returned, so the function is
    safe to call on slices and safe to re-run.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the dummified columns.
        dummified_columns (list): Names of the dummified columns to merge.
        new_column_name (str): The name of the label column to create.

    Returns:
        pd.DataFrame: A new DataFrame without the dummified columns and with
        `new_column_name` holding the merged label.

    Raises:
        KeyError: If any name in `dummified_columns` is not a column of `df`.

    Note:
        `idxmax` reports the first column holding the row maximum, so a row
        whose dummified values are all equal (e.g. all 0) is labelled with the
        first name in `dummified_columns`.
    """
    # Accept any iterable of names (tuple, Index, ...) as a plain list of labels
    dummified_columns = list(dummified_columns)

    # Name of the column holding the maximum (the "1") in each row
    labels = df[dummified_columns].idxmax(axis=1)

    # `drop` without inplace returns a new frame, leaving the caller's data intact
    merged = df.drop(columns=dummified_columns)
    merged[new_column_name] = labels

    return merged

# One-hot column groups that are folded back into a single categorical column each
meter_type_columns = ['chilledwater', 'electricity', 'gas', 'hotwater', 'steam']
building_type_columns = ['Education', 'Entertainment/public assembly', 'Lodging/residential', 'Office', 'Public services']

# Metadata rows for the test series (selected by the test column labels),
# with both one-hot groups merged in turn
test_metadata = (
    metadata_original.loc[test_data.columns]
    .pipe(merge_dummified_columns,
          dummified_columns=meter_type_columns,
          new_column_name='meter_type')
    .pipe(merge_dummified_columns,
          dummified_columns=building_type_columns,
          new_column_name='building_type')
)

test_metadata


Unnamed: 0,lat,lng,year,meter_type,building_type
electricity_Lamb_office_Caitlin_2016,51.497838,-3.186246,2016,electricity,Office
electricity_Rat_public_Kelle_2016,38.903504,-77.005349,2016,electricity,Public services
electricity_Rat_assembly_Ezequiel_2016,38.903504,-77.005349,2016,electricity,Entertainment/public assembly
electricity_Bear_assembly_Roxy_2016,37.871903,-122.260729,2016,electricity,Entertainment/public assembly
gas_Shrew_office_Rose_2017,51.499840,-0.124663,2017,gas,Office
...,...,...,...,...,...
steam_Moose_education_Omar_2016,45.421500,-75.697200,2016,steam,Education
electricity_Rat_public_Chrissy_2017,38.903504,-77.005349,2017,electricity,Public services
electricity_Hog_office_Merilyn_2016,44.978782,-93.255398,2016,electricity,Office
steam_Bull_assembly_Amalia_2016,30.267200,-97.743100,2016,steam,Entertainment/public assembly


In [9]:
def calculate_rmse(true_data, predicted_data):
    """Return the root mean squared error between the true and predicted values."""
    mse = mean_squared_error(y_true=true_data, y_pred=predicted_data)
    return np.sqrt(mse)

def calculate_mae(true_data, predicted_data):
    """Return the mean absolute error between the true and predicted values."""
    mae = mean_absolute_error(y_true=true_data, y_pred=predicted_data)
    return mae

def calculate_r2(true_data, predicted_data):
    """Return the R^2 score of the predicted values against the true values."""
    r2 = r2_score(y_true=true_data, y_pred=predicted_data)
    return r2

def calculate_kl_divergence(true_data, predicted_data):
    """Return the KL divergence of `true_data` from `predicted_data`.

    A small constant is added to both inputs before they are handed to
    `scipy.stats.entropy`, which keeps exact zeros out of the computation.
    Both arguments must support `+` with a float (e.g. NumPy arrays).
    """
    epsilon = 0.00000001
    shifted_true = true_data + epsilon
    shifted_predicted = predicted_data + epsilon
    return entropy(pk=shifted_true, qk=shifted_predicted)

def calculate_cosine_similarity(true_data, predicted_data):
    """Return the cosine similarity, i.e. one minus SciPy's cosine distance."""
    cosine_distance = cosine(true_data, predicted_data)
    return 1 - cosine_distance

In [10]:
def evaluate_model(test_data, generated_series):
    """
    Compute per-column comparison metrics between true and generated series.

    Column `i` of `test_data` is compared with column `i` of
    `generated_series`; the number of compared columns is taken from
    `test_data`.

    Parameters:
        test_data: 2-D array-like (NumPy array or pandas DataFrame) with one
            series per column.
        generated_series: 2-D array-like of the same layout. The two inputs
            may be of different container types (e.g. array and DataFrame).

    Returns:
        dict: Maps each metric name ('rmse', 'mae', 'r2', 'kl_divergence',
        'cosine_similarity', 'pearson_correlation') to a list holding one
        value per column, in column order.
    """
    # Convert once to positional NumPy arrays. This replaces a bare `except:`
    # that was used to tell arrays from DataFrames, which hid unrelated errors
    # and failed when the two inputs were of different types.
    true_values = np.asarray(test_data)
    predicted_values = np.asarray(generated_series)

    metrics_result = {
        'rmse': [],
        'mae': [],
        'r2': [],
        'kl_divergence': [],
        'cosine_similarity': [],
        'pearson_correlation': []
    }

    for i in range(true_values.shape[1]):
        true_data = true_values[:, i]
        predicted_data = predicted_values[:, i]

        # Calculate Pearson correlation (the p-value is not used)
        pearson_corr, _ = pearsonr(true_data, predicted_data)

        metrics_result['rmse'].append(calculate_rmse(true_data, predicted_data))
        metrics_result['mae'].append(calculate_mae(true_data, predicted_data))
        metrics_result['r2'].append(calculate_r2(true_data, predicted_data))
        metrics_result['kl_divergence'].append(calculate_kl_divergence(true_data, predicted_data))
        metrics_result['cosine_similarity'].append(calculate_cosine_similarity(true_data, predicted_data))
        metrics_result['pearson_correlation'].append(pearson_corr)

    return metrics_result

# Function to calculate FID between two sets of images
def calculate_fid(model, images1, images2):
    """
    Compute the Frechet Inception Distance between two image sets.

    Parameters:
        model: Object exposing `predict(images)` that returns a 2-D array of
            feature activations, one row per image.
        images1: First image batch, passed to `model.predict`.
        images2: Second image batch, passed to `model.predict`.

    Returns:
        The FID score: squared distance between the activation means plus
        trace(S1 + S2 - 2 * sqrt(S1 . S2)) of the activation covariances.
    """
    # Feature activations of both sets
    features_a = model.predict(images1)
    features_b = model.predict(images2)

    # First- and second-order statistics (columns are features)
    mean_a = features_a.mean(axis=0)
    cov_a = np.cov(features_a, rowvar=False)
    mean_b = features_b.mean(axis=0)
    cov_b = np.cov(features_b, rowvar=False)

    # Squared Euclidean distance between the two mean vectors
    mean_term = np.sum((mean_a - mean_b)**2.0)

    # Matrix square root of the covariance product; keep only the real part
    # when the result comes back as a complex array
    cov_root = sqrtm(cov_a.dot(cov_b))
    if np.iscomplexobj(cov_root):
        cov_root = cov_root.real

    return mean_term + np.trace(cov_a + cov_b - 2.0 * cov_root)

In [11]:
# Specify the path where the datasets are saved
generation_path = "./generation/" + model_name + '_168x168'

# One single-row DataFrame of metrics is collected per generated file.
# This is a plain list; it is concatenated into one DataFrame afterwards.
df_metrics = []

# ---------------------------------------------------------------------------
# Loop-invariant setup (previously recomputed for every generated file)
# ---------------------------------------------------------------------------

# Min-max scale every test column once. A new name is used so that the
# `test_data` frame is no longer overwritten as a side effect of this cell.
test_data_norm = (test_data-test_data.min())/(test_data.max()-test_data.min())

# Mean over consecutive, non-overlapping windows of 24 rows
# (reported with the '_daily_avg' suffix; presumably rows are hourly - confirm)
test_data_resampled = test_data_norm.rolling(24).mean()[23::24]

# Prepare the Inception v3 model (built once, not once per file)
# NOTE(review): images are fed to the network without `preprocess_input`
# scaling - confirm that their value range is what the ImageNet weights expect.
model = InceptionV3(include_top=False, pooling='avg', weights="imagenet", input_shape=(99,99,3))

# Reference images: grayscale -> 3 channels, then resize to the 99x99 model input
X_test_rgb = tf.image.grayscale_to_rgb(tf.convert_to_tensor(X_test))
X_test_upscaled = tf.image.resize(X_test_rgb, [99, 99])


def _seed_from_file_name(file_name):
    """Parse the trailing integer seed, e.g. 'generated_images_seed_7.npy' -> 7."""
    return int(file_name.split('.')[0].split('_')[-1])


# `os.listdir` returns entries in arbitrary order; sort by seed so that the
# rows of the metrics table come out in a deterministic order.
generated_files = sorted(
    (item for item in os.listdir(generation_path) if item.startswith('generated_images_seed')),
    key=_seed_from_file_name,
)

for file_name in generated_files:
    # Load the dataset
    print(file_name)
    generated_images = np.load(os.path.join(generation_path, file_name))
    images_seed = _seed_from_file_name(file_name)

    # Set the target shape
    target_shape = (generated_images.shape[0], 52, 168, 1)

    # Initialize an array to hold the resized images
    resized_images = np.zeros(target_shape)

    # Loop through each image to resize it
    for i in range(generated_images.shape[0]):
        resized_image = cv2.resize(generated_images[i], (168, 52))  # Note the dimensions are (width, height)
        resized_images[i] = np.expand_dims(resized_image, axis=-1)  # Add back the last dimension

    # One column per image, 52*168 = 8736 rows per column
    flattened_images = resized_images.reshape(generated_images.shape[0], -1).T

    # Requires exactly as many generated images as there are test columns
    generated_series = pd.DataFrame(flattened_images)
    generated_series.columns = test_data.columns

    # Same per-column min-max scaling and 24-row averaging as for the test data
    generated_series = (generated_series-generated_series.min())/(generated_series.max()-generated_series.min())
    generated_series_resampled = generated_series.rolling(24).mean()[23::24]

    evaluation_results = evaluate_model(test_data_norm, generated_series)
    evaluation_results_resampled = evaluate_model(test_data_resampled, generated_series_resampled)

    df_evaluation_results = pd.DataFrame(evaluation_results)
    df_evaluation_results_resampled = pd.DataFrame(evaluation_results_resampled)
    df_evaluation_results_resampled.columns = df_evaluation_results_resampled.columns+'_daily_avg'

    # Generated images: same colour conversion and resize as the reference set
    generated_images_rgb = tf.image.grayscale_to_rgb(tf.convert_to_tensor(generated_images))
    generated_images_upscaled = tf.image.resize(generated_images_rgb, [99, 99])

    # Calculate FID
    fid_score = calculate_fid(model, X_test_upscaled, generated_images_upscaled)
    print('FID: '+str(fid_score))

    # Average every metric over the columns and add the run identifiers
    metrics = pd.concat([df_evaluation_results.mean(), df_evaluation_results_resampled.mean()])
    metrics['fid_score'] = fid_score
    metrics['images_seed'] = images_seed
    metrics['model'] = model_name
    metrics = metrics.to_frame().T

    # Append the metrics to df_metrics
    df_metrics.append(metrics)

generated_images_seed_1.npy
FID: 943.8997217467477
generated_images_seed_2.npy
FID: 950.0023913314024
generated_images_seed_3.npy
FID: 954.7249892711368
generated_images_seed_4.npy
FID: 922.9765406772386
generated_images_seed_5.npy
FID: 946.2291509290717
generated_images_seed_6.npy
FID: 954.7388571774534
generated_images_seed_7.npy
FID: 950.9189169755365
generated_images_seed_8.npy
FID: 945.2703776499475
generated_images_seed_9.npy
FID: 933.0082830456026
generated_images_seed_10.npy
FID: 953.5179391045322
generated_images_seed_11.npy
FID: 938.8493044822949
generated_images_seed_12.npy
FID: 942.6499447530638
generated_images_seed_13.npy
FID: 956.804778857194
generated_images_seed_14.npy
FID: 944.6020794201584
generated_images_seed_15.npy
FID: 944.4963603968007
generated_images_seed_16.npy
FID: 941.7445229389566
generated_images_seed_17.npy
FID: 938.8715912171656
generated_images_seed_18.npy
FID: 937.0572584092405
generated_images_seed_19.npy
FID: 971.1960588751617
generated_images_seed_

In [12]:
# Collapse the per-seed rows collected by the evaluation loop into one table.
# The guard makes this cell safe to re-run: once `df_metrics` has become a
# DataFrame, passing it to `pd.concat` again would raise a TypeError.
if isinstance(df_metrics, list):
    df_metrics = pd.concat(df_metrics, ignore_index=True)
df_metrics

Unnamed: 0,rmse,mae,r2,kl_divergence,cosine_similarity,pearson_correlation,rmse_daily_avg,mae_daily_avg,r2_daily_avg,kl_divergence_daily_avg,cosine_similarity_daily_avg,pearson_correlation_daily_avg,fid_score,images_seed,model
0,0.258466,0.21079,-3.969388,0.455561,0.772334,0.351359,0.207696,0.175321,-69.883774,0.29105,0.840831,0.331587,943.899722,1.0,CVAE_mse
1,0.26031,0.213112,-3.169984,0.459104,0.769955,0.344622,0.208965,0.177203,-47.161156,0.292518,0.83893,0.323282,950.002391,2.0,CVAE_mse
2,0.259448,0.211934,-2.652959,0.455319,0.772245,0.35019,0.208889,0.176443,-38.684801,0.292948,0.838754,0.322743,954.724989,3.0,CVAE_mse
3,0.25636,0.209263,-2.616288,0.452215,0.773616,0.352346,0.205994,0.173661,-35.084782,0.28985,0.840365,0.328916,922.976541,4.0,CVAE_mse
4,0.257445,0.210197,-2.655894,0.449045,0.775444,0.362717,0.207305,0.175191,-37.910288,0.284989,0.843379,0.340809,946.229151,5.0,CVAE_mse
5,0.257744,0.210114,-2.771796,0.453282,0.773415,0.352731,0.207042,0.174645,-39.689465,0.28853,0.841722,0.332532,954.738857,6.0,CVAE_mse
6,0.258673,0.211351,-2.917075,0.456499,0.772843,0.35693,0.208143,0.176076,-41.189101,0.292665,0.840258,0.338983,950.918917,7.0,CVAE_mse
7,0.25938,0.211985,-3.201531,0.455643,0.772015,0.349538,0.209877,0.17743,-49.392402,0.292947,0.838624,0.319999,945.270378,8.0,CVAE_mse
8,0.258008,0.211309,-2.725111,0.453468,0.773472,0.348279,0.208841,0.176579,-40.014183,0.291423,0.839946,0.320422,933.008283,9.0,CVAE_mse
9,0.258851,0.211121,-3.351192,0.453145,0.772833,0.354481,0.20809,0.175601,-52.829711,0.287568,0.841677,0.330217,953.517939,10.0,CVAE_mse


In [13]:
# NOTE: this rebinds `generation_path`, which earlier pointed at the folder of
# generated images; re-run the evaluation cell's own assignment before
# re-running that cell.
generation_path = "./evaluations"

# Create the output folder on first use so that `to_csv` cannot fail on a
# missing directory.
os.makedirs(generation_path, exist_ok=True)
df_metrics.to_csv(os.path.join(generation_path, model_name+'_metrics.csv'),index=False)

In [14]:
# Show the path of the metrics CSV for this model
metrics_file_name = model_name + '_metrics.csv'
os.path.join(generation_path, metrics_file_name)

'./evaluations/CVAE_mse_metrics.csv'

In [23]:
# Summary statistics over all seeds for the headline metrics
summary_columns = ['fid_score','kl_divergence','pearson_correlation','rmse']
numeric_metrics = df_metrics.drop('model',axis=1).astype('float')
numeric_metrics.describe()[summary_columns]

Unnamed: 0,fid_score,kl_divergence,pearson_correlation,rmse
count,30.0,30.0,30.0,30.0
mean,946.762909,0.453632,0.352599,0.258347
std,10.138314,0.003815,0.007058,0.001666
min,922.976541,0.443798,0.340827,0.255237
25%,941.970878,0.452053,0.347123,0.257226
50%,945.941029,0.453526,0.352538,0.258306
75%,953.187086,0.455622,0.355891,0.259432
max,971.196059,0.460888,0.370658,0.261413
