In [1]:
import time
import os

import pandas as pd
import numpy as np
np.set_printoptions(precision=6, suppress=True)

from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras import *
tf.__version__

'2.3.0'

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

2 Physical GPUs, 2 Logical GPUs


In [3]:
from tensorflow.keras.metrics import Metric
class RSquare(Metric):
    """Compute R^2 score.
     This is also called as coefficient of determination.
     It tells how close are data to the fitted regression line.
     - Highest score can be 1.0 and it indicates that the predictors
       perfectly accounts for variation in the target.
     - Score 0.0 indicates that the predictors do not
       account for variation in the target.
     - It can also be negative if the model is worse.
     Usage:
     ```python
     actuals = tf.constant([1, 4, 3], dtype=tf.float32)
     preds = tf.constant([2, 4, 4], dtype=tf.float32)
     result = tf.keras.metrics.RSquare()
     result.update_state(actuals, preds)
     print('R^2 score is: ', r1.result().numpy()) # 0.57142866
    ```
    """

    def __init__(self, name='r_square', dtype=tf.float32):
        super(RSquare, self).__init__(name=name, dtype=dtype)
        self.squared_sum = self.add_weight("squared_sum", initializer="zeros")
        self.sum = self.add_weight("sum", initializer="zeros")
        self.res = self.add_weight("residual", initializer="zeros")
        self.count = self.add_weight("count", initializer="zeros")

    def update_state(self, y_true, y_pred):
        y_true = tf.convert_to_tensor(y_true, tf.float32)
        y_pred = tf.convert_to_tensor(y_pred, tf.float32)
        self.squared_sum.assign_add(tf.reduce_sum(y_true**2))
        self.sum.assign_add(tf.reduce_sum(y_true))
        self.res.assign_add(
            tf.reduce_sum(tf.square(tf.subtract(y_true, y_pred))))
        self.count.assign_add(tf.cast(tf.shape(y_true)[0], tf.float32))

    def result(self):
        mean = self.sum / self.count
        total = self.squared_sum - 2 * self.sum * mean + self.count * mean**2
        return 1 - (self.res / total)

    def reset_states(self):
        # The state of the metric will be reset at the start of each epoch.
        self.squared_sum.assign(0.0)
        self.sum.assign(0.0)
        self.res.assign(0.0)
        self.count.assign(0.0)

In [4]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
plt.rcParams['figure.figsize'] = ((8/2.54), (6/2.54))
plt.rcParams["font.family"] = "Arial"
plt.rcParams["mathtext.default"] = "rm"
plt.rcParams.update({'font.size': 11})
MARKER_SIZE = 15
cmap_m = ["#f4a6ad", "#f6957e", "#fccfa2", "#8de7be", "#86d6f2", "#24a9e4", "#b586e0", "#d7f293"]
cmap = ["#e94d5b", "#ef4d28", "#f9a54f", "#25b575", "#1bb1e7", "#1477a2", "#a662e5", "#c2f442"]

plt.rcParams['axes.spines.top'] = False
# plt.rcParams['axes.edgecolor'] = 
plt.rcParams['axes.linewidth'] = 1
plt.rcParams['lines.linewidth'] = 1.5
plt.rcParams['xtick.major.width'] = 1
plt.rcParams['xtick.minor.width'] = 1
plt.rcParams['ytick.major.width'] = 1
plt.rcParams['ytick.minor.width'] = 1

# Model test

## hyperparameters

In [6]:
LOSS_RATES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
SCREEN_SIZE = 1
DISP_STEPS = 100
TRAINING_EPOCHS = 500
BATCH_SIZE = 32
LEARNING_RATE = 0.001
DIRECTORY = '../data/'

In [7]:
class FFNN(Model):
    def __init__(self, out_len):
        super(FFNN, self).__init__()
        
        self.out_len = out_len
        self.dense1 = layers.Dense(64)
        self.bn1 = layers.BatchNormalization()
        self.dense2 = layers.Dense(64)
        self.out_layer = layers.Dense(self.out_len)
        
    def call(self, inp):
        output = self.dense1(inp)
        output = self.bn1(inp)
        output = self.dense2(inp)
        output = self.out_layer(inp)
        
        return output

In [8]:
ffnn_model = FFNN(5)
opt = tf.optimizers.Adam(learning_rate=LEARNING_RATE)

checkpoint_path = './checkpoints/FFNN_best_loss_0.30p/'
ckpt = tf.train.Checkpoint(ffnn_model=ffnn_model, opt=opt)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)
if ckpt_manager.checkpoints:
    ckpt.restore(ckpt_manager.checkpoints[-1])
    print ('Checkpoint ' + ckpt_manager.checkpoints[-1][-2:] +' restored.')

Checkpoint 33 restored.


## data loading

In [9]:
l = np.load('./data/tot_dataset_loss_0.30.npz')
MAXS = l['MAXS']
MINS = l['MINS']

In [10]:
TOM_DIRECTORY = '../data/tom/'
file_list = os.listdir(TOM_DIRECTORY)
dataset_list = ['tom/' + file for file in file_list if file.endswith('.csv') and file.startswith('Val')]
dataset_list.sort()
temp = dataset_list

PAP_DIRECTORY = '../data/pap/'
file_list = os.listdir(PAP_DIRECTORY)
dataset_list = ['pap/' + file for file in file_list if file.endswith('.csv') and file.startswith('Val')]
dataset_list.sort()

temp.extend(dataset_list)
dataset_list = temp

In [11]:
def index_selector(x, sr): #x: sorted count indices (argmax), sr: Screen size
    new_indices = []
    while len(x) > 0:
        new_indices.append(x[0])
        x = np.setdiff1d(x, np.arange(x[0] - sr, x[0] + sr), assume_unique=True)
    return np.array(new_indices)

In [12]:
for LOSS_RATE in LOSS_RATES:
    print('%.2f loss start.' % LOSS_RATE)
    for FILENAME in dataset_list:
        raw_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
        raw_df = raw_df[~(raw_df == 0).all(axis=1)].interpolate()
        datetime = pd.DatetimeIndex(raw_df.index)
        env_df = raw_df.values
        np.random.seed(3101)
        null_prob = 0.3
        mask = np.random.choice(2, env_df.size, p=[null_prob, 1-null_prob]).reshape(env_df.shape)
        np.random.seed(4564)
        raw_null_prob = LOSS_RATE
        raw_indices = np.random.choice(np.arange(int(env_df.shape[0]/48)-1),
                                       replace=False, size=int(env_df.shape[0]/48 * raw_null_prob))
        raw_indices = raw_indices*48
        raw_indices_ext = []
        for elem in raw_indices:
            for _ in range(elem, elem+48):
                raw_indices_ext.append(_)
        raw_indices = np.array(raw_indices_ext)
        raw_indices = np.unique(raw_indices[raw_indices < env_df.shape[0]])
        env_df = (env_df - MINS)/(MAXS - MINS)

        mask[raw_indices, :] = 0
        missing_df = np.ma.array(env_df, mask=1-mask, fill_value=-1)
        missing_df = missing_df.filled()

        recover_df = missing_df.copy()
        recover_df = pd.DataFrame(recover_df, index=raw_df.index, columns=raw_df.columns)
        recover_df.index = pd.DatetimeIndex(recover_df.index)
        
        filling_indices = np.array([True])
        prev_filling_indices = np.array([False])
        error = 0
        
        temp_current = recover_df[1:-1].values # current
        temp_prev = recover_df[:-2].values # previous
        temp_next = recover_df[2:].values # next
        temp_mask = mask[1:-1]
        
        test_input = np.concatenate([temp_current, temp_mask, temp_prev, temp_next], axis=1)
        
        pred_result = ffnn_model.predict(test_input)

        masking = test_input[..., 5:10]
        final_pred = np.ma.array(pred_result, mask=masking, fill_value=np.nan).filled()

        pred_df = pd.DataFrame(final_pred.reshape(-1, 5),
                               index = pd.DatetimeIndex(datetime[1:-1]),
                               columns=raw_df.columns)
        recover_df[recover_df == -1] = np.nan
        recover_df = recover_df.combine_first(pred_df)
        
        ((MAXS[:5]-MINS[:5])*recover_df + MINS[:5]).to_csv('./results/recovered_%s_%.2fp_FFNN.csv' % (FILENAME.split('/')[-1].split('.')[0], LOSS_RATE))
        print('%s done.' % FILENAME.split('/')[-1].split('.')[0])

0.10 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.20 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.30 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.40 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.50