# **Default Setting**

* Python: 3.6.9
* ML Framework: tf-nightly-gpu 2.5.0-dev20201208
* CPU: AMD Ryzen 5 5600X 6-Core Processor
* GPU: GeForce RTX 3070 (8G) (CUDA 11.1)
* RAM: 32G
* Platform: linux 20.04 LTS

In [1]:
%env NOTEBOOKNAME try9
%env LOCAL_DATA_PATH data
%env SUBMISSION_PATH submission

env: NOTEBOOKNAME=try9
env: LOCAL_DATA_PATH=data
env: SUBMISSION_PATH=submission


In [20]:
import tensorflow as tf
import tensorflow_addons as tfa

import datetime
import glob
import os
import platform

import numpy as np
import pandas as pd

from collections import OrderedDict
from math import pi
from sklearn.model_selection import train_test_split

print(f"tf.__version__: {tf.__version__}")
print(f"tfa.__version__: {tfa.__version__}")

tf.__version__: 2.5.0-dev20201208
tfa.__version__: 0.11.2


In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# !python -V

In [5]:
# !cat /proc/cpuinfo

In [6]:
!nvidia-smi

Sat Dec 12 18:43:56 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Graphics Device     On   | 00000000:0A:00.0  On |                  N/A |
|  0%   40C    P8    24W / 220W |    370MiB /  7979MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
# !df -h

In [8]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       3.7Gi        23Gi       485Mi       3.8Gi        26Gi
Swap:         2.0Gi          0B       2.0Gi


In [9]:
# platform.platform()

In [10]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

# **Define Arguments**

In [11]:
args = OrderedDict({
    # Basic
    "SEED": 42,
    "TEST_SIZE": 0.2,
    "EPOCH": 10,
    "INIT_LR": 1e-3,
    
    # Dataframe
    "NUM_TEST_CSV": 81,
    
    # Dataset Pipeline
    "WINDOW_INP": 7 * 24 * 2, # 336, 7 days
    "WINDOW_TAR": 2 * 24 * 2, # 96, 2 days
    "WINDOW_SHIFT": 1 * 24 * 2, # 1 day
    "WINDOW_STRIDE": 1,
    "WINDOW_DR": True, # Drop remainders
    "AUTO": tf.data.experimental.AUTOTUNE,
    
    "GLOBAL_BATCH_SIZE": 128,
    
    # Model
    "NUM_UNITS": 128, # lstm units
    "NUM_FEATURES": 6, # dense featues
    "NUM_QUANTILE": 9,
    
})

args

OrderedDict([('SEED', 42),
             ('TEST_SIZE', 0.2),
             ('EPOCH', 10),
             ('INIT_LR', 0.001),
             ('NUM_TEST_CSV', 81),
             ('WINDOW_INP', 336),
             ('WINDOW_TAR', 96),
             ('WINDOW_SHIFT', 48),
             ('WINDOW_STRIDE', 1),
             ('WINDOW_DR', True),
             ('AUTO', -1),
             ('GLOBAL_BATCH_SIZE', 128),
             ('NUM_UNITS', 128),
             ('NUM_FEATURES', 6),
             ('NUM_QUANTILE', 9)])

# **Load Datasets**

## **Train / Validation**

In [50]:
df = pd.read_csv(os.path.join(os.environ["LOCAL_DATA_PATH"], "train", "train.csv"))

In [51]:
# df.head()

In [52]:
# df.describe()

In [78]:
tr_df, vl_df = train_test_split(
    df, 
    test_size = args["TEST_SIZE"], 
    random_state = args["SEED"])

tr_df.shape, vl_df.shape

((42048, 9), (10512, 9))

In [79]:
def preprocessing(df, is_training = True, day = 24 * 2):
    # Drop timestampes.
    try:
        df = df.drop(["Day"], axis = 1) # cannot use day features
    except:
        pass
    
    
    # Timestamp to sin/cos.
    for column in ["Hour", "Minute"]:
        df[f"_{column}"] = df[column].apply(lambda x: f"{x:0>2d}")
    
    df["Time"] = df[["_Hour", "_Minute"]].apply(lambda x: ":".join(x), axis = 1)
    
    timestamp = pd.to_datetime(df.pop("Time"), format = "%H:%M").map(datetime.datetime.timestamp)
    
    day_sin = tf.math.sin(tf.constant(timestamp.values, dtype = tf.float32) * (2. * pi / day))
    day_cos = tf.math.cos(tf.constant(timestamp.values, dtype = tf.float32) * (2. * pi / day))
    
    df.insert(loc = 0, column = "Day_sin", value = day_sin)
    df.insert(loc = 1, column = "Day_cos", value = day_cos)
    
    df = df.drop(["Hour", "Minute", "_Hour", "_Minute"], axis = 1)
    
    
    # Standarize.
    for column in df.columns:
        if is_training:
            args[f"{column}_MEAN"] = df[column].mean()
            args[f"{column}_STD"] = df[column].std()
        
        df[column] = (df[column] - args[f"{column}_MEAN"]) / args[f"{column}_STD"]
        
    return df

In [80]:
tr_df = preprocessing(tr_df)
vl_df = preprocessing(vl_df, is_training = False)

In [81]:
tr_df

Unnamed: 0,Day_sin,Day_cos,DHI,DNI,WS,RH,T,TARGET
36155,1.065658,-0.948410,-0.619163,-0.670927,-0.950662,0.712244,-1.400211,-0.690053
42664,1.289056,0.674420,-0.619163,-0.670927,-0.459068,0.013638,0.761006,-0.690053
35439,0.611017,1.284300,-0.619163,-0.670927,3.614135,1.019866,-0.516077,-0.690053
1647,0.611017,1.284300,-0.561089,-0.670927,1.647761,1.412661,-1.203737,-0.668124
6206,0.747367,-1.192170,0.958516,-0.459386,-0.037703,0.746222,-0.909025,0.011654
...,...,...,...,...,...,...,...,...
11284,-0.308149,-1.343083,-0.619163,-0.670927,-0.459068,-0.332493,0.269820,-0.690053
44732,-0.332989,1.361497,-0.619163,-0.670927,-0.880434,1.790506,0.761006,-0.690053
38158,1.458077,-0.086894,-0.619163,-0.670927,1.156167,-0.581672,-1.596686,-0.690053
860,-0.332989,1.361497,-0.619163,-0.670927,0.594346,0.693215,-0.909025,-0.690053


In [82]:
tr_df.describe()

Unnamed: 0,Day_sin,Day_cos,DHI,DNI,WS,RH,T,TARGET
count,42048.0,42048.0,42048.0,42048.0,42048.0,42048.0,42048.0,42048.0
mean,5.987681e-09,-9.435134e-09,-1.0645970000000001e-17,-1.1997840000000001e-17,1.203164e-16,2.484061e-16,-5.660955e-17,-1.422843e-16
std,0.9999999,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.411554,-1.379975,-0.6191629,-0.6709268,-1.723166,-2.229873,-2.775532,-0.6900532
25%,-0.89503,-0.9579576,-0.6191629,-0.6709268,-0.739979,-0.7769368,-0.8107883,-0.6900532
50%,-0.1591435,0.03684847,-0.6191629,-0.6709268,-0.1781579,0.03674375,-0.02489105,-0.6900532
75%,1.065658,0.9906173,0.2229109,0.6726452,0.5241185,0.7271943,0.7610062,0.5560395
max,1.46149,1.404771,4.491354,2.356398,6.704151,1.956776,2.529275,3.194405


In [83]:
vl_df.describe()

Unnamed: 0,Day_sin,Day_cos,DHI,DNI,WS,RH,T,TARGET
count,10512.0,10512.0,10512.0,10512.0,10512.0,10512.0,10512.0,10512.0
mean,-0.018077,-0.00077,0.018122,0.001328,0.008241,-0.003595,0.013041,0.013841
std,0.994606,1.004529,1.02772,0.998199,1.010278,0.995579,1.000124,1.01573
min,-1.411554,-1.379975,-0.619163,-0.670927,-1.652938,-2.211298,-2.579057,-0.690053
25%,-0.914538,-1.057862,-0.619163,-0.670927,-0.739979,-0.771953,-0.810788,-0.690053
50%,-0.184493,-0.000889,-0.619163,-0.670927,-0.178158,0.033572,-0.024891,-0.690053
75%,1.04785,1.066183,0.242269,0.655493,0.524119,0.701823,0.761006,0.570552
max,1.46149,1.404771,4.442959,2.350681,6.563695,1.956776,2.529275,3.200901


## **Test**

In [84]:
ts_filenames = [os.path.join(os.environ["LOCAL_DATA_PATH"], "test", f"{i}.csv") for i in range(args["NUM_TEST_CSV"])]
ts_dfs = [preprocessing(pd.read_csv(ts_filename), is_training = False) for ts_filename in ts_filenames]
ts_df = pd.concat(ts_dfs)

In [85]:
ts_df.head()

Unnamed: 0,Day_sin,Day_cos,DHI,DNI,WS,RH,T,TARGET
0,-1.411554,-0.000889,-0.619163,-0.670927,0.17298,-1.014336,-0.909025,-0.690053
1,0.09569,-1.379386,-0.619163,-0.670927,0.17298,-1.025662,-0.899202,-0.690053
2,1.45603,0.136017,-0.619163,-0.670927,0.17298,-1.022944,-0.889378,-0.690053
3,-0.914538,1.066183,-0.619163,-0.670927,0.17298,-1.033817,-0.879554,-0.690053
4,-0.308149,-1.343083,-0.619163,-0.670927,0.243208,-1.034723,-0.869731,-0.690053


In [86]:
ts_df.describe()

Unnamed: 0,Day_sin,Day_cos,DHI,DNI,WS,RH,T,TARGET
count,27216.0,27216.0,27216.0,27216.0,27216.0,27216.0,27216.0,27216.0
mean,-0.003615,-0.000154,-0.047397,0.046952,-0.127593,-0.471834,0.048424,0.020077
std,0.998949,1.000907,0.886646,1.023778,0.893017,0.868807,1.108719,1.017022
min,-1.411554,-1.379975,-0.619163,-0.670927,-1.652938,-2.351291,-2.451349,-0.690053
25%,-0.899907,-0.982934,-0.619163,-0.670927,-0.810207,-1.165768,-0.869731,-0.690053
50%,-0.171818,0.01798,-0.561089,-0.670927,-0.318613,-0.452099,-0.054362,-0.668126
75%,1.052302,1.009509,0.184195,0.775557,0.313436,0.184439,0.888715,0.577827
max,1.46149,1.404771,4.288095,2.413571,6.001874,1.620159,2.696278,3.182867


# **Make Dataset Pipelines**

In [87]:
@tf.function
def _split_window(features):
    return tf.split(features, [-1, 1], axis = 0)

In [88]:
tr_tensor = tf.constant(tr_df, dtype = tf.float32)
vl_tensor = tf.constant(vl_df, dtype = tf.float32)
ts_tensor = tf.constant(ts_df, dtype = tf.float32)

tr_dataset = tf.data.Dataset.from_tensor_slices(tr_tensor
#                     ).window(args["WINDOW_INP"] + args["WINDOW_TAR"], args["WINDOW_SHIFT"], args["WINDOW_STRIDE"], args["WINDOW_DR"]).flat_map(_flat_fn_tr
                    ).map(_split_window, num_parallel_calls = args["AUTO"]
                    ).batch(args["GLOBAL_BATCH_SIZE"]
                    ).cache(
                    ).prefetch(args["AUTO"])

vl_dataset = tf.data.Dataset.from_tensor_slices(vl_tensor
#                     ).window(args["WINDOW_INP"] + args["WINDOW_TAR"], args["WINDOW_SHIFT"], args["WINDOW_STRIDE"], args["WINDOW_DR"]).flat_map(_flat_fn_tr
                    ).map(_split_window, num_parallel_calls = args["AUTO"]
                    ).batch(args["GLOBAL_BATCH_SIZE"]
                    ).cache(
                    ).prefetch(args["AUTO"])

ts_dataset = tf.data.Dataset.from_tensor_slices(ts_tensor
#                     ).window(args["WINDOW_INP"], args["WINDOW_INP"], args["WINDOW_STRIDE"], args["WINDOW_DR"]).flat_map(_flat_fn_ts
                    ).map(_split_window, num_parallel_calls = args["AUTO"]
                    ).batch(args["GLOBAL_BATCH_SIZE"]
#                     ).cache(
                    ).prefetch(args["AUTO"])

# Print the shapes
print(f"tr_dataset: {tr_dataset.element_spec}")
print(f"vl_dataset: {vl_dataset.element_spec}")
print(f"ts_dataset: {ts_dataset.element_spec}")

tr_dataset: (TensorSpec(shape=(None, 7), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))
vl_dataset: (TensorSpec(shape=(None, 7), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))
ts_dataset: (TensorSpec(shape=(None, 7), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None))


## **Take Samples**

In [89]:
for element in tr_dataset.take(1):
    foo, bar = element
    print(foo.shape, bar.shape)

(128, 7) (128, 1)


In [90]:
for element in vl_dataset.take(1):
    foo, bar = element
    print(foo.shape, bar.shape)

(128, 7) (128, 1)


In [91]:
for element in ts_dataset.take(1):
    foo, bar = element
    print(foo.shape, bar.shape)

(128, 7) (128, 1)


# **Create Network**

In [92]:
class TARGET_MODEL(tf.keras.Model):
    def __init__(self, units, name):
        super(TARGET_MODEL, self).__init__(name = name)
        self.units = units
        
        self.dense1 = tf.keras.layers.Dense(units = self.units, activation = "relu")
        self.dense2 = tf.keras.layers.Dense(units = self.units, activation = "relu")
        self.dense3 = tf.keras.layers.Dense(units = 1)
        
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        
        return x        

# **Compile and Fit**

   * Repeat the last 24 hours.

In [93]:
model = TARGET_MODEL(
    units = 64,
    name = "model")

model.compile(
    loss = tf.keras.losses.MAE,
    optimizer = tf.keras.optimizers.Adam())

In [94]:
# TensorBoard callback.
log_dir = os.path.join(
    "logs", "fit", os.environ["NOTEBOOKNAME"], 
    datetime.datetime.now().strftime(f"{model.name}-%Y%m%d-%H%M%S"))
tb_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq = 1)

_ = model.fit(
    tr_dataset,
    validation_data = vl_dataset,
    epochs = 500,
    verbose = 0,
    callbacks = [tb_callback])

In [95]:
%tensorboard --logdir logs/fit --host jupyter-server

Reusing TensorBoard on port 6006 (pid 3556), started -1 day, 18:38:22 ago. (Use '!kill 3556' to kill it.)

# **Predict**

In [38]:
model.evaluate(ts_dataset, verbose = 1)



0.6481271386146545

In [57]:
preds = [np.array(model.predict(ts_dataset))[..., -1] for model in models]
preds[0].shape



(81, 96)

In [58]:
submission = pd.read_csv(os.path.join(os.environ["LOCAL_DATA_PATH"], "sample_submission.csv"), index_col = "id")
submission.head()

Unnamed: 0_level_0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
for column, pred in zip(submission.columns, preds):
    foo = np.reshape(pred, (-1,))
    foo = np.where(foo < 0, 0, foo) # clip by value
    
    submission[column] = foo

In [66]:
submission.head()

Unnamed: 0_level_0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.csv_Day7_0h00m,0.019628,0.009365,0.0,0.0,0.112591,7.637287,14.906515,17.131758,17.608368
0.csv_Day7_0h30m,0.008961,0.009944,0.0,0.0,0.124864,7.905676,15.397655,17.560627,17.959702
0.csv_Day7_1h00m,0.001013,0.008628,0.0,0.004815,0.133084,8.147173,15.50468,17.658821,18.036112
0.csv_Day7_1h30m,0.0,0.007088,0.0,0.01022,0.139264,8.29487,15.531036,17.690212,18.056557
0.csv_Day7_2h00m,0.0,0.005681,0.0,0.013636,0.14386,8.387443,15.53462,17.70113,18.060827


In [67]:
submission.to_csv(
    os.path.join(os.environ["SUBMISSION_PATH"], f"{os.environ['NOTEBOOKNAME']}_submission.csv"))