In [1]:
import torch
import numpy as np
# Single seed shared by every global random number generator in this notebook.
seed = 37
# Seed NumPy's legacy global generator and PyTorch's default generator so
# that re-running the notebook from a fresh kernel draws the same numbers.
np.random.seed(seed)
torch.manual_seed(seed)

import sys
sys.path.append("../../../")
from pipeline.dataset.dataset_loader import DatasetLoader
import pandas as pd
from darts.models import (
    TFTModel,
)
from darts.utils.likelihood_models import QuantileRegression
from darts.dataprocessing.transformers import Scaler
import matplotlib.pyplot as plt
from darts.metrics import rho_risk
import matplotlib.pyplot as plt
import tsaug
import ray
from ray import tune
import os
from matplotlib.pyplot import figure

# Expose only GPU 0 to this process and set the shared-library search path to
# the conda environment's lib directory.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "LD_LIBRARY_PATH": "/home/jupyter-babakesistani/.conda/envs/thesis/lib/",
    }
)

# Opens a new 8x4 inch figure at 120 dpi (rendered as an empty figure in the
# cell output because nothing is drawn on it here).
figure(figsize=(8, 4), dpi=120)

# Load the three plan_* columns from the CSV; "Time" is presumably the
# timestamp column -- confirm against DatasetLoader's signature.
dl = DatasetLoader(
    "../../data/alibaba_hourly_cpu_gpu_mem.csv",
    "Time",
    ["plan_cpu", "plan_gpu", "plan_mem"],
    resample_freq="H",
    augment=False,
    shift_df_datetime="2022-01-04 16:06:00",
    fill_missing_dates=True,
)

# Split into training and validation series at the given timestamp.
train, val = dl.get_train_val_by_date("2022-02-28 23:59:00")

<Figure size 960x480 with 0 Axes>

In [3]:
def training_function(config):
    """Ray Tune trainable: fit one TFT model and report its mean rho-risk.

    Args:
        config: Trial configuration; must provide the integer keys
            "input_chunk_length" and "output_chunk_length".

    Uses the notebook-level ``train`` and ``val`` series. The model is fitted
    on ``train`` (with ``val`` as validation series), then asked to forecast
    ``output_chunk_length`` steps following the first ``input_chunk_length``
    points of ``val``. The rho-risk of that forecast is averaged over the
    trained quantiles and reported to Ray Tune under the name ``mean_loss``.
    """
    in_len = config["input_chunk_length"]
    out_len = config["output_chunk_length"]
    quantiles = [0.1, 0.5, 0.9]

    # Covariates generated by darts from the time index, for both the past
    # and the future part of each sample; they are scaled with `Scaler`.
    encoders = {
        'cyclic': {'past': ['dayofweek', 'hour'], 'future': ['dayofweek', 'hour']},
        "datetime_attribute": {"past": ['day', "month"], "future": ['day', "month"]},
        'transformer': Scaler(),
    }

    # Checkpoint saving and TensorBoard logging are intentionally left off
    # (save_checkpoints / log_tensorboard are not passed).
    tft = TFTModel(
        input_chunk_length=in_len,
        output_chunk_length=out_len,
        hidden_size=64,
        lstm_layers=4,
        num_attention_heads=8,
        dropout=0.1,
        n_epochs=100,
        optimizer_kwargs={'lr': 0.001},
        add_encoders=encoders,
        likelihood=QuantileRegression(quantiles=quantiles),
        work_dir=".",
        nr_epochs_val_period=1,
        random_state=42,
        force_reset=True,
        torch_device_str='cuda:0',
    )
    tft.fit(train, val_series=val)

    # Probabilistic forecast (100 samples) conditioned on the head of `val`.
    forecast = tft.predict(series=val[:in_len], n=out_len, num_samples=100)

    # Everything in `val` after the conditioning window is passed as the
    # reference series for the rho-risk metric.
    actual = val[in_len:]
    total_risk = sum((rho_risk(actual, forecast, rho=q) for q in quantiles), 0.0)
    mean_risk = total_risk / len(quantiles)

    tune.report(mean_loss=mean_risk)

In [4]:
# Upper bound on simultaneously running trials; the per-trial resource
# request below divides 8 CPUs and 1 GPU evenly by this number.
max_concurrent_trials = 8

analysis = tune.run(
    training_function,
    # Full grid over both chunk lengths. Other dimensions that were tried
    # and are currently disabled: hidden_size [25, 75], n_rnn_layers [2],
    # batch_size [512], dropout [0.1].
    config=dict(
        input_chunk_length=tune.grid_search([1, 2, 3, 6, 9, 12]),
        output_chunk_length=tune.grid_search([1, 2, 3, 6, 9, 12]),
    ),
    max_concurrent_trials=max_concurrent_trials,
    resources_per_trial=dict(
        cpu=8.0 / max_concurrent_trials,
        gpu=1.0 / max_concurrent_trials,
    ),
    local_dir="logs/tft_req_hparam",
    progress_reporter=tune.JupyterNotebookReporter(overwrite=True),
)

# Configuration of the trial with the smallest reported `mean_loss`.
print(
    "Best config: ",
    analysis.get_best_config(metric="mean_loss", mode="min"),
)

# Per-trial results as a DataFrame for further inspection.
df = analysis.results_df

Trial name,status,loc,input_chunk_length,output_chunk_length,loss,iter,total time (s),neg_mean_loss
training_function_96955_00000,TERMINATED,130.149.248.55:1001813,1,1,0.140928,1,732.783,-0.140928
training_function_96955_00001,TERMINATED,130.149.248.55:1001806,2,1,0.0890001,1,734.899,-0.0890001
training_function_96955_00002,TERMINATED,130.149.248.55:1001812,3,1,0.115122,1,738.248,-0.115122
training_function_96955_00003,TERMINATED,130.149.248.55:1001808,6,1,0.378776,1,754.305,-0.378776
training_function_96955_00004,TERMINATED,130.149.248.55:1001800,9,1,0.426664,1,773.48,-0.426664
training_function_96955_00005,TERMINATED,130.149.248.55:1001810,12,1,0.225236,1,791.085,-0.225236
training_function_96955_00006,TERMINATED,130.149.248.55:1001811,1,2,0.102632,1,733.758,-0.102632
training_function_96955_00007,TERMINATED,130.149.248.55:1001828,2,2,0.139382,1,733.663,-0.139382
training_function_96955_00008,TERMINATED,130.149.248.55:1001803,3,2,0.245859,1,999.469,-0.245859
training_function_96955_00009,TERMINATED,130.149.248.55:1001825,6,2,0.209386,1,1011.79,-0.209386


2022-04-07 22:21:22,871	INFO tune.py:636 -- Total run time: 4925.58 seconds (4925.37 seconds for the tuning loop).


Best config:  {'input_chunk_length': 9, 'output_chunk_length': 9}


