In [12]:
import torch
import numpy as np
# Seed the global PyTorch and NumPy random number generators with the same
# value so that code in this kernel drawing from them is repeatable.
seed = 37
torch.manual_seed(seed)
np.random.seed(seed)

import sys
# Extend the module search path three directories up; presumably this is what
# makes the project-local `pipeline` package importable -- must stay before
# the import on the next line.
sys.path.append("../../../")
from pipeline.dataset.dataset_loader import DatasetLoader
import pandas as pd
from darts.models import (
    TFTModel,
)
from darts.utils.likelihood_models import QuantileRegression
from darts.dataprocessing.transformers import Scaler
import matplotlib.pyplot as plt
from darts.metrics import rho_risk
# NOTE(review): duplicate of the `matplotlib.pyplot` import two lines above;
# harmless, but it can be dropped when the import cell is tidied up.
import matplotlib.pyplot as plt
import tsaug
import ray
from ray import tune
import os
from matplotlib.pyplot import figure

# Restrict CUDA to device 0 for this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# NOTE(review): machine- and user-specific absolute path. Setting
# LD_LIBRARY_PATH here is normally too late for the running kernel and only
# reaches processes started afterwards -- confirm it is still needed and
# consider moving it to the environment / kernel spec.
os.environ["LD_LIBRARY_PATH"] = "/home/jupyter-babakesistani/.conda/envs/thesis/lib/"

# Default size and resolution for every figure created later in the notebook.
# (Calling `figure(figsize=..., dpi=...)` here only produced an empty figure
# as cell output and did not configure subsequent plots.)
plt.rcParams["figure.figsize"] = (8, 4)
plt.rcParams["figure.dpi"] = 120

# Load the hourly Alibaba usage CSV through the project's DatasetLoader.
# Presumably "Time" names the timestamp column and the list holds the value
# columns to keep -- confirm against DatasetLoader's signature.
dl = DatasetLoader(
    '../../data/alibaba_hourly_cpu_gpu_mem_usage.csv',
    "Time",
    [
        "cpu_usage",
        "gpu_wrk_util",
        "avg_gpu_wrk_mem",
    ],
    resample_freq="H",
    augment=False,
    shift_df_datetime="2022-01-02 00:00:00",
    fill_missing_dates=True,
)

# Split into training / validation parts at the given timestamp.
train, val = dl.get_train_val_by_date("2022-02-14 23:59:00")

<Figure size 960x480 with 0 Axes>

In [13]:
def training_function(config):
    """Train one probabilistic TFT model for a Ray Tune trial and report its score.

    The model is fitted on the notebook-level ``train`` series with ``val`` as
    the validation series. A single forecast of ``output_chunk_length`` steps
    is then sampled, conditioned on the first ``input_chunk_length`` points of
    ``val``, and scored with the rho-risk averaged over the trained quantiles.
    The score is sent to Tune as ``mean_loss`` (lower is better).

    Args:
        config: Trial configuration dict supplied by Ray Tune.
            Required keys:
                ``input_chunk_length`` -- look-back window of the model.
                ``output_chunk_length`` -- forecast horizon of the model.
            Optional keys (defaults reproduce the previously hard-coded values):
                ``hidden_size`` (64), ``lstm_layers`` (4),
                ``num_attention_heads`` (8), ``dropout`` (0.1),
                ``lr`` (0.001), ``n_epochs`` (100).

    Returns:
        None. The result is delivered through ``tune.report``.

    Raises:
        KeyError: if a required key is missing from ``config``.
    """
    quantiles = [0.1, 0.5, 0.9]
    input_chunk_length = config["input_chunk_length"]
    output_chunk_length = config["output_chunk_length"]

    model = TFTModel(
        input_chunk_length=input_chunk_length,
        output_chunk_length=output_chunk_length,
        # Architecture / optimisation settings can be overridden from the
        # search space; without an override the original constants are used.
        hidden_size=config.get("hidden_size", 64),
        lstm_layers=config.get("lstm_layers", 4),
        num_attention_heads=config.get("num_attention_heads", 8),
        optimizer_kwargs={'lr': config.get("lr", 0.001)},
        n_epochs=config.get("n_epochs", 100),
        dropout=config.get("dropout", 0.1),
        # Calendar features are generated by darts for both the past and the
        # future part of each sample and scaled with a darts Scaler.
        add_encoders={
            'cyclic': {'past': ['dayofweek', 'hour'], 'future': ['dayofweek', 'hour']},
            "datetime_attribute": {"past": ['day', "month"], "future": ['day', "month"]},
            'transformer': Scaler(),
        },
        likelihood=QuantileRegression(quantiles=quantiles),
        work_dir="logs/tft_hparam",
        nr_epochs_val_period=1,
        random_state=42,
        force_reset=True,
        torch_device_str='cuda:0',
    )

    model.fit(train, val_series=val)

    # Forecast the steps that directly follow the first input window of `val`.
    pred = model.predict(
        series=val[:input_chunk_length],
        n=output_chunk_length,
        num_samples=100,
    )

    # NOTE(review): `actual` runs to the end of `val`, i.e. past the forecast
    # horizon; this relies on rho_risk scoring only the time span shared with
    # `pred` -- confirm against the installed darts version.
    actual = val[input_chunk_length:]
    rr = sum(rho_risk(actual, pred, rho=rho) for rho in quantiles) / len(quantiles)

    tune.report(mean_loss=rr)

In [14]:
# Upper bound on the number of trials Ray Tune runs at the same time.
max_concurrent_trials = 8

# Each trial gets an equal share of 8 CPUs and 1 GPU, so a full batch of
# concurrent trials adds up to exactly that budget.
trial_resources = {
    'cpu': 8.0 / max_concurrent_trials,
    'gpu': 1.0 / max_concurrent_trials,
}

# Full grid over look-back window and forecast horizon (6 x 6 combinations).
chunk_lengths = (1, 2, 3, 6, 9, 12)
search_space = {
    "input_chunk_length": tune.grid_search(list(chunk_lengths)),
    "output_chunk_length": tune.grid_search(list(chunk_lengths)),
}

# NOTE(review): the results directory is named "lstm_hparam" although the
# trainable builds a TFTModel -- looks like a copy-paste leftover; confirm
# before renaming, since other cells or notebooks may read from this path.
analysis = tune.run(
    training_function,
    config=search_space,
    resources_per_trial=trial_resources,
    max_concurrent_trials=max_concurrent_trials,
    local_dir="logs/lstm_hparam",
    progress_reporter=tune.JupyterNotebookReporter(overwrite=True),
)

# Configuration of the trial with the lowest reported `mean_loss`.
best_config = analysis.get_best_config(metric="mean_loss", mode="min")
print("Best config: ", best_config)

# Per-trial results as a DataFrame for further inspection.
df = analysis.results_df

Trial name,status,loc,input_chunk_length,output_chunk_length,loss,iter,total time (s),neg_mean_loss
training_function_874ca_00000,TERMINATED,130.149.248.55:994517,1,1,0.132508,1,563.082,-0.132508
training_function_874ca_00001,TERMINATED,130.149.248.55:994520,2,1,0.128112,1,567.38,-0.128112
training_function_874ca_00002,TERMINATED,130.149.248.55:994518,3,1,0.207099,1,565.094,-0.207099
training_function_874ca_00003,TERMINATED,130.149.248.55:994515,6,1,0.0797991,1,583.976,-0.0797991
training_function_874ca_00004,TERMINATED,130.149.248.55:994519,9,1,0.399823,1,599.908,-0.399823
training_function_874ca_00005,TERMINATED,130.149.248.55:994516,12,1,0.455193,1,621.684,-0.455193
training_function_874ca_00006,TERMINATED,130.149.248.55:994521,1,2,0.165946,1,560.679,-0.165946
training_function_874ca_00007,TERMINATED,130.149.248.55:994514,2,2,0.200923,1,565.429,-0.200923
training_function_874ca_00008,TERMINATED,130.149.248.55:994513,3,2,0.215764,1,608.999,-0.215764
training_function_874ca_00009,TERMINATED,130.149.248.55:995219,6,2,0.243974,1,640.249,-0.243974


2022-04-07 19:42:13,090	INFO tune.py:636 -- Total run time: 3561.84 seconds (3561.70 seconds for the tuning loop).


Best config:  {'input_chunk_length': 6, 'output_chunk_length': 6}
