In [None]:
import os
import pandas as pd
import numpy as np
import json
import pickle
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

from models.import_plotting import *
from models.types import *
from train.utils import *
from train.predictors import *

# --- Model selection -------------------------------------------------------
# MODEL picks the dataset class and predictor class used by the cells below.
MODEL = "boot-setrepr-hydra"
# MODEL = "boot-nn"
USE_BASE_PARAM = True  # forwarded to the hydra dataset and the hydra predictor
APPLICATION_CATALOG = ApplicationCatalog.E  # forwarded to the hydra predictor
LOAD_WEIGHTS = False  # when True, the hydra predictor loads weights from LOAD_WEIGHTS_PATH

# --- Data ------------------------------------------------------------------
USE_GPU = True  # forwarded to get_datapoints, the hydra dataset and the hydra predictor
SCALING_STATS_NAME = "sts_server1_cpu.json"  # forwarded to the hydra dataset
BOOTSTRAP = True  # True reads "data.pkl", False reads "data.pkl.nb"

# --- Training --------------------------------------------------------------
NUM_EPOCHS = 10
LM_LOSS_WT = 1e-1  # weight of the LM loss term at epoch 0
LM_LOSS_WT_DECAY = 0.9  # NOTE(review): presumably the per-epoch decay of LM_LOSS_WT - confirm
SKIP_VAL = False  # when True, the periodic validation pass during training is skipped

# --- Outputs ---------------------------------------------------------------
SAVE_POSTFIX = MODEL  # suffix of the saved figure / JSON file names
SAVE_RES = True  # save weights and JSON results under SAVE_PATH
SAVE_RES_PF = ""  # extra suffix for the errors JSON file name
SAVE_FIGS = False  # save the training-progress figures under FIGS_PATH

# Training dataset folders; every folder is parsed, cached and loaded independently.
# The commas are essential: adjacent string literals without them are concatenated
# by Python into ONE (non-existent) path, leaving a single-element list.
DS_TRAIN = [
    f"{HOME_PATH}/ds_folder_1",
    f"{HOME_PATH}/ds_folder_2",
    f"{HOME_PATH}/ds_folder_3",
]
# When True, the raw-data parsing cells are skipped and previously written pickles are reused.
SKIP_DATA_PARSING = False

# Folder holding the validation runs; the parsed pickles are written to and read from it.
DS_VAL = f"{HOME_PATH}/folder_val"


# Output folder for the trained weights and the JSON result files (written when SAVE_RES is set).
SAVE_PATH = f"{HOME_PATH}/results/models_mp75p95/neuro_t"

# Folder the weights are loaded from when LOAD_WEIGHTS is set.
LOAD_WEIGHTS_PATH = f"{HOME_PATH}/server_2/results/models_mp75p95_adap2/neuro"

## Parse Raw Data


In [None]:
# Parse the raw validation runs once and cache both the runs and the derived
# datapoints as pickles inside DS_VAL, so later cells only need to reload them.
if not SKIP_DATA_PARSING:
    ds_runs = get_runs(DS_VAL)

    # `with` guarantees the pickle is flushed and closed before a later cell reads it back.
    with open(os.path.join(DS_VAL, "data_runs.pkl"), "wb") as pkl_file:
        pickle.dump(ds_runs, pkl_file)

    dps = get_datapoints(ds_runs, use_gpu=USE_GPU)

    with open(os.path.join(DS_VAL, "data.pkl"), "wb") as pkl_file:
        pickle.dump(dps, pkl_file)

    n_dps = len(dps)

    print(n_dps)

In [None]:
# Parse every training folder and cache its runs and datapoints as pickles next
# to the raw data. Prints the number of runs per folder and the running total
# of datapoints.
if not SKIP_DATA_PARSING:
    n_dps = 0
    for ds_train in DS_TRAIN:
        ds_runs_train = get_runs(ds_train)

        print(len(ds_runs_train))

        # `with` guarantees the pickle is flushed and closed before a later cell reads it back.
        with open(os.path.join(ds_train, "data_runs.pkl"), "wb") as pkl_file:
            pickle.dump(ds_runs_train, pkl_file)

        dps = get_datapoints(ds_runs_train, use_gpu=USE_GPU)

        with open(os.path.join(ds_train, "data.pkl"), "wb") as pkl_file:
            pickle.dump(dps, pkl_file)

        n_dps += len(dps)

        print(n_dps)

## Read all parsed data

In [None]:
# Reload the cached runs of every training folder (one list entry per folder).
# NOTE: pickle can execute arbitrary code - only load files written by the parsing cells.
ds_runs_train = []
for ds_train in DS_TRAIN:
    with open(os.path.join(ds_train, "data_runs.pkl"), "rb") as pkl_file:
        ds = pickle.load(pkl_file)
    print(len(ds))

    ds_runs_train.append(ds)

In [None]:
# Reload the cached validation runs; the file is closed as soon as it has been read.
# NOTE: pickle can execute arbitrary code - only load files written by the parsing cells.
with open(os.path.join(DS_VAL, "data_runs.pkl"), "rb") as pkl_file:
    ds_runs_val = pickle.load(pkl_file)
print(len(ds_runs_val))

## Create Datapoints


In [None]:
# Reload the cached datapoints of every training folder (one list entry per folder).
# BOOTSTRAP selects which pickle is read: "data.pkl" when set, "data.pkl.nb" otherwise.
# NOTE: pickle can execute arbitrary code - only load files produced by this pipeline.
datapoints_train = []
for ds_train in DS_TRAIN:
    dps_file = "data.pkl" if BOOTSTRAP else "data.pkl.nb"
    with open(os.path.join(ds_train, dps_file), "rb") as pkl_file:
        dps = pickle.load(pkl_file)
    datapoints_train.append(dps)
    # Report the number of datapoints in this folder (the previous code printed the
    # running folder counter 1, 2, 3, ... instead).
    print(len(dps))

In [None]:
# Reload the cached validation datapoints; the file is closed as soon as it has been read.
# NOTE: pickle can execute arbitrary code - only load files written by the parsing cells.
with open(os.path.join(DS_VAL, "data.pkl"), "rb") as pkl_file:
    datapoints_val = pickle.load(pkl_file)
print(len(datapoints_val))

## Transform


In [None]:
def get_ds(ds_folder, split=False, use_pairwise=False, bootstrap=False, flag=False):
    if not isinstance(ds_folder, list):
        ds_folder = [ds_folder]

    seed = 0
    th.manual_seed(seed)

    ds_list = []
    for folder in ds_folder:
        match MODEL:
            case "boot-setrepr":
                ds = CustomDatasetSetRepr(
                    path=os.path.join(folder, f"data.pkl.nb" if not bootstrap else "data.pkl")
                )
            case "boot-setrepr-hydra":
                ds = CustomDatasetSetReprHydra(
                    path=os.path.join(folder, f"data.pkl.nb" if not bootstrap else "data.pkl"),
                    use_base_param=USE_BASE_PARAM,
                    scaling_stats_name=SCALING_STATS_NAME,
                    use_gpu=USE_GPU,
                )
            case "boot-nn":
                ds = CustomDataset(path=os.path.join(folder, "data.pkl"))
            case _:
                print("Incorrect model name")

        ds_list.append(ds)

    ds = th.utils.data.ConcatDataset(ds_list)

    if flag:
        total_size = len(ds)
        split_size = int(0.4 * total_size)  # 30% of the dataset
        _, ds = random_split(ds, [total_size - split_size, split_size])

    if split:
        frac = 0.2
        train_ds, val_ds = random_split(ds, [len(ds) - int(frac * len(ds)), int(frac * len(ds))])
        train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=16, shuffle=True)

        return train_ds, train_dl, val_ds, val_dl
    else:
        return ds, DataLoader(ds, batch_size=16, shuffle=True)

In [None]:
# Alternative: carve the validation set out of the training folders instead of DS_VAL.
# train_ds, train_dl, val_ds, val_dl = get_ds(DS_TRAIN, split=True, use_pairwise=DS_USE_PAIRWISE)

# Training data: all training folders concatenated; `flag` follows BOOTSTRAP, so the
# random sub-sampling inside get_ds is only active when bootstrapping.
train_ds, train_dl = get_ds(DS_TRAIN, split=False, bootstrap=BOOTSTRAP, flag=BOOTSTRAP)

# Validation data: always read with bootstrap=True, never sub-sampled.
val_ds, val_dl = get_ds(DS_VAL, split=False, use_pairwise=False, bootstrap=True)

# Sanity check: show the first sample of each dataset.
print(train_ds[0], val_ds[0])

# if SAVE_RES:
#     json.dump(
#         {"train_indices": train_ds.indices, "val_indices": val_ds.indices},
#         open(os.path.join(DS_FOLDER, "data_indices.json"), "w"),
#         indent=4,
#     )

## Compute Normalizing constants


<span style="color: red;">❗❗❗</span> Comment out scaling in the dataset before running this section (manual step)


In [None]:
# app_max_exec = {a: [0 for _ in range(len(APP_INFERENCE_TARGETS))] for a in APPLICATIONS_E}
# app_min_exec = {a: [100 for _ in range(len(APP_INFERENCE_TARGETS))] for a in APPLICATIONS_E}
# app_exec = {a: [[] for _ in range(len(APP_INFERENCE_TARGETS))] for a in APPLICATIONS_E}

# for ix in range(3):
#     for d in train_ds:
#         a = d[0]["x_features"][0]
#         a = APPLICATIONS_E[int(a)]
#         ts = d[1][0][ix]


#         app_exec[a][ix].append(ts.item())
#         if ts > app_max_exec[a][ix]:
#             app_max_exec[a][ix] = ts.item()

#         if ts < app_min_exec[a][ix] and ts != 0:
#             app_min_exec[a][ix] = ts.item()

# print(app_max_exec)
# print(app_min_exec)

# for (k1, v1), (k2, v2) in zip(app_max_exec.items(), app_min_exec.items()):
#     print(k1, v1[2], v2[2])

In [None]:
# APP_EXEC_MAX = {'tclf-gcn': [13.368155479431152, 17.26291275024414, 22.0528507232666], 'tclf-rf': [0.011183061636984348, 0.01432347297668457, 0.0415102019906044], 'tstr-lstm': [0.525345504283905, 0.6639305353164673, 0.7699626684188843], 'kalman-gru': [0.012417145073413849, 0.003949642181396484, 0.08132541179656982], 'iclf-mnet': [102.6425552368164, 105.85325622558594, 108.42182922363281], 'text-bert': [41.83883285522461, 46.11003112792969, 56.48748016357422], 'iclf-efnet': [120.0, 118.8111572265625, 120.0], 'text-tbert': [8.286052703857422, 9.772259712219238, 13.742752075195312], 'iclf-mvit': [81.0706787109375, 88.86546325683594, 95.1012954711914]}
# APP_EXEC_MIN = {'tclf-gcn': [0.04118216037750244, 0.04308032989501953, 0.049125004559755325], 'tclf-rf': [0.005355142056941986, 0.005410194396972656, 0.005952763371169567], 'tstr-lstm': [0.13362009823322296, 0.13778185844421387, 0.1463068723678589], 'kalman-gru': [0.0021471274085342884, 0.0022356510162353516, 0.0026605844032019377], 'iclf-mnet': [0.07093993574380875, 0.06460070610046387, 0.08770449459552765], 'text-bert': [0.16787423193454742, 0.1311308741569519, 0.2003573179244995], 'iclf-efnet': [0.5715664625167847, 0.003205239772796631, 0.44005441665649414], 'text-tbert': [0.011626155115664005, 0.012388646602630615, 0.01636815071105957], 'iclf-mvit': [0.05962417274713516, 0.044938087463378906, 0.06649579852819443]}
# T_IX = 1

# app_max_exec = {a: [] for a in APPLICATIONS_E}

# for d in train_ds:
#     a = d[0]["x_features"][0]
#     a = APPLICATIONS_E[int(a)]
#     ts = d[1][0][T_IX]

#     app_max_exec[a].append(ts.item())

# for a, d in app_max_exec.items():
#     d = np.array(d)

#     d = d / APP_EXEC_MAX[a][T_IX]

#     plt.hist(d, label=a, bins=20)
#     # plt.hist((d - np.mean(d)) / np.std(d), label=a)
#     # plt.hist((d)/ (np.max(d)), label=a)
#     # plt.hist((d - d.min())/ (np.max(d) - d.min()), label=a)

# plt.legend()

## Testing Model Save/Load


In [None]:
# s1 = ServicePredictorNNSetReprHydra(embedding_type=EmbeddingTypes.USE_EMBEDDING_LAYER, application_catalog=ApplicationCatalog.V1)
# s1.save_weights(path=DS_FOLDER)
# print("saved")
# print(s1.embedding.weight.data)

# s2 = ServicePredictorNNSetReprHydra(embedding_type=EmbeddingTypes.USE_EMBEDDING_LAYER, application_catalog=ApplicationCatalog.V2)
# s2.load_weights(path=DS_FOLDER)

## Train


In [None]:
match MODEL:
    case "boot-setrepr":
        service_predictor = ServicePredictorNNSetRepr(
            embedding_type=EmbeddingTypes.USE_EMBEDDING_LAYER
        )
    case "boot-setrepr-hydra":
        service_predictor = ServicePredictorNNSetReprHydra(
            use_base_param=USE_BASE_PARAM,
            embedding_type=EmbeddingTypes.USE_EMBEDDING_LAYER,
            application_catalog=APPLICATION_CATALOG,
            use_gpu=USE_GPU,
        )
        if LOAD_WEIGHTS:
            service_predictor.load_weights(path=LOAD_WEIGHTS_PATH, expand_embedd=True)

        # if LOAD_WEIGHTS and APPLICATION_CATALOG in [
        #     ApplicationCatalog.V21,
        #     ApplicationCatalog.V22,
        # ]:
        #     service_predictor.load_weights(path=DS_FOLDER_V1, expand_embedd=True)
    case "boot-nn":
        service_predictor = ServicePredictorNN()
    case _:
        print("Incorrect model name")


# loading
# service_predictor.load_state_dict(th.load(os.path.join(DS_FOLDER, f"saved_model_{SAVE_POSTFIX}"), weights_only=True))

loss_fn = nn.MSELoss()
optimizer = optim.Adam(service_predictor.parameters())

num_params = sum(p.numel() for p in service_predictor.parameters())
print("num. trainable params", num_params)

In [None]:
# Train the hydra predictor on two targets (STS and LM) with a decaying LM loss weight.
# Every EVAL_EVERY epochs the mean losses over the training set (and, unless SKIP_VAL
# is set, the validation set) are recorded; they are plotted and saved at the end.
EVAL_EVERY = 5  # epochs between two loss evaluations; also the x-axis step of the plots


def _epoch_losses(dl, lm_loss_wt):
    """Return the mean per-batch (STS loss, weighted LM loss) of `service_predictor` on `dl`.

    Runs under `th.no_grad()`: no gradients are tracked and no weights are updated.
    """
    sts_total = 0.0
    lm_total = 0.0
    with th.no_grad():
        for x, (y_sts, y_lm) in dl:
            out_sts, out_lm = service_predictor(x)
            sts_total += loss_fn(out_sts, y_sts).item()
            lm_total += (loss_fn(out_lm, y_lm) * lm_loss_wt).item()
    return sts_total / len(dl), lm_total / len(dl)


def _plot_progress(train_losses, val_losses, fig_name):
    """Plot recorded train/validation losses on a log axis; save as `fig_name` if SAVE_FIGS."""
    eval_epochs = list(range(1, NUM_EPOCHS + 1, EVAL_EVERY))

    plt.figure()
    plt.plot(eval_epochs, train_losses, "red", label="Training")
    if len(val_losses) > 0:
        plt.plot(eval_epochs, val_losses, "blue", label="Validation")

    plt.legend()
    plt.grid()
    plt.xlabel(r"\textbf{Epochs}")
    plt.ylabel(r"$\log$ \textbf{MSE}")
    plt.yscale("log")

    if SAVE_FIGS:
        plt.savefig(os.path.join(FIGS_PATH, fig_name))

    plt.show()


if MODEL == "boot-setrepr-hydra":
    sts_val_losses = []
    sts_train_losses = []

    lm_val_losses = []
    lm_train_losses = []

    for epoch in range(NUM_EPOCHS):
        # The LM weight decays geometrically, so the STS loss dominates in later epochs.
        lm_loss_wt = LM_LOSS_WT * LM_LOSS_WT_DECAY**epoch
        print(f"{epoch=}")

        if epoch % EVAL_EVERY == 0:
            # SKIP_VAL only skips the validation pass. The training losses are always
            # recorded, otherwise the plots below would have no data to draw.
            if not SKIP_VAL:
                sts_val_loss, lm_val_loss = _epoch_losses(val_dl, lm_loss_wt)
                sts_val_losses.append(sts_val_loss)
                lm_val_losses.append(lm_val_loss)

            sts_train_loss, lm_train_loss = _epoch_losses(train_dl, lm_loss_wt)
            sts_train_losses.append(sts_train_loss)
            lm_train_losses.append(lm_train_loss)

        for x, (y_sts, y_lm) in tqdm(train_dl):
            optimizer.zero_grad()

            out_sts, out_lm = service_predictor(x)

            loss_sts = loss_fn(out_sts, y_sts)
            loss_lm = loss_fn(out_lm, y_lm) * lm_loss_wt

            loss = loss_sts + loss_lm

            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            th.nn.utils.clip_grad_norm_(service_predictor.parameters(), max_norm=1.0)
            optimizer.step()

    sts_train_losses = np.array(sts_train_losses)
    sts_val_losses = np.array(sts_val_losses)
    lm_train_losses = np.array(lm_train_losses)
    lm_val_losses = np.array(lm_val_losses)

    # if APPLICATION_CATALOG == ApplicationCatalog.V1:
    #     service_predictor.save_weights(path=DS_FOLDER_V1)
    if SAVE_RES:
        os.makedirs(SAVE_PATH, exist_ok=True)
        service_predictor.save_weights(path=SAVE_PATH)
    # th.save(service_predictor.state_dict(), os.path.join(DS_FOLDER, f"saved_model_{SAVE_POSTFIX}"))

    # Each figure gets its own file name; previously the LM figure overwrote the STS one.
    _plot_progress(sts_train_losses, sts_val_losses, f"training_progress_{SAVE_POSTFIX}.pdf")
    _plot_progress(lm_train_losses, lm_val_losses, f"training_progress_lm_{SAVE_POSTFIX}.pdf")

# NOTE(review): the loss arrays only exist when MODEL == "boot-setrepr-hydra"; for the
# other models this part raises NameError because no training loop is defined for them.
training_progress = {
    "train_losses": {"sts": sts_train_losses.tolist(), "lm": lm_train_losses.tolist()},
    "val_losses": {"sts": sts_val_losses.tolist(), "lm": lm_val_losses.tolist()},
}

if SAVE_RES:
    os.makedirs(SAVE_PATH, exist_ok=True)
    with open(os.path.join(SAVE_PATH, f"training_progress_{SAVE_POSTFIX}.json"), "w") as json_file:
        json.dump(training_progress, json_file, indent=4)

In [None]:
# service_predictor.load_weights(path=SAVE_PATH, expand_embedd=False)

In [None]:
# Load the scaling statistics; the file is closed as soon as it has been read.
# NOTE(review): this reads the *_gpu.json file while SCALING_STATS_NAME points at
# "sts_server1_cpu.json" - confirm which of the two is intended.
with open("models/sts_server1_gpu.json") as stats_file:
    stats = json.load(stats_file)

In [None]:
def evaluate(val_dl, target_ix=0):
    """Collect the signed relative errors `(pred - true) / true` of `service_predictor`.

    Args:
        val_dl: Iterable of batches. For "boot-setrepr-hydra" a batch is
            `(x, (y_sts, y_lm))`, for "boot-setrepr" / "boot-nn" it is `(x, y)`.
        target_ix: Column of the STS target to evaluate. Only used by the hydra
            model; the other models always evaluate column 0.

    Returns:
        A flat list of Python floats. For the hydra model, samples whose true
        value is 0 are dropped to avoid dividing by zero. Any other `MODEL`
        yields an empty list.
    """
    errors = []

    if MODEL == "boot-setrepr-hydra":
        with th.no_grad():
            for x, (y_sts, _y_lm) in tqdm(val_dl):
                y_pred_sts, _ = service_predictor(x)

                y_true = y_sts[:, target_ix]
                y_pred = y_pred_sts.detach()[:, target_ix]

                # Drop zero targets: the relative error is undefined for them.
                mask = y_true != 0
                y_true = y_true[mask]
                y_pred = y_pred[mask]

                rel_err = (y_pred - y_true) / y_true
                # .cpu() makes this work for CUDA tensors too; .numpy() would raise there.
                errors.extend(rel_err.cpu().tolist())

    elif MODEL in ("boot-setrepr", "boot-nn"):
        with th.no_grad():
            for x, y in tqdm(val_dl):
                y_pred = service_predictor(x)

                rel_err = ((y_pred - y) / y)[:, 0]
                errors.extend(rel_err.detach().cpu().tolist())

    return errors


def plot_errors(errors):
    """Draw the filled density curve returned by `get_kde(errors)` in a new figure.

    Returns:
        The `(x, density)` pair from `get_kde`, so callers can reuse the curve.
    """
    plt.figure()
    kde_x, kde_y = get_kde(errors)
    plt.fill_between(kde_x, kde_y, color="crimson")
    # Alternative view: plt.hist(errors, bins=100, density=True)

    return kde_x, kde_y


# Evaluate the relative prediction error on the validation set for one STS target,
# plot its density and optionally store the raw errors as JSON.
target_ix = 1
errors = evaluate(val_dl, target_ix=target_ix)
errors = np.array(errors)
errors = errors[np.isfinite(errors)]  # drop inf / nan (e.g. from division by zero)
errors = errors.tolist()
x_vals, kde_vals = plot_errors(errors)

if SAVE_RES:
    errors_path = os.path.join(SAVE_PATH, f"errors_{SAVE_POSTFIX}{SAVE_RES_PF}_t{target_ix}.json")
    # `with` guarantees the JSON file is flushed and closed.
    with open(errors_path, "w") as json_file:
        json.dump({"errors": errors}, json_file, indent=4)