# Training the Informer with 2023 data

This notebook follows the steps of the "informer.py" script.

In [15]:
import json
import sys
import os
import torch
import numpy as np

# bigtree is imported further down; this is the install command used for it.
#!pip install bigtree
import site

# Shell equivalent of adding the user site-packages directory:
#export PYTHONPATH=$PYTHONPATH:/eos/home-i01/j/jhoya/.local/lib/python3.9/site-packages

# Make the per-user site-packages directory and the project sources
# importable.  Each entry is appended only when it is missing, so re-running
# this cell does not accumulate duplicate sys.path entries.
for _extra_path in (
        site.getusersitepackages(),
        '/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/transformer_based_detection/informers/',
):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)
from exp.exp_informer import ExpInformer

In [16]:
import argparse

# Run options and hyperparameters for this experiment, collected in a plain
# dict.  Key order is kept exactly as it was originally written.
args_dict = dict(
    use_gpu=False,
    use_multi_gpu=False,
    seed=42,
    data="HLT_DCM_2023",
    features="M",
    target="OT",
    freq="s",
    checkpoints="./checkpoints",
    seq_len=16,
    label_len=8,
    pred_len=1,
    enc_in=146,
    dec_in=146,
    c_out=146,
    d_model=256,  # before was 576
    n_heads=2,  # before was 4
    e_layers=1,
    d_layers=4,
    d_ff=1024,  # before was 2944
    factor=1,
    padding=1,
    dropout=0.001,
    attn="prob",
    embed="timeF",
    activation="gelu",
    output_attention=True,
    no_distil=True,
    no_mix=True,
    lradj="type1",
    use_amp=False,
    inverse=False,
    loss="MSE",
    learning_rate=0.00009727998365755187,
    num_workers=8,  # Before was 0
    train_epochs=2,
    batch_size=256,  # Before was 128
    patience=3,
    apply_augmentations=False,
    # Alternative kept for reference:
    # augmentations=['Scale:0.8,1.0', 'Scale_APP:0.8,1.0,0.01,0.05,0.05'],
    augmentations=[],
    augmented_dataset_size_relative=1.0,
    augmented_data_ratio=0.25,
)

# Wrap the dict in an argparse.Namespace so every entry is reachable as an
# attribute (args.seq_len, args.loss, ...).
args = argparse.Namespace(**args_dict)


In [17]:
# Short alias for the experiment class; it is instantiated later as Exp(args).
Exp = ExpInformer

In [18]:
# Run identifier: lower-cased dataset name, lower-cased loss name and the
# integer seed.  It is reused below to name the checkpoint and results folders.
setting = f'{args.data.lower()}_{args.loss.lower()}_seed_{int(args.seed)}'
setting

'hlt_dcm_2023_mse_seed_42'

In [19]:
# Instantiate the experiment from the argument namespace.
# (The recorded run printed "Use CPU" at this point.)
exp = Exp(args)

Use CPU


In [20]:
#exp.train(setting)
#exp.test(setting)

In [21]:
import os
import time
import warnings
import json
from collections import defaultdict
from functools import partial, partialmethod

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
#from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
#from torchinfo import summary
from bigtree import Node, tree_to_dataframe, tree_to_dot
from bigtree.tree.search import find_child_by_name
from tqdm import tqdm

from dataset_loaders.omni_anomaly_dataset import OmniAnomalyDataset
from dataset_loaders.hlt_datasets import HLTDataset
from dataset_loaders.eclipse_datasets import EclipseDataset
from exp.exp_basic import ExpBasic
from models.model import Informer
from models.sad_like_loss import *
from utils.tools import EarlyStopping, adjust_learning_rate

In [22]:
# Patch tqdm's constructor so every progress bar created from here on is
# built with disable=True.
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
# Fetch the training and validation splits; each call yields a
# (dataset, loader) pair.
train_data, train_loader = exp._get_data(flag='train')
vali_data, vali_loader = exp._get_data(flag='val')
# Checkpoint directory for this run: <checkpoints>/<setting>.
path = os.path.join(exp.args.checkpoints, setting)

In [23]:
path

'./checkpoints/hlt_dcm_2023_mse_seed_42'

In [24]:
# Create the checkpoint directory for this run.  exist_ok=True makes the call
# a no-op when the directory already exists and removes the gap between a
# separate existence check and the creation.
os.makedirs(path, exist_ok=True)

# Write the training dataset's scaler next to the checkpoints
# (presumably so it can be reloaded for evaluation; confirm against
# pickle_scaler).
train_data.pickle_scaler(f'{path}/scaler.pkl')

# The labeled training split is only loaded for the 'SMSE' loss.
uses_smse_loss = exp.args.loss == 'SMSE'
if uses_smse_loss:
    labeled_train_data, labeled_train_loader = \
        exp._get_data(flag='labeled_train')

# Loader lengths; the labeled count is 0 when no labeled loader was created.
train_steps_unlabeled = len(train_loader)
train_steps_labeled = len(labeled_train_loader) if uses_smse_loss else 0

# EarlyStopping receives delta=-1 for 'SMSE' and delta=0 for every other loss.
delta = -1 if uses_smse_loss else 0
early_stopping = EarlyStopping(patience=exp.args.patience,
                               verbose=True,
                               delta=delta)

In [25]:
n_cores = os.cpu_count()
print(f"Number of CPU cores: {n_cores}")

# Set the number of threads for PyTorch
import torch

# Alternatives that were tried:
#n_threads = os.cpu_count()  # Use all available CPU cores
#n_threads = max(1, os.cpu_count() // 2)

# Request 16 threads, but never more than the number of cores reported by the
# OS, to avoid oversubscribing smaller machines.  os.cpu_count() can return
# None; in that case the original fixed value of 16 is kept.
n_threads = min(16, n_cores) if n_cores else 16
print(n_threads)
torch.set_num_threads(n_threads)

Number of CPU cores: 28
16


In [29]:

# Optimizer and loss function are chosen by the experiment object; the loss
# is selected by name from exp.args.loss.
model_optim = exp._select_optimizer()
criterion = exp._select_criterion(exp.args.loss)
# The gradient scaler only exists when mixed precision (use_amp) is enabled.
if exp.args.use_amp:
    scaler = torch.cuda.amp.GradScaler()
# TensorBoard writer created with its default arguments.
summary_writer = SummaryWriter()


# The model is cast to single precision (torch.float32) here.  To experiment
# with lower precision, replace the dtype with torch.float16.
exp.model = exp.model.to(torch.float32)

In [None]:
# Manual training loop: one pass over train_loader per epoch, followed by
# validation, the early-stopping check and the learning-rate adjustment.
for epoch in range(exp.args.train_epochs):
    # Per-epoch accumulators; all three start empty again every epoch.
    train_loss, preds_all, y_actual_all = [], [], []

    exp.model.train()  # set the model to training mode
    epoch_time = time.time()

    for batch_index, batch in enumerate(tqdm(train_loader)):
        batch_x, batch_y, batch_x_mark, batch_y_mark = batch

        model_optim.zero_grad()

        # With output_attention enabled the call yields a third value,
        # which is not used here.
        returns_attention = exp.args.output_attention
        outputs = exp._process_one_batch(train_data,
                                         batch_x,
                                         batch_y,
                                         batch_x_mark,
                                         batch_y_mark)
        if returns_attention:
            pred, true, _ = outputs
        else:
            pred, true = outputs

        # Keep detached NumPy copies of predictions and targets for this batch.
        preds_all.append(pred.detach().cpu().numpy())
        y_actual_all.append(true.detach().cpu().numpy())

        loss = criterion(pred, true)
        train_loss.append(loss.item())

        # Global step index across epochs for the TensorBoard x-axis.
        global_step = batch_index + epoch * train_steps_unlabeled
        summary_writer.add_scalar("Train loss", loss, global_step)

        if exp.args.use_amp:
            # Mixed-precision path: backward/step go through the grad scaler.
            scaler.scale(loss).backward()
            scaler.step(model_optim)
            scaler.update()
        else:
            loss.backward()
            model_optim.step()

    print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))

    train_loss = np.average(train_loss)
    vali_loss = exp.vali(vali_data, vali_loader, criterion)

    # early_stopping is called with the validation loss, the model, the
    # collected predictions and the checkpoint path; its return value
    # replaces preds_all.
    preds_all = early_stopping(vali_loss, exp.model, preds_all, path)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    adjust_learning_rate(model_optim, epoch + 1, exp.args)
    summary_writer.add_scalar("Validation loss", vali_loss, epoch)
    # NOTE(review): log_gradients_in_model is not defined or explicitly
    # imported in the visible cells; presumably it comes from a star import
    # or from exp.exp_informer. Confirm it is in scope before running.
    log_gradients_in_model(exp.model, summary_writer, epoch)

In [None]:
def _stack_batches(batches):
    """Join per-batch arrays into one 3-D array along the first axis.

    Every element is reshaped to ``(-1, d1, d2)`` using its own last two
    dimensions and the pieces are then concatenated.  When all batches have
    the same shape the result matches ``np.array(batches)`` followed by
    ``reshape(-1, d1, d2)``; unlike that form, it also works when the batches
    hold different numbers of samples (e.g. a smaller final batch).
    """
    pieces = [np.asarray(batch) for batch in batches]
    return np.concatenate(
        [piece.reshape(-1, piece.shape[-2], piece.shape[-1])
         for piece in pieces],
        axis=0)


# NOTE(review): preds_all is whatever early_stopping returned in the last
# epoch; it is assumed to still be a sequence of per-batch arrays. Confirm.
preds_all_np = _stack_batches(preds_all)
y_actual_all = _stack_batches(y_actual_all)

# Save results
folder_path = './results/' + setting + '/'

# exist_ok=True: no separate existence check needed, safe on re-runs.
os.makedirs(folder_path, exist_ok=True)

np.save(folder_path + 'preds_all_train.npy', preds_all_np)
np.save(folder_path + 'true_values_all_train.npy', y_actual_all)

# Reload the checkpoint written under the run's checkpoint directory.
# map_location='cpu' lets a checkpoint saved on a GPU be read on this
# CPU-only setup; load_state_dict then copies the tensors into the model.
best_model_path = path + '/checkpoint_informer.pth'
exp.model.load_state_dict(torch.load(best_model_path, map_location='cpu'))
exp.model