## Example of training/testing deterministic models on N-CMAPSS with PyTorch Lightning

In [2]:
from pathlib import Path
from types import SimpleNamespace
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import os
import sys
# Put the parent directory on sys.path so the project's modules can be
# imported without installing the project.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from bayesrul.ncmapss.dataset import NCMAPSSDataModule
from bayesrul.ncmapss.models import NCMAPSSModel, get_checkpoint, TBLogger

In [3]:
# Run configuration: every path and option the cells below read comes from here.
args = SimpleNamespace(
    data_path="../data/ncmapss/",    # handed to NCMAPSSDataModule
    out_path="../results/ncmapss/",  # root directory for checkpoints and logs
    scn="dnn_ptl",                   # sub-directory of out_path for this run
    net="linear",                    # third argument of NCMAPSSModel
    lr=1e-3,                         # presumably the learning rate; NOTE(review): not read by the cells visible here
)

# Join path components with pathlib instead of f-strings: out_path already ends
# with "/", so f"{args.out_path}/{args.scn}" yielded ".../ncmapss//dnn_ptl/..."
# in the logger directory.
run_dir = Path(args.out_path) / args.scn

# NOTE(review): the checkpoint directory carries a "2" suffix that the logger
# directory does not -- confirm this is intentional.
checkpoint_dir = run_dir / "checkpoints" / f"{args.net}2"

logger = TBLogger(
    str(run_dir / "lightning_logs" / args.net),
    default_hp_metric=False,
)

In [4]:
# Just to illustrate. To properly train see scripts/cmapss_training.py
# NOTE(review): the script name says "cmapss" while this notebook uses
# NCMAPSS classes -- confirm the path.
data = NCMAPSSDataModule(args.data_path, batch_size=10000)
dnn = NCMAPSSModel(data.win_length, data.n_features, args.net)

# Passed as `ckpt_path` to fit() below.
# NOTE(review): presumably None when checkpoint_dir holds no checkpoint yet,
# so that training starts from scratch -- confirm in get_checkpoint.
checkpoint_file = get_checkpoint(checkpoint_dir)

# Both callbacks watch the same logged metric.
monitor = "loss/val"
checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_dir, monitor=monitor)
earlystopping_callback = EarlyStopping(monitor=monitor, patience=10)

trainer = pl.Trainer(
    # `gpus=[0]` is deprecated since Lightning 1.7 and removed in 2.0;
    # accelerator/devices selects the same single GPU (index 0).
    accelerator="gpu",
    devices=[0],
    max_epochs=1000,
    log_every_n_steps=2,
    logger=logger,
    callbacks=[
        checkpoint_callback,
        earlystopping_callback,
    ],
)
trainer.fit(dnn, data, ckpt_path=checkpoint_file)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: ../results/ncmapss//dnn_ptl/lightning_logs/linear/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name | Type   | Params
--------------------------------
0 | net  | Linear | 125 K 
--------------------------------
125 K     Trainable params
0         Non-trainable params
125 K     Total params
0.502     Total estimated model params size (MB)


Epoch 88: 100%|██████████| 7/7 [01:38<00:00, 14.06s/it, loss=4.64, v_num=0]   


In [5]:
# Evaluate on the test set with the checkpoint found in checkpoint_dir.
# `data` is rebuilt with a smaller batch size than the one used for training.
data = NCMAPSSDataModule(args.data_path, batch_size=1000)
dnn = NCMAPSSModel.load_from_checkpoint(get_checkpoint(checkpoint_dir))
# `gpus=[0]` is deprecated since Lightning 1.7 and removed in 2.0;
# accelerator/devices selects the same single GPU (index 0).
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    log_every_n_steps=10,
    logger=logger,
)
trainer.test(dnn, data, verbose=False)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Testing DataLoader 0: 100%|██████████| 13/13 [00:00<00:00, 41.00it/s]


[{'loss/test': 211.42413330078125}]

In [6]:
# Grab a single batch from the training dataloader for inspection,
# reporting the data module's window length and feature count on the way.
dl = data.train_dataloader()
print(f"Win length {data.win_length}, n_features {data.n_features}")
batch_iterator = iter(dl)
ex = next(batch_iterator)

Win length 25, n_features 42


In [7]:
# Show the batch drawn from the training dataloader (displayed as the cell's
# last expression).
ex

[tensor([[[ 0.5287,  0.4701,  0.5888,  ..., 25.0000,  3.0000,  0.0000],
          [ 0.5341,  0.4772,  0.5940,  ..., 25.0000,  3.0000,  0.0000],
          [ 0.5355,  0.4784,  0.5939,  ..., 25.0000,  3.0000,  0.0000],
          ...,
          [ 0.6067,  0.5630,  0.6579,  ..., 25.0000,  3.0000,  0.0000],
          [ 0.6013,  0.5542,  0.6502,  ..., 25.0000,  3.0000,  0.0000],
          [ 0.5974,  0.5470,  0.6438,  ..., 25.0000,  3.0000,  0.0000]],
 
         [[ 0.5890,  0.6399,  0.7546,  ..., 46.0000,  3.0000,  0.0000],
          [ 0.5958,  0.6488,  0.7605,  ..., 46.0000,  3.0000,  0.0000],
          [ 0.6011,  0.6560,  0.7675,  ..., 46.0000,  3.0000,  0.0000],
          ...,
          [ 0.5894,  0.6369,  0.7503,  ..., 46.0000,  3.0000,  0.0000],
          [ 0.5890,  0.6367,  0.7502,  ..., 46.0000,  3.0000,  0.0000],
          [ 0.5884,  0.6364,  0.7500,  ..., 46.0000,  3.0000,  0.0000]],
 
         [[ 0.4387,  0.5465,  0.7149,  ..., 53.0000,  3.0000,  0.0000],
          [ 0.4408,  0.5476,