# Using Experiment Tracking Tools in LightningTrainer

`W&B`, `CometML`, `MLflow`, and `TensorBoard` are all popular tools in the field of machine learning for managing, visualizing, and tracking experiments.

The `LightningTrainer` integration in Ray AIR allows you to continue using PyTorch Lightning's built-in logger integrations for these experiment tracking tools.

## Define your model and dataloader

In [1]:
import os
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import TensorDataset, DataLoader

# Build a small synthetic binary-classification dataset.
num_samples, num_features = 128, 3
X = torch.randn(num_samples, num_features)  # random feature matrix
y = torch.randint(low=0, high=2, size=(num_samples,))  # random labels in {0, 1}

# Pair every feature row with its label.
dataset = TensorDataset(X, y)

# Serve the dataset in shuffled mini-batches.
batch_size = 8
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [2]:
# A minimal LightningModule used to demonstrate metric logging
class DummyModel(pl.LightningModule):
    """Single-linear-layer binary classifier that logs a loss and two dummy metrics."""

    def __init__(self):
        super().__init__()
        # 3 input features -> 1 output logit
        self.layer = torch.nn.Linear(3, 1)

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        """Compute the BCE-with-logits loss for one batch and log the metrics."""
        inputs, targets = batch
        logits = self(inputs).flatten()
        loss = F.binary_cross_entropy_with_logits(logits, targets.float())

        # Everything logged below is reported to the configured Loggers.
        self.log("train_loss", loss)
        dummy_metrics = {
            "metric_1": 1 / (batch_idx + 1),
            "metric_2": batch_idx * 100,
        }
        self.log_dict(dummy_metrics)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

## Define your loggers

In [3]:
from pytorch_lightning.loggers.wandb import WandbLogger
from pytorch_lightning.loggers.comet import CometLogger
from pytorch_lightning.loggers.mlflow import MLFlowLogger
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from pytorch_lightning.utilities.rank_zero import rank_zero_only # Avoid creating a new experiment run on the driver node.
import wandb


# A callback that logs in to wandb in each worker
class WandbLoginCallback(pl.Callback):
    """Authenticate with Weights & Biases from inside the training process.

    The login is performed in the ``setup`` hook rather than at construction
    time, so it happens where the Lightning trainer runs (each worker) and
    not where this callback object is created.

    Args:
        key: W&B API key that is forwarded to ``wandb.login``.
    """

    def __init__(self, key):
        # Initialise the base class too, so cooperative initialisation keeps
        # working if the base or a mixin ever defines its own __init__.
        super().__init__()
        self.key = key

    def setup(self, trainer, pl_module, stage) -> None:
        """Log in to W&B with the stored key; the hook arguments are unused."""
        wandb.login(key=self.key)


def create_loggers(name, project_name, save_dir="./logs", offline=False):
    """Build the experiment-tracking loggers passed to the Lightning trainer.

    Only the MLFlow and TensorBoard loggers are active. The W&B and CometML
    sections are kept commented out as templates; ``offline`` is consumed
    only by those sections, so it currently has no effect.

    Args:
        name: Run name (MLFlow ``run_name`` and TensorBoard ``name``).
        project_name: Used as the MLFlow ``experiment_name``.
        save_dir: Root directory; each logger gets its own subdirectory.
        offline: Only referenced by the commented-out W&B / CometML loggers.

    Returns:
        The list ``[mlflow_logger, tensorboard_logger]``.
    """
    # Avoid creating a new experiment run on the driver node.
    rank_zero_only.rank = None

    # Wandb (disabled; uncomment to enable)
    # wandb_api_key = os.environ.get("WANDB_API_KEY", None)

    # class RayWandbLogger(WandbLogger):
    #     # wandb.finish() ensures all artifacts get uploaded at the end of training.
    #     def finalize(self, status):
    #         super().finalize(status)
    #         wandb.finish()

    # wandb_logger = RayWandbLogger(
    #     name=name,
    #     project=project_name,
    #     # Specify a unique id to avoid reporting to a new run after restoration
    #     id="unique_id",
    #     save_dir=f"{save_dir}/wandb",
    #     offline=offline
    # )
    # callbacks = [] if offline else [WandbLoginCallback(key=wandb_api_key)]

    # CometML (disabled; uncomment to enable)
    # comet_api_key = os.environ.get("COMET_API_KEY", None)
    # comet_logger = CometLogger(
    #     api_key=comet_api_key,
    #     experiment_name=name,
    #     project_name=project_name,
    #     save_dir=f"{save_dir}/comet",
    #     offline=offline,
    # )

    # MLFlow: tracked through a local "file:" URI under save_dir.
    mlflow_tracking_uri = f"file:{save_dir}/mlflow"
    mlflow_logger = MLFlowLogger(
        experiment_name=project_name,
        run_name=name,
        tracking_uri=mlflow_tracking_uri,
    )

    # Tensorboard: event files go to their own subdirectory of save_dir.
    tensorboard_dir = f"{save_dir}/tensorboard"
    tensorboard_logger = TensorBoardLogger(save_dir=tensorboard_dir, name=name)

    # With every logger enabled the return would be:
    # return [wandb_logger, comet_logger, mlflow_logger, tensorboard_logger], callbacks
    active_loggers = [mlflow_logger, tensorboard_logger]
    return active_loggers

In [4]:
# Root directory handed to create_loggers as save_dir.
YOUR_SAVE_DIR = "./logs"

# Variant for when the W&B / CometML sections of create_loggers are enabled
# and it returns callbacks as well:
# loggers, callbacks = create_loggers(
#     name="demo-run", project_name="demo-project", save_dir=YOUR_SAVE_DIR, offline=False
# )
loggers = create_loggers(
    name="demo-run",
    project_name="demo-project",
    save_dir=YOUR_SAVE_DIR,
    offline=False,
)

## Train the model and view logged results

In [6]:
from ray.air.config import RunConfig, ScalingConfig
from ray.train.lightning import LightningConfigBuilder, LightningTrainer

# Ray-side settings: four CPU-only workers, results stored under /tmp/ray_results.
scaling_config = ScalingConfig(use_gpu=False, num_workers=4)
run_config = RunConfig(storage_path="/tmp/ray_results", name="ptl-exp-tracking")

# Lightning-side settings: the model class, Trainer arguments and fit() arguments.
builder = LightningConfigBuilder()
builder.module(cls=DummyModel)
builder.trainer(
    logger=loggers,  # the experiment-tracking loggers created earlier
    log_every_n_steps=1,
    accelerator="cpu",
    max_epochs=5,
)
builder.fit_params(train_dataloaders=dataloader)
lightning_config = builder.build()

trainer = LightningTrainer(
    run_config=run_config,
    scaling_config=scaling_config,
    lightning_config=lightning_config,
)

# Kept as a bare expression so the notebook cell displays the returned result.
trainer.fit()

0,1
Current time:,2023-09-07 17:54:19
Running for:,00:00:10.18
Memory:,7.3/30.9 GiB

Trial name,status,loc,iter,total time (s),train_loss,metric_1,metric_2
LightningTrainer_df06c_00000,TERMINATED,192.168.33.188:15261,5,5.1021,0.804872,0.25,300


2023-09-07 17:54:08,877	INFO data_parallel_trainer.py:404 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
(TrainTrainable pid=15261) GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
(LightningTrainer pid=15261) GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
(LightningTrainer pid=15261) Starting distributed worker processes: ['15402 (192.168.33.188)', '15403 (192.168.33.188)', '15404 (192.168.33.188)', '15405 (192.168.33.188)']
(RayTrainWorker pid=15402) Setting up process group for: env:// [rank=0, world_size=4]
(RayTrainWorker pid=15402) GPU available:

Epoch 0: 100%|██████████| 4/4 [00:00<00:00, 126.18it/s, v_num=a8_0]
Epoch 1: 100%|██████████| 4/4 [00:00<00:00, 172.09it/s, v_num=a8_0]


(RayTrainWorker pid=15402)
(RayTrainWorker pid=15402)   | Name  | Type   | Params
(RayTrainWorker pid=15402) ---------------------------------
(RayTrainWorker pid=15402) 0 | layer | Linear | 4
(RayTrainWorker pid=15402) ---------------------------------
(RayTrainWorker pid=15402) 4         Trainable params
(RayTrainWorker pid=15402) 0         Non-trainable params
(RayTrainWorker pid=15402) 4         Total params
(RayTrainWorker pid=15402) 0.000     Total estimated model params size (MB)
(RayTrainWorker pid=15402)   rank_zero_warn(


Epoch 2:   0%|          | 0/4 [00:00<?, ?it/s, v_num=a8_0]         
Epoch 2: 100%|██████████| 4/4 [00:00<00:00, 127.38it/s, v_num=a8_0]
Epoch 3: 100%|██████████| 4/4 [00:00<00:00, 120.59it/s, v_num=a8_0]
Epoch 4: 100%|██████████| 4/4 [00:00<00:00, 238.28it/s, v_num=a8_0]
Epoch 4: 100%|██████████| 4/4 [00:00<00:00, 28.23it/s, v_num=a8_0]


(RayTrainWorker pid=15402) `Trainer.fit` stopped: `max_epochs=5` reached.
2023-09-07 17:54:19,059	INFO tune.py:1148 -- Total run time: 10.20 seconds (10.18 seconds for the tuning loop).


Result(
  metrics={'_report_on': 'train_epoch_end', 'train_loss': 0.8048718571662903, 'metric_1': 0.25, 'metric_2': 300.0, 'epoch': 4, 'step': 20, 'should_checkpoint': True, 'done': True, 'trial_id': 'df06c_00000', 'experiment_tag': '0'},
  path='/tmp/ray_results/ptl-exp-tracking/LightningTrainer_df06c_00000_0_2023-09-07_17-54-08',
  checkpoint=LightningCheckpoint(local_path=/tmp/ray_results/ptl-exp-tracking/LightningTrainer_df06c_00000_0_2023-09-07_17-54-08/checkpoint_000004)
)