In [6]:
%pip install "ray[tune]" --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import open3d.ml.torch as ml3d

inp_positions = torch.randn([20,3])
inp_features = torch.randn([20,8])
out_positions = torch.randn([10,3])

conv = ml3d.layers.ContinuousConv(in_channels=8, filters=16, kernel_size=[3,3,3])
out_features = conv(inp_features, inp_positions, out_positions, extents=2.0)

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [3]:
from ray import tune
"""

def objective(config):  # ①
    score = config["a"] ** 2 + config["b"]
    return {"score": score}


search_space = {  # ②
    "a": tune.grid_search([0.001, 0.01, 0.1, 1.0]),
    "b": tune.choice([1, 2, 3]),
}

tuner = tune.Tuner(objective, param_space=search_space)  # ③

results = tuner.fit()
print(results.get_best_result(metric="score", mode="min").config)
"""

'\n\ndef objective(config):  # ①\n    score = config["a"] ** 2 + config["b"]\n    return {"score": score}\n\n\nsearch_space = {  # ②\n    "a": tune.grid_search([0.001, 0.01, 0.1, 1.0]),\n    "b": tune.choice([1, 2, 3]),\n}\n\ntuner = tune.Tuner(objective, param_space=search_space)  # ③\n\nresults = tuner.fit()\nprint(results.get_best_result(metric="score", mode="min").config)\n'

https://docs.ray.io/en/latest/tune/examples/tune-pytorch-lightning.html#tune-pytorch-lightning-ref

In [27]:
import os
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from filelock import FileLock
from torchmetrics import Accuracy
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms

from ray.train.lightning import LightningTrainer, LightningConfigBuilder

In [28]:
class MNISTClassifier(pl.LightningModule):
    def __init__(self, config):
        super(MNISTClassifier, self).__init__()
        self.accuracy = Accuracy(task="multiclass", num_classes=10)
        self.layer_1_size = config["layer_1_size"]
        self.layer_2_size = config["layer_2_size"]
        self.lr = config["lr"]

        # mnist images are (1, 28, 28) (channels, width, height)
        self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)
        self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)
        self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)

        self.outputs = []

    def cross_entropy_loss(self, logits, labels):
        return F.nll_loss(logits, labels)

    def forward(self, x):
        batch_size, channels, width, height = x.size()
        x = x.view(batch_size, -1)

        x = self.layer_1(x)
        x = torch.relu(x)

        x = self.layer_2(x)
        x = torch.relu(x)

        x = self.layer_3(x)
        x = torch.log_softmax(x, dim=1)

        return x

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", accuracy)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)
        self.outputs.append({"val_loss": loss, "val_accuracy": accuracy})
        return {"val_loss": loss, "val_accuracy": accuracy}

    def on_validation_epoch_end(self):
        avg_loss = torch.stack([x["val_loss"] for x in self.outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in self.ouputs]).mean()
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", avg_acc)
        self.outputs = []

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer


class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, batch_size=128):
        super().__init__()
        self.data_dir = os.getcwd()
        self.batch_size = batch_size
        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

    def setup(self, stage=None):
        with FileLock(f"{self.data_dir}.lock"):
            mnist = MNIST(
                self.data_dir, train=True, download=True, transform=self.transform
            )
            self.mnist_train, self.mnist_val = random_split(mnist, [55000, 5000])

            self.mnist_test = MNIST(
                self.data_dir, train=False, download=True, transform=self.transform
            )

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)

In [29]:
default_config = {
    "layer_1_size": 128,
    "layer_2_size": 256,
    "lr": 1e-3,
}

In [30]:
from pytorch_lightning.loggers import TensorBoardLogger
from ray import air, tune
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining

Frequency of RayTune metric reportint == Frequency of Lightning Checkpoints!

In [31]:
# The maximum training epochs
num_epochs = 5

# Number of sampls from parameter space
num_samples = 10

accelerator = "gpu"

config = {
    "layer_1_size": tune.choice([32, 64, 128]),
    "layer_2_size": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(1e-4, 1e-1),
}

In [32]:
dm = MNISTDataModule(batch_size=64)
logger = TensorBoardLogger(save_dir=os.getcwd(), name="tune-ptl-example", version=".")

lightning_config = (
    LightningConfigBuilder()
    .module(cls=MNISTClassifier, config=config)
    .trainer(max_epochs=num_epochs, accelerator=accelerator, logger=logger)
    .fit_params(datamodule=dm)
    .checkpointing(monitor="ptl/val_accuracy", save_top_k=2, mode="max")
    .build()
)

# Make sure to also define an AIR CheckpointConfig here
# to properly save checkpoints in AIR format.
run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="ptl/val_accuracy",
        checkpoint_score_order="max",
    ),
)

In [33]:
lightning_config

{'_module_class': __main__.MNISTClassifier,
 '_module_init_config': {'config': {'layer_1_size': <ray.tune.search.sample.Categorical at 0x7f87aa07fc40>,
   'layer_2_size': <ray.tune.search.sample.Categorical at 0x7f87aa07c310>,
   'lr': <ray.tune.search.sample.Float at 0x7f87aa07df00>}},
 '_trainer_init_config': {'max_epochs': 5,
  'accelerator': 'gpu',
  'logger': <pytorch_lightning.loggers.tensorboard.TensorBoardLogger at 0x7f87aa07f430>},
 '_trainer_fit_params': {'datamodule': <__main__.MNISTDataModule at 0x7f87aa07c4c0>},
 '_strategy_config': {},
 '_model_checkpoint_config': {'monitor': 'ptl/val_accuracy',
  'save_top_k': 2,
  'mode': 'max'}}

In [10]:
scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)
scaling_config = ScalingConfig(
    num_workers=1, use_gpu=True, resources_per_worker={"CPU": 1, "GPU": 1}
)
# Define a base LightningTrainer without hyper-parameters for Tuner
lightning_trainer = LightningTrainer(
    scaling_config=scaling_config,
    run_config=run_config,
)

In [11]:
import ray
print("cuda:", torch.cuda.is_available())
ray.init(num_gpus=1)
print("ray:", ray.cluster_resources())

cuda: True


2023-07-02 09:42:31,812	INFO worker.py:1636 -- Started a local Ray instance.


ray: {'GPU': 1.0, 'node:192.168.195.135': 1.0, 'CPU': 8.0, 'memory': 14585027790.0, 'object_store_memory': 7292513894.0}


In [12]:
dm.setup()

In [13]:
test = MNISTClassifier(default_config)

In [14]:
def tune_mnist_asha(num_samples=10):
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    tuner = tune.Tuner(
        lightning_trainer,
        param_space={"lightning_config": lightning_config},
        tune_config=tune.TuneConfig(
            metric="ptl/val_accuracy",
            mode="max",
            num_samples=num_samples,
            scheduler=scheduler,
        ),
        run_config=air.RunConfig(
            name="tune_mnist_asha",
        ),
    )
    results = tuner.fit()
    best_result = results.get_best_result(metric="ptl/val_accuracy", mode="max")
    best_result


tune_mnist_asha(num_samples=num_samples)

2023-07-02 09:42:33,597	INFO tuner_internal.py:490 -- A `RunConfig` was passed to both the `Tuner` and the `LightningTrainer`. The run config passed to the `Tuner` is the one that will be used.
  tuner = tune.Tuner(


0,1
Current time:,2023-07-02 09:43:53
Running for:,00:01:19.63
Memory:,4.6/25.0 GiB

Trial name,# failures,error file
LightningTrainer_01be9_00000,1,"/home/jakob/ray_results/tune_mnist_asha/LightningTrainer_01be9_00000_0_layer_1_size=32,layer_2_size=64,lr=0.0003_2023-07-02_09-42-33/error.txt"
LightningTrainer_01be9_00001,1,"/home/jakob/ray_results/tune_mnist_asha/LightningTrainer_01be9_00001_1_layer_1_size=32,layer_2_size=128,lr=0.0836_2023-07-02_09-42-33/error.txt"
LightningTrainer_01be9_00002,1,"/home/jakob/ray_results/tune_mnist_asha/LightningTrainer_01be9_00002_2_layer_1_size=128,layer_2_size=64,lr=0.0002_2023-07-02_09-42-33/error.txt"
LightningTrainer_01be9_00003,1,"/home/jakob/ray_results/tune_mnist_asha/LightningTrainer_01be9_00003_3_layer_1_size=128,layer_2_size=128,lr=0.0538_2023-07-02_09-42-34/error.txt"

Trial name,status,loc,...odule_init_config /config/layer_1_size,...odule_init_config /config/layer_2_size,..._config/_module_i nit_config/config/lr
LightningTrainer_01be9_00004,RUNNING,192.168.195.135:20290,128,128,0.00135028
LightningTrainer_01be9_00005,PENDING,,128,64,0.00465507
LightningTrainer_01be9_00006,PENDING,,64,256,0.000237745
LightningTrainer_01be9_00007,PENDING,,128,64,0.000138127
LightningTrainer_01be9_00008,PENDING,,64,128,0.00626562
LightningTrainer_01be9_00009,PENDING,,64,256,0.0301398
LightningTrainer_01be9_00000,ERROR,192.168.195.135:18899,32,64,0.000255166
LightningTrainer_01be9_00001,ERROR,192.168.195.135:19257,32,128,0.0836217
LightningTrainer_01be9_00002,ERROR,192.168.195.135:19606,128,64,0.00021693
LightningTrainer_01be9_00003,ERROR,192.168.195.135:19948,128,128,0.053764


[2m[36m(LightningTrainer pid=18899)[0m 2023-07-02 09:42:48,163	INFO backend_executor.py:137 -- Starting distributed worker processes: ['18958 (192.168.195.135)']
[2m[36m(RayTrainWorker pid=18958)[0m 2023-07-02 09:42:49,473	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=18958)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=18958)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=18958)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=18958)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=18958)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=18958)[0m 
[2m[36m(RayTrainWorker pid=18958)[0m   | Name     | Type               | Params
[2m[36m(RayTrainWorker pid=18958)[0m ------------------------------------------------
[2m[36m(RayTrainWorker pid=18958)[0m 0 | accuracy | MulticlassAccuracy | 0     


Sanity Checking: 0it [00:00, ?it/s][0m 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  2.96it/s]


2023-07-02 09:42:53,807	ERROR tune_controller.py:873 -- Trial task failed for trial LightningTrainer_01be9_00000
Traceback (most recent call last):
  File "/home/jakob/anaconda3/envs/o3d_test2/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/jakob/anaconda3/envs/o3d_test2/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 18, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/jakob/anaconda3/envs/o3d_test2/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/jakob/anaconda3/envs/o3d_test2/lib/python3.10/site-packages/ray/_private/worker.py", line 2540, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AttributeError): [36mray::_Inner.train()[39m (pid=18899, ip=192.168.195.135, actor_id=6c8ccfdb228077fa060ef3e501000000, repr=LightningTrainer)
  File "/home/j

Trial name,date,hostname,node_ip,pid,timestamp,trial_id
LightningTrainer_01be9_00000,2023-07-02_09-42-46,Jakob-PC-N,192.168.195.135,18899,1688283766,01be9_00000
LightningTrainer_01be9_00001,2023-07-02_09-42-57,Jakob-PC-N,192.168.195.135,19257,1688283777,01be9_00001
LightningTrainer_01be9_00002,2023-07-02_09-43-11,Jakob-PC-N,192.168.195.135,19606,1688283791,01be9_00002
LightningTrainer_01be9_00003,2023-07-02_09-43-24,Jakob-PC-N,192.168.195.135,19948,1688283804,01be9_00003


[2m[36m(LightningTrainer pid=19257)[0m 2023-07-02 09:42:59,920	INFO backend_executor.py:137 -- Starting distributed worker processes: ['19303 (192.168.195.135)']
[2m[36m(RayTrainWorker pid=19303)[0m 2023-07-02 09:43:01,514	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=19303)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=19303)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=19303)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=19303)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=19303)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=19303)[0m 
[2m[36m(RayTrainWorker pid=19303)[0m   | Name     | Type               | Params
[2m[36m(RayTrainWorker pid=19303)[0m ------------------------------------------------
[2m[36m(RayTrainWorker pid=19303)[0m 0 | accuracy | MulticlassAccuracy | 0     


Sanity Checking: 0it [00:00, ?it/s][0m 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(LightningTrainer pid=19606)[0m 2023-07-02 09:43:13,178	INFO backend_executor.py:137 -- Starting distributed worker processes: ['19651 (192.168.195.135)']
[2m[36m(RayTrainWorker pid=19651)[0m 2023-07-02 09:43:14,426	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=19651)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=19651)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=19651)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=19651)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=19651)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=19651)[0m 
[2m[36m(RayTrainWorker pid=19651)[0m   | Name     | Type               | Params
[2m[36m(RayTrainWorker pid=19651)[0m ------------------------------------------------
[2m[36m(RayTrainWorker pid=19651)[0m 0 | accuracy | MulticlassAccuracy | 0     


Sanity Checking: 0it [00:00, ?it/s][0m 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


Sanity Checking: 0it [00:00, ?it/s][0m 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


Sanity Checking: 0it [00:00, ?it/s][0m 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  2.36it/s]


2023-07-02 09:43:51,181	ERROR worker.py:408 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::_Inner.train()[39m (pid=20290, ip=192.168.195.135, actor_id=69378a49250e1d9e0ee9cf9a01000000, repr=LightningTrainer)
  File "/home/jakob/anaconda3/envs/o3d_test2/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 389, in train
    raise skipped from exception_cause(skipped)
  File "/home/jakob/anaconda3/envs/o3d_test2/lib/python3.10/site-packages/ray/train/_internal/utils.py", line 54, in check_for_failure
    ray.get(object_ref)
ray.exceptions.RayTaskError(AttributeError): [36mray::_RayTrainWorker__execute.get_next()[39m (pid=20335, ip=192.168.195.135, actor_id=34a732ca4f59bf3f92b0447101000000, repr=<ray.train._internal.worker_group.RayTrainWorker object at 0x7ff598807c40>)
  File "/home/jakob/anaconda3/envs/o3d_test2/lib/python3.10/site-packages/ray/train/_internal/worker_group.py", line 32, in __execute
    raise skipped from exception_cause(s

RuntimeError: No best trial found for the given metric: ptl/val_accuracy. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.

In [2]:
def enable_parent_folder_import():
    import sys, os
    sys.path.insert(1, os.path.join(sys.path[0], '..'))

enable_parent_folder_import()


In [19]:
%cd ..

/home/jsem/Bachelorarbeit/GNNDensityGradients


In [16]:
sys.path

['/home/jsem/miniconda3/envs/CConv/lib/python3.10/site-packages/ray/thirdparty_files',
 '/home/jsem/Bachelorarbeit/GNNDensityGradients/notebooks',
 '/home/jsem/Bachelorarbeit/GNNDensityGradients/notebooks/..',
 '/home/jsem/miniconda3/envs/CConv/lib/python310.zip',
 '/home/jsem/miniconda3/envs/CConv/lib/python3.10',
 '/home/jsem/miniconda3/envs/CConv/lib/python3.10/lib-dynload',
 '',
 '/home/jsem/miniconda3/envs/CConv/lib/python3.10/site-packages',
 '/tmp/tmpe435t_3b']

In [22]:
from importlib import reload
import datasets.density_data_module
from datasets.density_data_module import *
reload(datasets.density_data_module); from datasets.density_data_module import *

import datasets.density_dataset
from datasets.density_dataset import *
reload(datasets.density_dataset); from datasets.density_dataset import *

density_data = DensityDataModule(
    target = "temporal_density_gradient",
    data_dir = 'datasets/data/dpi_dam_break/train',
    batch_size = 10,
    data_split = (0.7, 0.15, 0.15),
    num_workers = 0, # Note that cuda only allows 0 workers.
    shuffle = False,
    cache = False, # Load dataset into memory
    device = 'cuda',
)
# DO NOT SETUP DATA!
# density_data.setup("fit")
#train_loader = density_data.train_dataloader()
#train_iter = iter(train_loader)
#batch = next(train_iter)
#sample = batch[0]

In [23]:
import pytorch_lightning as pl
from utils.train_helper import *
from models.cconv import CConvModel
from datasets.density_data_module import DensityDataModule
from pytorch_lightning.loggers import TensorBoardLogger
from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from ray import air, tune, init
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from ray.tune.schedulers import ASHAScheduler
import torch.cuda


hparams = {
    # "layer_1_size": tune.choice([32, 64, 128]),
    # "layer_2_size": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(1e-4, 1e-1),

    # Dataset
    'dataset_dir': 'datasets/data/dpi_dam_break/train',
    'data_split': (0.7, 0.15, 0.15),
    'batch_size': 10,        # care, this is used in the model and datamodule
    'shuffle': True,
    'cache': True,            # Preprocess and preload dataset into memory
    'device': 'cuda'
}

In [24]:
density_data.setup("fit")
# density_data.to('cpu')
torch.cuda.empty_cache()

train_loader = density_data.train_dataloader()
val_loader = density_data.val_dataloader()

Setting up data module for stage  fit


In [25]:
torch.cuda.empty_cache()

In [26]:
train_loader = density_data.train_dataloader()
train_iter = iter(train_loader)
batch = next(train_iter)
sample = batch[0]

# print devices of sample
for key in sample.keys():
    if isinstance(sample[key], torch.Tensor):
        print(key, sample[key].device)

pos cuda:0
vel cuda:0
m cuda:0
viscosity cuda:0
box cuda:0
box_normals cuda:0
density cuda:0
temporal_density_gradient cuda:0


In [27]:
CConvModel

models.cconv.CConvModel

In [41]:
from models.cconv import CConvModel
logger = TensorBoardLogger("lightning_logs", name="cconv-hparam-search", version=".")
datamodule = density_data
lightning_config = (
    LightningConfigBuilder()
    .module(cls=CConvModel, hparams=hparams)
    .trainer(max_epochs=5, logger=logger, accelerator="gpu", enable_progress_bar=False)
    .fit_params(train_dataloaders=train_loader, val_dataloaders=val_loader)
    .checkpointing(monitor="val_loss", mode="min", save_top_k=3)
    .build()
)
lightning_config

{'_module_class': models.cconv.CConvModel,
 '_module_init_config': {'hparams': {'lr': <ray.tune.search.sample.Float at 0x7fbca4e7ada0>,
   'dataset_dir': 'datasets/data/dpi_dam_break/train',
   'data_split': (0.7, 0.15, 0.15),
   'batch_size': 10,
   'shuffle': True,
   'cache': True,
   'device': 'cuda'}},
 '_trainer_init_config': {'max_epochs': 5,
  'logger': <pytorch_lightning.loggers.tensorboard.TensorBoardLogger at 0x7fbaa0265e70>,
  'accelerator': 'gpu',
  'enable_progress_bar': False},
 '_trainer_fit_params': {'train_dataloaders': <torch.utils.data.dataloader.DataLoader at 0x7fbca512a6e0>,
  'val_dataloaders': <torch.utils.data.dataloader.DataLoader at 0x7fbca5129de0>},
 '_strategy_config': {},
 '_model_checkpoint_config': {'monitor': 'val_loss',
  'mode': 'min',
  'save_top_k': 3}}

In [29]:
lightning_config['_trainer_fit_params']

{'train_dataloaders': <torch.utils.data.dataloader.DataLoader at 0x7fbca512a6e0>,
 'val_dataloaders': <torch.utils.data.dataloader.DataLoader at 0x7fbca5129de0>}

In [40]:
import ray, os
ray.shutdown()
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
env = {'working_dir': parent_dir}
ray.init(num_gpus=1, num_cpus=6)

2023-07-04 16:30:18,960	INFO worker.py:1636 -- Started a local Ray instance.


0,1
Python version:,3.10.11
Ray version:,2.5.1


In [42]:
def test(datamodule):
    pass


tune.with_parameters(test, datamodule=datamodule)
run_config = RunConfig(
    checkpoint_config = CheckpointConfig(
        num_to_keep=3,
        checkpoint_score_attribute="val_loss",
        checkpoint_score_order="min",
    ),
)

scaling_config = ScalingConfig(num_workers=1, use_gpu=True, resources_per_worker={"CPU": 2, "GPU": 0.5})
lightning_trainer = LightningTrainer(scaling_config=scaling_config, run_config=run_config)


In [43]:
lightning_trainer

<LightningTrainer scaling_config=ScalingConfig(num_workers=1, use_gpu=True, resources_per_worker={'CPU': 2, 'GPU': 0.5}) run_config=RunConfig(checkpoint_config=CheckpointConfig(num_to_keep=3, checkpoint_score_attribute='val_loss', checkpoint_score_order='min'), verbose=3)>

In [44]:
scheduler = ASHAScheduler(max_t=3, grace_period=1, reduction_factor=2)
tuner = tune.Tuner(
    lightning_trainer,
    param_space={"lightning_config": lightning_config},
    tune_config=tune.TuneConfig(
        metric="val_loss",
        mode="min",
        num_samples=2,
        scheduler=scheduler,
    )
)

  tuner = tune.Tuner(


In [45]:
tuner.fit()


0,1
Current time:,2023-07-04 16:57:10
Running for:,00:26:25.20
Memory:,4.4/12.4 GiB

Trial name,status,loc,...config/_module_in it_config/hparams/lr,iter,total time (s),train_loss,train_loss_step,val_loss
LightningTrainer_5ca9d_00000,TERMINATED,172.28.36.36:20722,0.000350365,3,1580.44,0.212312,0.215088,0.197284
LightningTrainer_5ca9d_00001,TERMINATED,172.28.36.36:20723,0.0483082,3,1577.92,0.971411,0.985242,0.966885


[2m[36m(LightningTrainer pid=20722)[0m 2023-07-04 16:30:50,355	INFO backend_executor.py:137 -- Starting distributed worker processes: ['20853 (172.28.36.36)']
[2m[36m(RayTrainWorker pid=20853)[0m 2023-07-04 16:30:51,062	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=20853)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=20853)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=20853)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=20853)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=20853)[0m You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set

Trial name,_report_on,date,done,epoch,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,step,time_since_restore,time_this_iter_s,time_total_s,timestamp,train_loss,train_loss_epoch,train_loss_step,training_iteration,trial_id,val_loss,val_loss_epoch
LightningTrainer_5ca9d_00000,train_epoch_end,2023-07-04_16-57-09,True,2,DESKTOP-S12DUI1,3,172.28.36.36,20722,True,2052,1580.44,519.196,1580.44,1688482629,0.212312,0.212312,0.215088,3,5ca9d_00000,0.197284,0.197284
LightningTrainer_5ca9d_00001,train_epoch_end,2023-07-04_16-57-07,True,2,DESKTOP-S12DUI1,3,172.28.36.36,20723,True,2052,1577.92,518.311,1577.92,1688482627,0.971411,0.971411,0.985242,3,5ca9d_00001,0.966885,0.966885


2023-07-04 16:57:10,521	INFO tune.py:1111 -- Total run time: 1585.26 seconds (1585.15 seconds for the tuning loop).


ResultGrid<[
  Result(
    metrics={'_report_on': 'train_epoch_end', 'train_loss': 0.212312251329422, 'train_loss_step': 0.21508750319480896, 'val_loss': 0.19728408753871918, 'val_loss_epoch': 0.19728408753871918, 'train_loss_epoch': 0.212312251329422, 'epoch': 2, 'step': 2052, 'should_checkpoint': True, 'done': True, 'trial_id': '5ca9d_00000', 'experiment_tag': '0_lr=0.0004'},
    path='/home/jsem/ray_results/LightningTrainer_2023-07-04_16-30-36/LightningTrainer_5ca9d_00000_0_lr=0.0004_2023-07-04_16-30-45',
    checkpoint=LightningCheckpoint(local_path=/home/jsem/ray_results/LightningTrainer_2023-07-04_16-30-36/LightningTrainer_5ca9d_00000_0_lr=0.0004_2023-07-04_16-30-45/checkpoint_000002)
  ),
  Result(
    metrics={'_report_on': 'train_epoch_end', 'train_loss': 0.9714113473892212, 'train_loss_step': 0.9852420091629028, 'val_loss': 0.9668849110603333, 'val_loss_epoch': 0.9668849110603333, 'train_loss_epoch': 0.9714113473892212, 'epoch': 2, 'step': 2052, 'should_checkpoint': True, 'do