## Prepare Data path and load cfg

By setting the `L5KIT_DATA_FOLDER` variable, we can point the script to the folder where the data lies.

Then, we load our config file with relative paths and other configurations (rasteriser, training params...).

In [1]:
from pathlib import Path
import os

In [2]:
# NOTE: Don't use relative paths for the models provided by L5.
# All folders hang off an absolute "Experiments" directory located two levels
# above the current working directory.
experiments_directory = Path(os.path.abspath("")).parent.parent / "Experiments"
data_directory = experiments_directory / "data"
prediction_directory = experiments_directory / "prediction"
save_directory = prediction_directory / "saved_outputs"

# Create the whole tree in one pass; folders that already exist are left alone.
for _directory in (
    experiments_directory,
    data_directory,
    prediction_directory,
    save_directory,
):
    _directory.mkdir(parents=True, exist_ok=True)

In [3]:
import os
# Make the prediction folder the working directory so that relative paths used
# by later cells (e.g. the requirements.txt file) resolve inside it.
os.chdir(prediction_directory)

In [4]:
%%writefile requirements.txt
l5kit
pyyaml
ray==2.0.0rc1
ray[air]
wandb
optuna

Overwriting requirements.txt


In [5]:
%%capture
# !pip install -r requirements.txt
!pip install l5kit pyyaml wandb
!pip install ray==2.0.0rc1
!pip install "ray[air]"
!pip install optuna

In [6]:
import wandb
# Authenticate this session with Weights & Biases before any run is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33ma-sh0ts[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
from typing import Dict

from tempfile import gettempdir
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision.models.resnet import resnet50
from tqdm import tqdm

from l5kit.configs import load_config_data
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv, compute_metrics_csv, read_gt_csv, create_chopped_dataset
from l5kit.evaluation.chop_dataset import MIN_FUTURE_STEPS
from l5kit.evaluation.metrics import neg_multi_log_likelihood, time_displace
from l5kit.geometry import transform_points
from l5kit.visualization import PREDICTED_POINTS_COLOR, TARGET_POINTS_COLOR, draw_trajectory
from prettytable import PrettyTable
from pathlib import Path

import os

In [8]:
# Run information: metadata handed to `wandb.init` for the data-download run.
wandb_entity = "l5-demo"  # W&B entity the run is created under
project_name = "l5-prediction"  # W&B project; also the prefix of the "-trials" project used for tuning
run_name = "download-l5-data"
run_type = "download"  # passed to `wandb.init` as `job_type`
# Free-text description, passed to `wandb.init` as `notes`.
run_description = """
Download data for the task of training a prediction model
"""
tags = ["download", "data"]

In [9]:
# Start the W&B run used for the dataset download, using the run metadata
# defined in the previous cell.
run = wandb.init(
    entity=wandb_entity,
    project=project_name,
    job_type=run_type,
    name=run_name,
    notes=run_description,
    tags=tags
)

[34m[1mwandb[0m: Currently logged in as: [33ma-sh0ts[0m ([33ml5-demo[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# Coordinates of the W&B artifact that holds the L5 data. They are combined as
# "<entity>/<project>/<name>:<alias>" when the artifact is requested.
artifact_entity = "l5-demo"
artifact_project = "l5-common"
artifact_name = "l5-data"
artifact_alias = "latest"
artifact_type = "dataset"

In [11]:
# Request the dataset artifact through the active run, addressed by its full
# "<entity>/<project>/<name>:<alias>" path.
artifact = run.use_artifact(f"{artifact_entity}/{artifact_project}/{artifact_name}:{artifact_alias}", type=artifact_type)

In [12]:
# Download the artifact's files into the data directory; the return value is
# not needed, so it is discarded.
_ = artifact.download(data_directory)

[34m[1mwandb[0m: Downloading large artifact l5-data:latest, 2386.92MB. 517 files... Done. 0:0:0.1


In [13]:
# BUG: runs need to be separated into download and training because of issues
# with routing runs after ray.tune, so the download run is closed here.
run.finish()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [14]:
# The dataset is assumed to live in the folder named by the
# L5KIT_DATA_FOLDER environment variable.

# Load the agent-motion configuration from the data directory and remember
# where the dataset folder sits next to it.
cfg = load_config_data(Path(data_directory, "configurations", "agent_motion_config.yaml"))
l5_data_location = Path(data_directory, "dataset")
# run.config.update(cfg)

In [15]:
# cfg["zarr_dataset_location"] = l5_data_location
# Point the L5KIT_DATA_FOLDER environment variable at the dataset folder;
# environment values must be strings, hence the str() conversion.
os.environ["L5KIT_DATA_FOLDER"] = str(l5_data_location)

## Model

Our baseline is a simple `resnet50` pretrained on `imagenet`. We must replace the input and the final layer to address our requirements.

In [16]:
def build_model(cfg: Dict) -> torch.nn.Module:
    """Build the ResNet-50 baseline adapted to the raster input and trajectory output.

    Args:
        cfg: Configuration dictionary. Only
            ``cfg["model_params"]["history_num_frames"]`` and
            ``cfg["model_params"]["future_num_frames"]`` are read.

    Returns:
        A ``resnet50`` whose first convolution accepts
        ``3 + 2 * (history_num_frames + 1)`` input channels and whose final
        linear layer emits ``2 * future_num_frames`` values.

    Raises:
        KeyError: If one of the configuration keys above is missing.
    """
    # Read the configuration first so a bad config fails before any weights
    # are loaded.
    model_params = cfg["model_params"]
    num_history_channels = (model_params["history_num_frames"] + 1) * 2
    num_in_channels = 3 + num_history_channels
    # output size is (X, Y) * number of future states
    num_targets = 2 * model_params["future_num_frames"]

    # load pre-trained Conv2D model
    # NOTE(review): recent torchvision releases warn that `pretrained` is
    # deprecated; the flag is kept so older installs keep working.
    model = resnet50(pretrained=True)

    # change input channels number to match the rasterizer's output; the
    # replacement layer is newly created but mirrors the original geometry.
    stem = model.conv1
    model.conv1 = nn.Conv2d(
        num_in_channels,
        stem.out_channels,
        kernel_size=stem.kernel_size,
        stride=stem.stride,
        padding=stem.padding,
        bias=False,
    )

    # Take the head's input width from the existing layer instead of
    # hard-coding it, so the two cannot drift apart.
    model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_targets)

    return model

In [17]:
def forward(data, model, criterion):
    """Run the model on one batch and compute the availability-masked loss.

    Args:
        data: Mapping providing ``"image"``, ``"target_availabilities"`` and
            ``"target_positions"``.
        model: Callable applied to ``data["image"]``; its output is reshaped
            to the shape of the targets.
        criterion: Callable returning an element-wise loss for
            ``(outputs, targets)``.

    Returns:
        Tuple ``(loss, outputs)``: the mean of the masked element-wise loss
        and the reshaped model outputs.
    """
    target_positions = data["target_positions"]
    # A trailing singleton axis is appended to the availabilities before they
    # are multiplied with the element-wise loss.
    availability_mask = data["target_availabilities"].unsqueeze(-1)

    # Forward pass
    predictions = model(data["image"]).reshape(target_positions.shape)

    # not all the output steps are valid; unavailable ones are zeroed out by
    # the mask before averaging over every element.
    masked_loss = criterion(predictions, target_positions) * availability_mask
    return masked_loss.mean(), predictions

In [18]:
def train_prediction_model_epoch(data, model, criterion, optimizer):
    """Perform a single optimisation step on one batch.

    Despite the name, the function handles exactly one batch: it evaluates
    ``forward`` once, back-propagates the loss and steps the optimizer.

    Returns:
        Tuple ``(loss, outputs)`` as produced by ``forward``.
    """
    batch_loss, predictions = forward(data, model, criterion)

    # Backward pass: clear stale gradients, back-propagate, then update.
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    return batch_loss, predictions

## Load the Train Data

Our data pipeline maps a raw `.zarr` folder into a multi-processing instance ready for training by:
- loading the `zarr` into a `ChunkedDataset` object. This object has a reference to the different arrays in the zarr (e.g. agents and traffic lights);
- wrapping the `ChunkedDataset` into an `AgentDataset`, which inherits from the torch `Dataset` class;
- passing the `AgentDataset` into a torch `DataLoader`.

In [19]:
import ray.train as train
from ray.air import session, Checkpoint

In [20]:
from ray import tune
from ray.tune.tuner import Tuner

In [21]:
def train_prediction_model(tuner_cfg : Dict):
    """Per-worker training loop for one trial.

    Builds the dataset, data loader, model and optimizer from ``tuner_cfg``,
    then runs ``max_num_steps`` optimisation steps, reporting metrics through
    ``session.report`` after every step.

    Args:
        tuner_cfg: Trial configuration. Keys read:
            ``"shuffle"``, ``"num_workers"`` (both forwarded to the
            ``DataLoader``), ``"batch_size"`` (global batch size, split across
            workers), ``"lr"``, ``"max_num_steps"``, ``"dataset_key"`` and
            ``"cfg"`` (the configuration passed to the rasterizer, the dataset
            and ``build_model``). ``batch_size`` and ``max_num_steps`` are
            cast to ``int``.

    Reported metrics:
        ``"loss"`` (current step) and ``"avg_loss"`` (mean over all steps so
        far). A checkpoint holding ``step`` and ``model`` is attached about
        ``num_checkpoints`` times per run and always on the final step.
    """
    # ==== INIT DATASET
    dm = LocalDataManager()

    shuffle = tuner_cfg["shuffle"]
    batch_size = int(tuner_cfg["batch_size"])
    num_workers = tuner_cfg["num_workers"]
    lr = tuner_cfg["lr"]
    max_num_steps = int(tuner_cfg["max_num_steps"])
    dataset_key = tuner_cfg["dataset_key"]
    cfg = tuner_cfg["cfg"]

    rasterizer = build_rasterizer(cfg, dm)

    train_zarr = ChunkedDataset(dm.require(dataset_key)).open()
    train_dataset = AgentDataset(cfg, train_zarr, rasterizer)

    # Split the global batch across workers; never drop below one sample,
    # which would otherwise happen when batch_size < world size.
    batch_size_per_worker = max(1, batch_size // session.get_world_size())
    train_dataloader = DataLoader(train_dataset, shuffle=shuffle, batch_size=batch_size_per_worker, num_workers=num_workers)
    train_dataloader = train.torch.prepare_data_loader(train_dataloader)

    # ==== INIT MODEL
    model = build_model(cfg)
    model = train.torch.prepare_model(model)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss(reduction="none")

    # ==== TRAIN LOOP
    num_checkpoints = 5
    # 0 when the run is shorter than num_checkpoints steps; in that case only
    # the final step is checkpointed.
    steps_before_checkpointing = max_num_steps // num_checkpoints
    last_step = max_num_steps - 1
    # NOTE: To prevent stdout bloat, only one worker prints the metrics.
    is_rank_zero = session.get_world_rank() == 0

    # Nothing inside the loop switches these off, so set them once up front.
    model.train()
    torch.set_grad_enabled(True)

    tr_it = iter(train_dataloader)
    losses_train = []
    for step in range(max_num_steps):
        try:
            data = next(tr_it)
        except StopIteration:
            # Data loader exhausted: start another pass over the dataset.
            tr_it = iter(train_dataloader)
            data = next(tr_it)

        loss, _ = train_prediction_model_epoch(data, model, criterion, optimizer)
        loss_value = loss.item()
        losses_train.append(loss_value)
        metrics = {
            "loss": loss_value,
            "avg_loss": np.mean(losses_train)
        }

        if is_rank_zero:
            print(metrics)

        on_interval = (
            steps_before_checkpointing > 0
            and step % steps_before_checkpointing == 0
        )
        if on_interval or step == last_step:
            # NOTE(review): the checkpoint stores the model object itself (as
            # returned by prepare_model) rather than a state_dict; the format
            # is kept unchanged for existing consumers.
            session.report(
                metrics=metrics,
                checkpoint=Checkpoint.from_dict(dict(step=step, model=model)),
            )
        else:
            session.report(metrics=metrics)

# Training

Note: if you're on macOS and using the `py_satellite` rasterizer, you may need to disable OpenCV's internal multithreading by adding
`cv2.setNumThreads(0)` before the following cell. This seems to only affect running in a Python notebook and is caused by the `cv2.warpAffine` function.

In [22]:
from ray.train.torch import TorchTrainer
from ray.air.config import RunConfig, ScalingConfig
from ray.air.callbacks.wandb import WandbLoggerCallback

In [23]:
# Wrap the per-worker training loop in a trainer that runs it on 3 CPU
# workers. No loop configuration is given here; it is supplied later through
# the Tuner's `param_space` under "train_loop_config".
trainer = TorchTrainer(
    train_loop_per_worker=train_prediction_model,
    scaling_config=ScalingConfig(num_workers=3, use_gpu=False), #TODO: Add logic to check if GPU is available here
)

2022-08-19 15:45:55,144	INFO worker.py:1487 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m.


In [24]:
# Configuration handed to `train_prediction_model` for every trial.
tuner_train_config = {
    ## static: copied from the train data loader section of the config
    "shuffle": cfg["train_data_loader"]["shuffle"],
    # forwarded to the DataLoader's `num_workers` inside the training loop
    "num_workers": cfg["train_data_loader"]["num_workers"],
    "dataset_key": cfg["train_data_loader"]["key"],
    ## tunable: search-space definitions resolved per trial
    # "max_num_steps": tune.quniform(50, 500, 50),
    "max_num_steps": tune.quniform(5, 50, 5),
    "lr": tune.loguniform(1e-3, 1e-2),
    "batch_size": tune.quniform(6, 24, 3),
    # the full config is passed along for the rasterizer, dataset and model
    "cfg": cfg,
}

In [25]:
from ray.tune.logger import LoggerCallback
from typing import Dict, List

In [26]:
# ## For now passing the current run context to this callback to better organize the models logged
# class TrialEndModelSaveCallback(LoggerCallback):
    
#     def __init__(self, run, save_directory):
#         self.run = run
#         self.save_directory = save_directory
    
#     def on_trial_complete(self, iteration: int, trials: List["Trial"],
#                           trial: "Trial", **info):
#         print("here")
#         trial_name = trial._trainable_name(include_trial_id=True)
#         print(trial_name)
#         final_model_checkpoint = trial.checkpoint.to_air_checkpoint().to_dict()
#         print(final_model_checkpoint)
#         model = final_model_checkpoint["model"]
#         print(model)
#         final_model = torch.jit.script(model.cpu())
#         path_to_save = f"{trial_name}-trained_model"
#         final_model.save(path_to_save)

In [27]:
from ray.tune.stopper import ExperimentPlateauStopper
from ray.tune.search.optuna import OptunaSearch

In [28]:
# Number of hyperparameter samples to try; passed to TuneConfig as `num_samples`.
n_search_attempts = 10

In [29]:
# Search algorithm handed to TuneConfig as `search_alg`, created with its defaults.
optuna_search = OptunaSearch()

In [30]:
# Assemble the hyperparameter search around the trainer: minimise "avg_loss"
# over `n_search_attempts` samples, stop on a plateau of that metric and log
# every trial (including checkpoints) to the "<project_name>-trials" project.
tuner = Tuner(
    trainer,
    tune_config=tune.TuneConfig(
        metric="avg_loss",  # loss or avg_loss here?
        mode="min",
        search_alg=optuna_search,
        num_samples=n_search_attempts,
    ),
    param_space={"train_loop_config": tuner_train_config},
    run_config=RunConfig(
        stop=ExperimentPlateauStopper("avg_loss"),
        callbacks=[
            WandbLoggerCallback(project=f"{project_name}-trials", save_checkpoints=True),
            # TrialEndModelSaveCallback(run, save_directory)
        ],
    ),
)

  


In [31]:
# Launch the hyperparameter search and keep the value it returns in `analysis`.
analysis = tuner.fit()

[2m[36m(RayTrainWorker pid=16406)[0m {'loss': 80.25357055664062, 'avg_loss': 71.58842504435572}
Result for TorchTrainer_5c2b750a:
  _time_this_iter_s: 7.779364585876465
  _timestamp: 1660924500
  _training_iteration: 29
  avg_loss: 71.58842504435572
  date: 2022-08-19_15-55-01
  done: false
  experiment_id: eae92d0a63fc4567a5392ff8a089dc67
  hostname: anish-l5-kit
  iterations_since_restore: 29
  loss: 80.25357055664062
  node_ip: 10.150.0.3
  pid: 16328
  time_since_restore: 248.99855041503906
  time_this_iter_s: 7.430035829544067
  time_total_s: 248.99855041503906
  timestamp: 1660924501
  timesteps_since_restore: 0
  training_iteration: 29
  trial_id: 5c2b750a
  warmup_time: 0.005020856857299805
  
Result for TorchTrainer_5c2b750a:
  _time_this_iter_s: 7.822077035903931
  _timestamp: 1660924508
  _training_iteration: 30
  avg_loss: 71.98505894343059
  date: 2022-08-19_15-55-08
  done: false
  experiment_id: eae92d0a63fc4567a5392ff8a089dc67
  hostname: anish-l5-kit
  iterations_si

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_5c2b750a_3_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-50-48/checkpoint_000004)... 

Result for TorchTrainer_5c2b750a:
  _time_this_iter_s: 8.145943641662598
  _timestamp: 1660924560
  _training_iteration: 37
  avg_loss: 68.69699362161997
  date: 2022-08-19_15-56-00
  done: false
  experiment_id: eae92d0a63fc4567a5392ff8a089dc67
  hostname: anish-l5-kit
  iterations_since_restore: 37
  loss: 67.09259033203125
  node_ip: 10.150.0.3
  pid: 16328
  should_checkpoint: true
  time_since_restore: 308.62834787368774
  time_this_iter_s: 8.510472297668457
  time_total_s: 308.62834787368774
  timestamp: 1660924560
  timesteps_since_restore: 0
  training_iteration: 37
  trial_id: 5c2b750a
  warmup_time: 0.005020856857299805
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=16406)[0m {'loss': 36.63917541503906, 'avg_loss': 67.85336682670994}
Result for TorchTrainer_5c2b750a:
  _time_this_iter_s: 8.376948118209839
  _timestamp: 1660924568
  _training_iteration: 38
  avg_loss: 67.85336682670994
  date: 2022-08-19_15-56-08
  done: false
  experiment_id: eae92d0a63fc4567a5392ff8a089dc67
  hostname: anish-l5-kit
  iterations_since_restore: 38
  loss: 36.63917541503906
  node_ip: 10.150.0.3
  pid: 16328
  time_since_restore: 316.6358058452606
  time_this_iter_s: 8.007457971572876
  time_total_s: 316.6358058452606
  timestamp: 1660924568
  timesteps_since_restore: 0
  training_iteration: 38
  trial_id: 5c2b750a
  warmup_time: 0.005020856857299805
  
Result for TorchTrainer_5c2b750a:
  _time_this_iter_s: 7.18516206741333
  _timestamp: 1660924575
  _training_iteration: 39
  avg_loss: 67.7737532151051
  date: 2022-08-19_15-56-15
  done: false
  experiment_id: eae92d0a63fc4567a5392ff8a089dc67
  hostname: anish-l5-kit
  iterations_since_

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_5c2b750a_3_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-50-48/checkpoint_000005)... 

Result for TorchTrainer_5c2b750a:
  _time_this_iter_s: 7.601074934005737
  _timestamp: 1660924618
  _training_iteration: 45
  avg_loss: 70.17650356292725
  date: 2022-08-19_15-56-59
  done: false
  experiment_id: eae92d0a63fc4567a5392ff8a089dc67
  hostname: anish-l5-kit
  iterations_since_restore: 45
  loss: 41.149383544921875
  node_ip: 10.150.0.3
  pid: 16328
  should_checkpoint: true
  time_since_restore: 367.3895263671875
  time_this_iter_s: 8.216518878936768
  time_total_s: 367.3895263671875
  timestamp: 1660924619
  timesteps_since_restore: 0
  training_iteration: 45
  trial_id: 5c2b750a
  warmup_time: 0.005020856857299805
  


Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_5c2b750a_3_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-50-48/checkpoint_000005)... Done. 0.3s


VBox(children=(Label(value='544.901 MB of 544.901 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

0,1
avg_loss,▂▁▁▅▅▆▇█▇▇▇▆▆▆▆▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
iterations_since_restore,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
loss,▂▁▁█▃▆█▇▂▅▁▂▄▁▄▇▄▄▂▄▂▁▆▂▁▄▄▃▂▂▂▅▃▂▃▆▃▅▃▂
time_since_restore,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
time_this_iter_s,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time_total_s,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
warmup_time,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,70.1765
iterations_since_restore,45.0
loss,41.14938
time_since_restore,367.38953
time_this_iter_s,8.21652
time_total_s,367.38953
timestamp,1660924619.0
timesteps_since_restore,0.0
training_iteration,45.0
warmup_time,0.00502


Result for TorchTrainer_5c2b750a:
  _time_this_iter_s: 7.601074934005737
  _timestamp: 1660924618
  _training_iteration: 45
  avg_loss: 70.17650356292725
  date: 2022-08-19_15-56-59
  done: true
  experiment_id: eae92d0a63fc4567a5392ff8a089dc67
  experiment_tag: 3_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0013,max_num_steps=45.0000,num_workers=16,shuffle=True
  host

[2m[36m(RayTrainWorker pid=18437)[0m 2022-08-19 15:57:19,295	INFO config.py:72 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=18438)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=18438)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=18439)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=18439)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=18437)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=18437)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=18437)[0m 2022-08-19 15:57:28,762	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=18437)[0m 2022-08-19 15:57:28,764	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 0.05597818270325661, 'avg_loss': 0.05597818270325661}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_b210d294_4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_15-57-10/checkpoint_000000)... 

Result for TorchTrainer_b210d294:
  _time_this_iter_s: 22.098339557647705
  _timestamp: 1660924663
  _training_iteration: 1
  avg_loss: 0.05597818270325661
  date: 2022-08-19_15-57-43
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 1
  loss: 0.05597818270325661
  node_ip: 10.150.0.3
  pid: 18371
  should_checkpoint: true
  time_since_restore: 27.949078798294067
  time_this_iter_s: 27.949078798294067
  time_total_s: 27.949078798294067
  timestamp: 1660924663
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 55.00377655029297, 'avg_loss': 27.529877366498113}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.160595417022705
  _timestamp: 1660924670
  _training_iteration: 3
  avg_loss: 28.282128454496462
  date: 2022-08-19_15-57-50
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 3
  loss: 29.786630630493164
  node_ip: 10.150.0.3
  pid: 18371
  time_since_restore: 34.59441375732422
  time_this_iter_s: 3.054212808609009
  time_total_s: 34.59441375732422
  timestamp: 1660924670
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 29.786630630493164, 'avg_loss': 28.282128454496462}
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 20.87142562866211, 'avg_loss': 26.429452748037875}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.0282301902770996
  _timestamp:

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_b210d294_4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_15-57-10/checkpoint_000001)... 

Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.2043275833129883
  _timestamp: 1660924689
  _training_iteration: 9
  avg_loss: 27.76012138567037
  date: 2022-08-19_15-58-09
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 9
  loss: 23.193696975708008
  node_ip: 10.150.0.3
  pid: 18371
  should_checkpoint: true
  time_since_restore: 54.14268231391907
  time_this_iter_s: 3.6067657470703125
  time_total_s: 54.14268231391907
  timestamp: 1660924689
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 241.56051635742188, 'avg_loss': 49.14016088284552}
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 60.305362701416016, 'avg_loss': 50.15517922998829}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.008502244949341
  _timestamp: 1660924696
  _training_iteration: 11
  avg_loss: 50.15517922998829
  date: 2022-08-19_15-58-16
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 11
  loss: 60.305362701416016
  node_ip: 10.150.0.3
  pid: 18371
  time_since_restore: 60.565850496292114
  time_this_iter_s: 2.964409828186035
  time_total_s: 60.565850496292114
  timestamp: 1660924696
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 66.62116241455078, 'avg_loss': 51.5273444953685}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.031351327896118
  _timestamp:

[2m[36m(RayTrainWorker pid=18438)[0m E0819 15:58:28.294645162   18495 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"
[2m[36m(RayTrainWorker pid=18437)[0m E0819 15:58:28.870993948   18526 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 272.6679382324219, 'avg_loss': 67.68197158747353}
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 396.6945495605469, 'avg_loss': 87.03565264471314}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_b210d294_4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_15-57-10/checkpoint_000002)... 

Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.3343183994293213
  _timestamp: 1660924714
  _training_iteration: 17
  avg_loss: 87.03565264471314
  date: 2022-08-19_15-58-34
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 17
  loss: 396.6945495605469
  node_ip: 10.150.0.3
  pid: 18371
  should_checkpoint: true
  time_since_restore: 79.45311427116394
  time_this_iter_s: 3.6386425495147705
  time_total_s: 79.45311427116394
  timestamp: 1660924714
  timesteps_since_restore: 0
  training_iteration: 17
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 44.2292594909668, 'avg_loss': 84.65751969172723}
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 254.65872192382812, 'avg_loss': 93.6049513881536}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.1123738288879395
  _timestamp: 1660924721
  _training_iteration: 19
  avg_loss: 93.6049513881536
  date: 2022-08-19_15-58-41
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 19
  loss: 254.65872192382812
  node_ip: 10.150.0.3
  pid: 18371
  time_since_restore: 85.8903923034668
  time_this_iter_s: 3.0883049964904785
  time_total_s: 85.8903923034668
  timestamp: 1660924721
  timesteps_since_restore: 0
  training_iteration: 19
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 110.3552474975586, 'avg_loss': 94.44246619362384}
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 446.8946838378906, 'avg_loss': 111.2259051

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_b210d294_4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_15-57-10/checkpoint_000003)... 

Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.2591047286987305
  _timestamp: 1660924739
  _training_iteration: 25
  avg_loss: 97.09774805560708
  date: 2022-08-19_15-59-00
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 25
  loss: 16.490636825561523
  node_ip: 10.150.0.3
  pid: 18371
  should_checkpoint: true
  time_since_restore: 104.6502296924591
  time_this_iter_s: 3.6366961002349854
  time_total_s: 104.6502296924591
  timestamp: 1660924740
  timesteps_since_restore: 0
  training_iteration: 25
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 13.709905624389648, 'avg_loss': 93.8905233467141}
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 11.674281120300293, 'avg_loss': 90.84547733832841}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.0245563983917236
  _timestamp: 1660924746
  _training_iteration: 27
  avg_loss: 90.84547733832841
  date: 2022-08-19_15-59-07
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 27
  loss: 11.674281120300293
  node_ip: 10.150.0.3
  pid: 18371
  time_since_restore: 111.48789548873901
  time_this_iter_s: 3.007246494293213
  time_total_s: 111.48789548873901
  timestamp: 1660924747
  timesteps_since_restore: 0
  training_iteration: 27
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 282.9053955078125, 'avg_loss': 97.70476013009569}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.0371286869049072
  _timestam

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_b210d294_4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_15-57-10/checkpoint_000004)... 

Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.2090699672698975
  _timestamp: 1660924765
  _training_iteration: 33
  avg_loss: 91.37420106306672
  date: 2022-08-19_15-59-25
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 33
  loss: 54.12888717651367
  node_ip: 10.150.0.3
  pid: 18371
  should_checkpoint: true
  time_since_restore: 130.25680255889893
  time_this_iter_s: 3.603100299835205
  time_total_s: 130.25680255889893
  timestamp: 1660924765
  timesteps_since_restore: 0
  training_iteration: 33
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 18.94083595275879, 'avg_loss': 89.24380797158707}
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 75.70962524414062, 'avg_loss': 88.85711703651718}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 2.897050142288208
  _timestamp: 1660924772
  _training_iteration: 35
  avg_loss: 88.85711703651718
  date: 2022-08-19_15-59-32
  done: false
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  hostname: anish-l5-kit
  iterations_since_restore: 35
  loss: 75.70962524414062
  node_ip: 10.150.0.3
  pid: 18371
  time_since_restore: 136.59838557243347
  time_this_iter_s: 2.977599620819092
  time_total_s: 136.59838557243347
  timestamp: 1660924772
  timesteps_since_restore: 0
  training_iteration: 35
  trial_id: b210d294
  warmup_time: 0.004952669143676758
  
[2m[36m(RayTrainWorker pid=18437)[0m {'loss': 296.2373962402344, 'avg_loss': 94.61768034773155}
Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.0586187839508057
  _timestamp: 

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_b210d294_4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_15-57-10/checkpoint_000005)... Done. 1.1s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_b210d294_4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_15-57-10/checkpoint_000005)... Done. 0.2s


VBox(children=(Label(value='544.878 MB of 544.878 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

0,1
avg_loss,▁▃▃▃▃▃▃▃▃▄▄▄▅▅▄▅▆▆▇▇███▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
iterations_since_restore,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,▁▂▁▁▁▁▂▁▁▅▂▂▃▂▁▅▇▂▅▃█▁▂▁▁▁▁▅▃▂▂▁▂▁▂▆▂▃▂▂
time_since_restore,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
time_this_iter_s,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time_total_s,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
warmup_time,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,92.68652
iterations_since_restore,40.0
loss,94.94113
time_since_restore,152.46501
time_this_iter_s,3.59566
time_total_s,152.46501
timestamp,1660924788.0
timesteps_since_restore,0.0
training_iteration,40.0
warmup_time,0.00495


Result for TorchTrainer_b210d294:
  _time_this_iter_s: 3.1542234420776367
  _timestamp: 1660924787
  _training_iteration: 40
  avg_loss: 92.68651543566958
  date: 2022-08-19_15-59-48
  done: true
  experiment_id: 7e0c73ca27d24576a80ec06b655889a1
  experiment_tag: 4_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0049,max_num_steps=40.0000,num_workers=16,shuffle=True
  host

[2m[36m(RayTrainWorker pid=19586)[0m 2022-08-19 16:00:05,558	INFO config.py:72 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=19586)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=19586)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=19586)[0m 2022-08-19 16:00:12,965	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=19586)[0m 2022-08-19 16:00:12,967	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=19587)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=19587)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=19588)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=19588)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 0.1509147733449936, 'avg_loss': 0.1509147733449936}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_95b103b6_5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-59-57/checkpoint_000000)... 

Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 27.314635753631592
  _timestamp: 1660924834
  _training_iteration: 1
  avg_loss: 0.1509147733449936
  date: 2022-08-19_16-00-34
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 1
  loss: 0.1509147733449936
  node_ip: 10.150.0.3
  pid: 19519
  should_checkpoint: true
  time_since_restore: 33.83841395378113
  time_this_iter_s: 33.83841395378113
  time_total_s: 33.83841395378113
  timestamp: 1660924834
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 35.1248664855957, 'avg_loss': 17.63789062947035}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.694731950759888
  _timestamp: 1660924840
  _training_iteration: 2
  avg_loss: 17.63789062947035
  date: 2022-08-19_16-00-40
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 2
  loss: 35.1248664855957
  node_ip: 10.150.0.3
  pid: 19519
  time_since_restore: 39.16262364387512
  time_this_iter_s: 5.324209690093994
  time_total_s: 39.16262364387512
  timestamp: 1660924840
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 6.746425151824951, 'avg_loss': 14.007402136921883}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 4.809437990188599
  _timestamp: 1660924849
  _training_iteration: 4
  avg_loss: 14.949169423431158
  date: 2022-08-19_16-00-49
  done: fa

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_95b103b6_5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-59-57/checkpoint_000001)... 

Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.368699073791504
  _timestamp: 1660924865
  _training_iteration: 7
  avg_loss: 57.88644424293722
  date: 2022-08-19_16-01-05
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 7
  loss: 91.57981872558594
  node_ip: 10.150.0.3
  pid: 19519
  should_checkpoint: true
  time_since_restore: 64.29893851280212
  time_this_iter_s: 5.795738697052002
  time_total_s: 64.29893851280212
  timestamp: 1660924865
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 56.08512878417969, 'avg_loss': 57.66127981059253}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.429559230804443
  _timestamp: 1660924870
  _training_iteration: 8
  avg_loss: 57.66127981059253
  date: 2022-08-19_16-01-10
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 8
  loss: 56.08512878417969
  node_ip: 10.150.0.3
  pid: 19519
  time_since_restore: 69.38658785820007
  time_this_iter_s: 5.087649345397949
  time_total_s: 69.38658785820007
  timestamp: 1660924870
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 17.661470413208008, 'avg_loss': 53.216856544216476}
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 182.89297485351562, 'avg_loss': 66.1844683751464}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 4.669770002365112
  _timestamp: 166

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_95b103b6_5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-59-57/checkpoint_000002)... 

Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 4.996649265289307
  _timestamp: 1660924894
  _training_iteration: 13
  avg_loss: 76.6893324565429
  date: 2022-08-19_16-01-34
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 13
  loss: 133.1661376953125
  node_ip: 10.150.0.3
  pid: 19519
  should_checkpoint: true
  time_since_restore: 93.65966534614563
  time_this_iter_s: 5.356880187988281
  time_total_s: 93.65966534614563
  timestamp: 1660924894
  timesteps_since_restore: 0
  training_iteration: 13
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 65.8344497680664, 'avg_loss': 75.91398369308028}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.441608190536499
  _timestamp: 1660924899
  _training_iteration: 14
  avg_loss: 75.91398369308028
  date: 2022-08-19_16-01-39
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 14
  loss: 65.8344497680664
  node_ip: 10.150.0.3
  pid: 19519
  time_since_restore: 98.74598908424377
  time_this_iter_s: 5.0863237380981445
  time_total_s: 98.74598908424377
  timestamp: 1660924899
  timesteps_since_restore: 0
  training_iteration: 14
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 96.32621765136719, 'avg_loss': 77.27479929029941}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 4.88898491859436
  _timestamp: 1660924909
  _training_iteration: 16
  avg_loss: 75.51531834062189
  date: 2022-08-19_16-01-49
  done: 

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_95b103b6_5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-59-57/checkpoint_000003)... 

Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.296213865280151
  _timestamp: 1660924924
  _training_iteration: 19
  avg_loss: 73.05969410742584
  date: 2022-08-19_16-02-04
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 19
  loss: 2.8612701892852783
  node_ip: 10.150.0.3
  pid: 19519
  should_checkpoint: true
  time_since_restore: 123.49116849899292
  time_this_iter_s: 5.735377311706543
  time_total_s: 123.49116849899292
  timestamp: 1660924924
  timesteps_since_restore: 0
  training_iteration: 19
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  


Done. 1.6s


[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 95.01958465576172, 'avg_loss': 74.15768863484263}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 4.622907400131226
  _timestamp: 1660924934
  _training_iteration: 21
  avg_loss: 73.30100569767612
  date: 2022-08-19_16-02-14
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 21
  loss: 56.1673469543457
  node_ip: 10.150.0.3
  pid: 19519
  time_since_restore: 133.05340480804443
  time_this_iter_s: 4.636565446853638
  time_total_s: 133.05340480804443
  timestamp: 1660924934
  timesteps_since_restore: 0
  training_iteration: 21
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 56.1673469543457, 'avg_loss': 73.30100569767612}
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 183.33221435546875, 'avg_loss': 78.30242427303033}
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 192.855224609375, 'avg_loss': 83.282980

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_95b103b6_5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-59-57/checkpoint_000004)... 

Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 4.896080017089844
  _timestamp: 1660924953
  _training_iteration: 25
  avg_loss: 84.93599374234677
  date: 2022-08-19_16-02-33
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 25
  loss: 158.32278442382812
  node_ip: 10.150.0.3
  pid: 19519
  should_checkpoint: true
  time_since_restore: 152.52749109268188
  time_this_iter_s: 5.329237937927246
  time_total_s: 152.52749109268188
  timestamp: 1660924953
  timesteps_since_restore: 0
  training_iteration: 25
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  


Done. 1.0s


Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.570523738861084
  _timestamp: 1660924958
  _training_iteration: 26
  avg_loss: 86.10326981028685
  date: 2022-08-19_16-02-38
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 26
  loss: 115.28517150878906
  node_ip: 10.150.0.3
  pid: 19519
  time_since_restore: 157.66281366348267
  time_this_iter_s: 5.135322570800781
  time_total_s: 157.66281366348267
  timestamp: 1660924958
  timesteps_since_restore: 0
  training_iteration: 26
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 115.28517150878906, 'avg_loss': 86.10326981028685}
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 76.70806121826172, 'avg_loss': 85.75529912169333}
[2m[36m(RayTrainWorker pid=19586)[0m {'loss': 36.469390869140625, 'avg_loss': 83.99508811267359}
Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 4.595788955688477
  _timestam

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_95b103b6_5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-59-57/checkpoint_000005)... 

Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.03687047958374
  _timestamp: 1660924977
  _training_iteration: 30
  avg_loss: 82.29661618421476
  date: 2022-08-19_16-02-58
  done: false
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  hostname: anish-l5-kit
  iterations_since_restore: 30
  loss: 106.94239807128906
  node_ip: 10.150.0.3
  pid: 19519
  should_checkpoint: true
  time_since_restore: 177.16992354393005
  time_this_iter_s: 5.481915473937988
  time_total_s: 177.16992354393005
  timestamp: 1660924978
  timesteps_since_restore: 0
  training_iteration: 30
  trial_id: 95b103b6
  warmup_time: 0.0052759647369384766
  


Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_95b103b6_5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_15-59-57/checkpoint_000005)... Done. 0.3s


VBox(children=(Label(value='544.904 MB of 544.904 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

0,1
avg_loss,▁▂▂▂▂▅▆▆▅▆▇▇▇▇▇▇▇▇▇▇▇▇████████
iterations_since_restore,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,▁▂▁▂▂█▄▃▂▆▅▃▅▃▄▂▄▃▁▄▃▇▇▂▆▄▃▂▁▄
time_since_restore,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
time_this_iter_s,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time_total_s,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
timestamp,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
warmup_time,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,82.29662
iterations_since_restore,30.0
loss,106.9424
time_since_restore,177.16992
time_this_iter_s,5.48192
time_total_s,177.16992
timestamp,1660924978.0
timesteps_since_restore,0.0
training_iteration,30.0
warmup_time,0.00528


Result for TorchTrainer_95b103b6:
  _time_this_iter_s: 5.03687047958374
  _timestamp: 1660924977
  _training_iteration: 30
  avg_loss: 82.29661618421476
  date: 2022-08-19_16-02-58
  done: true
  experiment_id: 90c2da4dc9894fb096d39c57feec1b5e
  experiment_tag: 5_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0022,max_num_steps=30.0000,num_workers=16,shuffle=True
  hostn

[2m[36m(RayTrainWorker pid=20992)[0m 2022-08-19 16:03:14,272	INFO config.py:72 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=20992)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=20992)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=20992)[0m 2022-08-19 16:03:22,021	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=20992)[0m 2022-08-19 16:03:22,023	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=20994)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=20994)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=20993)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=20993)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 0.07437949627637863, 'avg_loss': 0.07437949627637863}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_f95a0714_6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-03-07/checkpoint_000000)... 

Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 22.077595472335815
  _timestamp: 1660925017
  _training_iteration: 1
  avg_loss: 0.07437949627637863
  date: 2022-08-19_16-03-38
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 1
  loss: 0.07437949627637863
  node_ip: 10.150.0.3
  pid: 20925
  should_checkpoint: true
  time_since_restore: 28.142361879348755
  time_this_iter_s: 28.142361879348755
  time_total_s: 28.142361879348755
  timestamp: 1660925018
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 51.2484130859375, 'avg_loss': 25.66139629110694}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.1629509925842285
  _timestamp: 1660925025
  _training_iteration: 3
  avg_loss: 23.851089018086594
  date: 2022-08-19_16-03-45
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 3
  loss: 20.2304744720459
  node_ip: 10.150.0.3
  pid: 20925
  time_since_restore: 34.948413133621216
  time_this_iter_s: 3.1647346019744873
  time_total_s: 34.948413133621216
  timestamp: 1660925025
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 20.2304744720459, 'avg_loss': 23.851089018086594}
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 9.91662883758545, 'avg_loss': 20.367473972961307}
Result for TorchTrainer_f95a0714:[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 15.28926

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_f95a0714_6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-03-07/checkpoint_000001)... Done. 1.2s


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 60.74839782714844, 'avg_loss': 59.994856640075646}
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 82.43293762207031, 'avg_loss': 61.72086286945985}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 2.943763017654419
  _timestamp: 1660925057
  _training_iteration: 13
  avg_loss: 61.72086286945985
  date: 2022-08-19_16-04-17
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 13
  loss: 82.43293762207031
  node_ip: 10.150.0.3
  pid: 20925
  time_since_restore: 66.93296599388123
  time_this_iter_s: 3.039665460586548
  time_total_s: 66.93296599388123
  timestamp: 1660925057
  timesteps_since_restore: 0
  training_iteration: 13
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 29.4619140625, 'avg_loss': 59.41665224039129}


[2m[36m(RayTrainWorker pid=20994)[0m E0819 16:04:23.068759994   21038 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 35.433048248291016, 'avg_loss': 57.8177453075846}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.043977737426758
  _timestamp: 1660925063
  _training_iteration: 15
  avg_loss: 57.8177453075846
  date: 2022-08-19_16-04-23
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 15
  loss: 35.433048248291016
  node_ip: 10.150.0.3
  pid: 20925
  time_since_restore: 73.00475525856018
  time_this_iter_s: 3.0672671794891357
  time_total_s: 73.00475525856018
  timestamp: 1660925063
  timesteps_since_restore: 0
  training_iteration: 15
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  


[2m[36m(RayTrainWorker pid=20993)[0m E0819 16:04:23.794715976   21055 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 348.2391052246094, 'avg_loss': 75.96908030239865}
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 220.70556640625, 'avg_loss': 84.48299124968402}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.0076141357421875
  _timestamp: 1660925069
  _training_iteration: 17
  avg_loss: 84.48299124968402
  date: 2022-08-19_16-04-29
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 17
  loss: 220.70556640625
  node_ip: 10.150.0.3
  pid: 20925
  time_since_restore: 79.20130372047424
  time_this_iter_s: 3.053624391555786
  time_total_s: 79.20130372047424
  timestamp: 1660925069
  timesteps_since_restore: 0
  training_iteration: 17
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 38.35962677001953, 'avg_loss': 81.92058211192489}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.1442136764526367
  _timestamp: 16609

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_f95a0714_6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-03-07/checkpoint_000002)... Done. 1.1s


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 10.852103233337402, 'avg_loss': 103.5803328667852}
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 34.240535736083984, 'avg_loss': 100.56555907849385}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.0159080028533936
  _timestamp: 1660925088
  _training_iteration: 23
  avg_loss: 100.56555907849385
  date: 2022-08-19_16-04-48
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 23
  loss: 34.240535736083984
  node_ip: 10.150.0.3
  pid: 20925
  time_since_restore: 98.36353158950806
  time_this_iter_s: 3.0031728744506836
  time_total_s: 98.36353158950806
  timestamp: 1660925088
  timesteps_since_restore: 0
  training_iteration: 23
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 29.209794998168945, 'avg_loss': 97.59240224181364}
Result for TorchTrainer_f95a0714:[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_f95a0714_6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-03-07/checkpoint_000003)... Done. 1.0s


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 3.161508560180664, 'avg_loss': 90.99901475175284}
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 69.81918334960938, 'avg_loss': 90.3572016789606}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.1085691452026367
  _timestamp: 1660925120
  _training_iteration: 33
  avg_loss: 90.3572016789606
  date: 2022-08-19_16-05-20
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 33
  loss: 69.81918334960938
  node_ip: 10.150.0.3
  pid: 20925
  time_since_restore: 130.38098526000977
  time_this_iter_s: 3.1981353759765625
  time_total_s: 130.38098526000977
  timestamp: 1660925120
  timesteps_since_restore: 0
  training_iteration: 33
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 10.24853801727295, 'avg_loss': 88.0010645124404}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.149775505065918
  _timestamp: 16

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_f95a0714_6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-03-07/checkpoint_000004)... Done. 1.0s


[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 13.470518112182617, 'avg_loss': 96.2502840485956}
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 12.530905723571777, 'avg_loss': 94.30332176196714}
Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 2.974734306335449
  _timestamp: 1660925151
  _training_iteration: 43
  avg_loss: 94.30332176196714
  date: 2022-08-19_16-05-51
  done: false
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  hostname: anish-l5-kit
  iterations_since_restore: 43
  loss: 12.530905723571777
  node_ip: 10.150.0.3
  pid: 20925
  time_since_restore: 161.6743221282959
  time_this_iter_s: 3.0516881942749023
  time_total_s: 161.6743221282959
  timestamp: 1660925151
  timesteps_since_restore: 0
  training_iteration: 43
  trial_id: f95a0714
  warmup_time: 0.005228996276855469
  
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 123.29106903076172, 'avg_loss': 94.9621341998943}
[2m[36m(RayTrainWorker pid=20992)[0m {'loss': 107.22435760498047, 'avg_loss': 95.234

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_f95a0714_6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-03-07/checkpoint_000005)... Done. 1.6s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_f95a0714_6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-03-07/checkpoint_000005)... Done. 0.2s


VBox(children=(Label(value='544.948 MB of 544.948 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

0,1
avg_loss,▁▃▃▂▂▂▂▃▅▅▅▅▆▆▆▇███▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
iterations_since_restore,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
loss,▁▂▁▁▁▂▁▂▅▂▃▂█▅▂▆█▁▂▂▂▁▆▄▂▁▂▁█▃▄▂▅▁▁▃▂▂▁▁
time_since_restore,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
time_this_iter_s,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time_total_s,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
warmup_time,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,90.69437
iterations_since_restore,50.0
loss,24.70192
time_since_restore,184.226
time_this_iter_s,3.55457
time_total_s,184.226
timestamp,1660925174.0
timesteps_since_restore,0.0
training_iteration,50.0
warmup_time,0.00523


Result for TorchTrainer_f95a0714:
  _time_this_iter_s: 3.2022571563720703
  _timestamp: 1660925174
  _training_iteration: 50
  avg_loss: 90.6943697462976
  date: 2022-08-19_16-06-14
  done: true
  experiment_id: 2a59d13827134d63b5a4e79e3ac1cecf
  experiment_tag: 6_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0038,max_num_steps=50.0000,num_workers=16,shuffle=True
  hostn

[2m[36m(RayTrainWorker pid=22322)[0m 2022-08-19 16:06:32,962	INFO config.py:72 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=22323)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=22323)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=22324)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=22324)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=22322)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=22322)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=22322)[0m 2022-08-19 16:06:41,730	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=22322)[0m 2022-08-19 16:06:41,732	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 23.46404457092285, 'avg_loss': 23.46404457092285}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_6a04551e_7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-06-26/checkpoint_000000)... 

Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 37.16614580154419
  _timestamp: 1660925230
  _training_iteration: 1
  avg_loss: 23.46404457092285
  date: 2022-08-19_16-07-11
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 1
  loss: 23.46404457092285
  node_ip: 10.150.0.3
  pid: 22254
  should_checkpoint: true
  time_since_restore: 42.31917643547058
  time_this_iter_s: 42.31917643547058
  time_total_s: 42.31917643547058
  timestamp: 1660925231
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  


Done. 1.2s


Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 8.33419156074524
  _timestamp: 1660925239
  _training_iteration: 2
  avg_loss: 15.82620620727539
  date: 2022-08-19_16-07-19
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 2
  loss: 8.18836784362793
  node_ip: 10.150.0.3
  pid: 22254
  time_since_restore: 50.036550998687744
  time_this_iter_s: 7.717374563217163
  time_total_s: 50.036550998687744
  timestamp: 1660925239
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 8.18836784362793, 'avg_loss': 15.82620620727539}
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 12.313817024230957, 'avg_loss': 14.655409812927246}
Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.212074518203735
  _timestamp: 1660925246
  _training_iteration: 3
  avg_loss: 14.655409812927246
  date: 2022-08-19_16-07-26
  done: 

[2m[36m(RayTrainWorker pid=22324)[0m E0819 16:07:41.902977496   22362 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.285120487213135
  _timestamp: 1660925268
  _training_iteration: 6
  avg_loss: 68.41846799850464
  date: 2022-08-19_16-07-48
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 6
  loss: 138.44940185546875
  node_ip: 10.150.0.3
  pid: 22254
  time_since_restore: 79.09725451469421
  time_this_iter_s: 7.315348386764526
  time_total_s: 79.09725451469421
  timestamp: 1660925268
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 138.44940185546875, 'avg_loss': 68.41846799850464}
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 161.7787322998047, 'avg_loss': 81.75564861297607}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_6a04551e_7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-06-26/checkpoint_000001)... 

Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.473365783691406
  _timestamp: 1660925275
  _training_iteration: 7
  avg_loss: 81.75564861297607
  date: 2022-08-19_16-07-56
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 7
  loss: 161.7787322998047
  node_ip: 10.150.0.3
  pid: 22254
  should_checkpoint: true
  time_since_restore: 87.09487724304199
  time_this_iter_s: 7.997622728347778
  time_total_s: 87.09487724304199
  timestamp: 1660925276
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  


Done. 1.1s


Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 8.884138584136963
  _timestamp: 1660925284
  _training_iteration: 8
  avg_loss: 87.81594717502594
  date: 2022-08-19_16-08-04
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 8
  loss: 130.238037109375
  node_ip: 10.150.0.3
  pid: 22254
  time_since_restore: 95.42925834655762
  time_this_iter_s: 8.334381103515625
  time_total_s: 95.42925834655762
  timestamp: 1660925284
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 130.238037109375, 'avg_loss': 87.81594717502594}
Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.3057451248168945
  _timestamp: 1660925291
  _training_iteration: 9
  avg_loss: 81.75838650597467
  date: 2022-08-19_16-08-11
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_re

[2m[36m(RayTrainWorker pid=22322)[0m E0819 16:08:32.546431265   22580 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.613554239273071
  _timestamp: 1660925314
  _training_iteration: 12
  avg_loss: 76.03753225008647
  date: 2022-08-19_16-08-34
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 12
  loss: 17.452350616455078
  node_ip: 10.150.0.3
  pid: 22254
  time_since_restore: 124.92431950569153
  time_this_iter_s: 7.617091417312622
  time_total_s: 124.92431950569153
  timestamp: 1660925314
  timesteps_since_restore: 0
  training_iteration: 12
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 17.452350616455078, 'avg_loss': 76.03753225008647}
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 53.58009338378906, 'avg_loss': 74.31003695267897}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_6a04551e_7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-06-26/checkpoint_000002)... 

Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 9.93157434463501
  _timestamp: 1660925324
  _training_iteration: 13
  avg_loss: 74.31003695267897
  date: 2022-08-19_16-08-44
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 13
  loss: 53.58009338378906
  node_ip: 10.150.0.3
  pid: 22254
  should_checkpoint: true
  time_since_restore: 135.59925031661987
  time_this_iter_s: 10.674930810928345
  time_total_s: 135.59925031661987
  timestamp: 1660925324
  timesteps_since_restore: 0
  training_iteration: 13
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  


Done. 1.2s


Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 10.900946617126465
  _timestamp: 1660925334
  _training_iteration: 14
  avg_loss: 80.47588191713605
  date: 2022-08-19_16-08-54
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 14
  loss: 160.63186645507812
  node_ip: 10.150.0.3
  pid: 22254
  time_since_restore: 145.76004838943481
  time_this_iter_s: 10.160798072814941
  time_total_s: 145.76004838943481
  timestamp: 1660925334
  timesteps_since_restore: 0
  training_iteration: 14
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 160.63186645507812, 'avg_loss': 80.47588191713605}
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 72.17362213134766, 'avg_loss': 79.92239793141682}
Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 8.554540395736694
  _timestamp: 1660925343
  _training_iteration: 15
  avg_loss: 79.92239793141682
  date: 2022-08-19_16-09-03


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_6a04551e_7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-06-26/checkpoint_000003)... 

Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.708553791046143
  _timestamp: 1660925373
  _training_iteration: 19
  avg_loss: 74.18942105142693
  date: 2022-08-19_16-09-33
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 19
  loss: 41.650203704833984
  node_ip: 10.150.0.3
  pid: 22254
  should_checkpoint: true
  time_since_restore: 184.53926539421082
  time_this_iter_s: 8.195169687271118
  time_total_s: 184.53926539421082
  timestamp: 1660925373
  timesteps_since_restore: 0
  training_iteration: 19
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 60.612918853759766, 'avg_loss': 73.51059594154358}
Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 8.049840688705444
  _timestamp: 1660925381
  _training_iteration: 20
  avg_loss: 73.51059594154358
  date: 2022-08-19_16-09-41
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 20
  loss: 60.612918853759766
  node_ip: 10.150.0.3
  pid: 22254
  time_since_restore: 192.19243550300598
  time_this_iter_s: 7.653170108795166
  time_total_s: 192.19243550300598
  timestamp: 1660925381
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  
Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.267251253128052
  _timestamp: 1660925388
  _training_iteration: 21
  avg_loss: 72.35059960683186
  date: 2022-08-19_16-09-48
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_6a04551e_7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-06-26/checkpoint_000004)... 

Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.510915279388428
  _timestamp: 1660925418
  _training_iteration: 25
  avg_loss: 73.81199203491211
  date: 2022-08-19_16-10-18
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 25
  loss: 177.4814910888672
  node_ip: 10.150.0.3
  pid: 22254
  should_checkpoint: true
  time_since_restore: 229.66177582740784
  time_this_iter_s: 7.984999656677246
  time_total_s: 229.66177582740784
  timestamp: 1660925418
  timesteps_since_restore: 0
  training_iteration: 25
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  


Done. 1.1s


Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.9995033740997314
  _timestamp: 1660925426
  _training_iteration: 26
  avg_loss: 71.80168063823993
  date: 2022-08-19_16-10-26
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 26
  loss: 21.543895721435547
  node_ip: 10.150.0.3
  pid: 22254
  time_since_restore: 237.26024317741394
  time_this_iter_s: 7.5984673500061035
  time_total_s: 237.26024317741394
  timestamp: 1660925426
  timesteps_since_restore: 0
  training_iteration: 26
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 21.543895721435547, 'avg_loss': 71.80168063823993}
[2m[36m(RayTrainWorker pid=22322)[0m {'loss': 108.4175796508789, 'avg_loss': 73.15782504611545}
Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.126520156860352
  _timestamp: 1660925433
  _training_iteration: 27
  avg_loss: 73.15782504611545
  date: 2022-08-19_16-10-33


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_6a04551e_7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-06-26/checkpoint_000005)... 

Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.444295167922974
  _timestamp: 1660925455
  _training_iteration: 30
  avg_loss: 70.7570322672526
  date: 2022-08-19_16-10-55
  done: false
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  hostname: anish-l5-kit
  iterations_since_restore: 30
  loss: 40.30189895629883
  node_ip: 10.150.0.3
  pid: 22254
  should_checkpoint: true
  time_since_restore: 266.65030217170715
  time_this_iter_s: 7.902245044708252
  time_total_s: 266.65030217170715
  timestamp: 1660925455
  timesteps_since_restore: 0
  training_iteration: 30
  trial_id: 6a04551e
  warmup_time: 0.0055086612701416016
  


Done. 1.1s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_6a04551e_7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-06-26/checkpoint_000005)... Done. 0.3s


VBox(children=(Label(value='544.978 MB of 544.978 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

Result for TorchTrainer_6a04551e:
  _time_this_iter_s: 7.444295167922974
  _timestamp: 1660925455
  _training_iteration: 30
  avg_loss: 70.7570322672526
  date: 2022-08-19_16-10-55
  done: true
  experiment_id: 82a02ba1cdd74513ad9b42d380f9593d
  experiment_tag: 7_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0015,max_num_steps=30.0000,num_workers=16,shuffle=True
  hostn

0,1
avg_loss,▂▁▁▅▅▆▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▇▆▇▆▆▆
iterations_since_restore,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,▂▁▁█▃▆▇▆▂▃▅▁▃▇▄▃▄▂▂▃▃▅▂▁█▂▅▂▄▂
time_since_restore,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
time_this_iter_s,█▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time_total_s,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
warmup_time,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,70.75703
iterations_since_restore,30.0
loss,40.3019
time_since_restore,266.6503
time_this_iter_s,7.90225
time_total_s,266.6503
timestamp,1660925455.0
timesteps_since_restore,0.0
training_iteration,30.0
warmup_time,0.00551


[2m[36m(RayTrainWorker pid=23952)[0m 2022-08-19 16:11:18,287	INFO config.py:72 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=23954)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=23954)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=23953)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=23953)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=23952)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=23952)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=23952)[0m 2022-08-19 16:11:27,344	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=23952)[0m 2022-08-19 16:11:27,346	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 0.0745643600821495, 'avg_loss': 0.0745643600821495}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_e0b1c35e_8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-11-08/checkpoint_000000)... 

Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 21.705281972885132
  _timestamp: 1660925501
  _training_iteration: 1
  avg_loss: 0.0745643600821495
  date: 2022-08-19_16-11-42
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 1
  loss: 0.0745643600821495
  node_ip: 10.150.0.3
  pid: 23882
  should_checkpoint: true
  time_since_restore: 27.722006797790527
  time_this_iter_s: 27.722006797790527
  time_total_s: 27.722006797790527
  timestamp: 1660925502
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 56.8322868347168, 'avg_loss': 28.453425597399473}
Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.2081892490386963
  _timestamp: 1660925508
  _training_iteration: 3
  avg_loss: 20.219754584133625
  date: 2022-08-19_16-11-48
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 3
  loss: 3.7524125576019287
  node_ip: 10.150.0.3
  pid: 23882
  time_since_restore: 34.41284227371216
  time_this_iter_s: 3.1102631092071533
  time_total_s: 34.41284227371216
  timestamp: 1660925508
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 3.7524125576019287, 'avg_loss': 20.219754584133625}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 4.753835678100586, 'avg_loss': 16.353274857625365}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 6.51754903793335, 'avg_loss': 14.3861

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_e0b1c35e_8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-11-08/checkpoint_000001)... 

Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.2078893184661865
  _timestamp: 1660925515
  _training_iteration: 5
  avg_loss: 14.386129693686962
  date: 2022-08-19_16-11-55
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 5
  loss: 6.51754903793335
  node_ip: 10.150.0.3
  pid: 23882
  should_checkpoint: true
  time_since_restore: 41.10362005233765
  time_this_iter_s: 3.572185754776001
  time_total_s: 41.10362005233765
  timestamp: 1660925515
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 17.214265823364258, 'avg_loss': 14.857485715299845}
Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.5220329761505127
  _timestamp: 1660925522
  _training_iteration: 7
  avg_loss: 16.868711696139403
  date: 2022-08-19_16-12-02
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 7
  loss: 28.936067581176758
  node_ip: 10.150.0.3
  pid: 23882
  time_since_restore: 47.894137144088745
  time_this_iter_s: 3.4261016845703125
  time_total_s: 47.894137144088745
  timestamp: 1660925522
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 28.936067581176758, 'avg_loss': 16.868711696139403}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 21.230815887451172, 'avg_loss': 17.413974720053375}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 23.589385986328125, 'avg_loss': 

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_e0b1c35e_8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-11-08/checkpoint_000002)... 

Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.2713303565979004
  _timestamp: 1660925528
  _training_iteration: 9
  avg_loss: 18.100131527417236
  date: 2022-08-19_16-12-09
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 9
  loss: 23.589385986328125
  node_ip: 10.150.0.3
  pid: 23882
  should_checkpoint: true
  time_since_restore: 54.61567783355713
  time_this_iter_s: 3.655038356781006
  time_total_s: 54.61567783355713
  timestamp: 1660925529
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 317.7791748046875, 'avg_loss': 48.06803585514426}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 70.00452423095703, 'avg_loss': 50.06226207112724}
Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 2.9186415672302246
  _timestamp: 1660925535
  _training_iteration: 11
  avg_loss: 50.06226207112724
  date: 2022-08-19_16-12-15
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 11
  loss: 70.00452423095703
  node_ip: 10.150.0.3
  pid: 23882
  time_since_restore: 61.050337076187134
  time_this_iter_s: 3.003775119781494
  time_total_s: 61.050337076187134
  timestamp: 1660925535
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 38.47185516357422, 'avg_loss': 49.09639482883116}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 63.310630798339844, 'avg_loss': 50.189

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_e0b1c35e_8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-11-08/checkpoint_000003)... 

Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.2373406887054443
  _timestamp: 1660925541
  _training_iteration: 13
  avg_loss: 50.18979759571644
  date: 2022-08-19_16-12-22
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 13
  loss: 63.310630798339844
  node_ip: 10.150.0.3
  pid: 23882
  should_checkpoint: true
  time_since_restore: 67.6645040512085
  time_this_iter_s: 3.5167431831359863
  time_total_s: 67.6645040512085
  timestamp: 1660925542
  timesteps_since_restore: 0
  training_iteration: 13
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 44.6736946105957, 'avg_loss': 49.79579023963639}
Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.0091326236724854
  _timestamp: 1660925548
  _training_iteration: 15
  avg_loss: 46.801151539385316
  date: 2022-08-19_16-12-28
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 15
  loss: 4.876209735870361
  node_ip: 10.150.0.3
  pid: 23882
  time_since_restore: 73.9322338104248
  time_this_iter_s: 3.017648935317993
  time_total_s: 73.9322338104248
  timestamp: 1660925548
  timesteps_since_restore: 0
  training_iteration: 15
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 4.876209735870361, 'avg_loss': 46.801151539385316}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 217.91543579101562, 'avg_loss': 57.49579430511221}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 180.1636199951172, 'avg_loss': 64.711548

[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_e0b1c35e_8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-11-08/checkpoint_000004)... 

Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.193298816680908
  _timestamp: 1660925554
  _training_iteration: 17
  avg_loss: 64.71154875746545
  date: 2022-08-19_16-12-35
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 17
  loss: 180.1636199951172
  node_ip: 10.150.0.3
  pid: 23882
  should_checkpoint: true
  time_since_restore: 80.67390418052673
  time_this_iter_s: 3.5719199180603027
  time_total_s: 80.67390418052673
  timestamp: 1660925555
  timesteps_since_restore: 0
  training_iteration: 17
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 45.01848602294922, 'avg_loss': 63.61748971665899}
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 177.9381866455078, 'avg_loss': 69.63436850238787}
Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.0261824131011963
  _timestamp: 1660925561
  _training_iteration: 19
  avg_loss: 69.63436850238787
  date: 2022-08-19_16-12-41
  done: false
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  hostname: anish-l5-kit
  iterations_since_restore: 19
  loss: 177.9381866455078
  node_ip: 10.150.0.3
  pid: 23882
  time_since_restore: 87.13663482666016
  time_this_iter_s: 3.1093850135803223
  time_total_s: 87.13663482666016
  timestamp: 1660925561
  timesteps_since_restore: 0
  training_iteration: 19
  trial_id: e0b1c35e
  warmup_time: 0.006433725357055664
  
[2m[36m(RayTrainWorker pid=23952)[0m {'loss': 75.25300598144531, 'avg_loss': 69.91530037634075}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_e0b1c35e_8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-11-08/checkpoint_000005)... Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_e0b1c35e_8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,r_2022-08-19_16-11-08/checkpoint_000005)... Done. 0.2s


VBox(children=(Label(value='545.043 MB of 545.043 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

0,1
avg_loss,▁▄▃▃▂▂▃▃▃▆▆▆▆▆▆▇▇▇██
iterations_since_restore,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,▁▂▁▁▁▁▂▁▂█▃▂▂▂▁▆▅▂▅▃
time_since_restore,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇██
time_this_iter_s,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
time_total_s,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇██
timestamp,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇██
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
warmup_time,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,69.9153
iterations_since_restore,20.0
loss,75.25301
time_since_restore,90.84461
time_this_iter_s,3.70798
time_total_s,90.84461
timestamp,1660925565.0
timesteps_since_restore,0.0
training_iteration,20.0
warmup_time,0.00643


Result for TorchTrainer_e0b1c35e:
  _time_this_iter_s: 3.3496782779693604
  _timestamp: 1660925564
  _training_iteration: 20
  avg_loss: 69.91530037634075
  date: 2022-08-19_16-12-45
  done: true
  experiment_id: f55bc8a2bdde4e1f99c679644a19b189
  experiment_tag: 8_batch_size=9.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0014,max_num_steps=20.0000,num_workers=16,shuffle=True
  host

[2m[36m(RayTrainWorker pid=24884)[0m 2022-08-19 16:13:04,079	INFO config.py:72 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=24884)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=24884)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=24884)[0m 2022-08-19 16:13:11,893	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=24884)[0m 2022-08-19 16:13:11,895	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=24885)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=24885)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=24886)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=24886)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "


[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 0.15588076412677765, 'avg_loss': 0.15588076412677765}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_892cedc4_9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-12-56/checkpoint_000000)... 

Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 27.17526388168335
  _timestamp: 1660925612
  _training_iteration: 1
  avg_loss: 0.15588076412677765
  date: 2022-08-19_16-13-33
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 1
  loss: 0.15588076412677765
  node_ip: 10.150.0.3
  pid: 24811
  should_checkpoint: true
  time_since_restore: 33.90446496009827
  time_this_iter_s: 33.90446496009827
  time_total_s: 33.90446496009827
  timestamp: 1660925613
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  


Done. 1.1s


Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.783123731613159
  _timestamp: 1660925618
  _training_iteration: 2
  avg_loss: 17.428503431379795
  date: 2022-08-19_16-13-38
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 2
  loss: 34.70112609863281
  node_ip: 10.150.0.3
  pid: 24811
  time_since_restore: 39.20008397102356
  time_this_iter_s: 5.295619010925293
  time_total_s: 39.20008397102356
  timestamp: 1660925618
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 34.70112609863281, 'avg_loss': 17.428503431379795}
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 5.982594013214111, 'avg_loss': 13.613200291991234}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_892cedc4_9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-12-56/checkpoint_000001)... 

Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.180402994155884
  _timestamp: 1660925623
  _training_iteration: 3
  avg_loss: 13.613200291991234
  date: 2022-08-19_16-13-44
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 3
  loss: 5.982594013214111
  node_ip: 10.150.0.3
  pid: 24811
  should_checkpoint: true
  time_since_restore: 44.92548131942749
  time_this_iter_s: 5.725397348403931
  time_total_s: 44.92548131942749
  timestamp: 1660925624
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  


Done. 1.0s


Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.502869129180908
  _timestamp: 1660925629
  _training_iteration: 4
  avg_loss: 16.61726124957204
  date: 2022-08-19_16-13-49
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 4
  loss: 25.629444122314453
  node_ip: 10.150.0.3
  pid: 24811
  time_since_restore: 49.933568477630615
  time_this_iter_s: 5.008087158203125
  time_total_s: 49.933568477630615
  timestamp: 1660925629
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 25.629444122314453, 'avg_loss': 16.61726124957204}
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 31.665180206298828, 'avg_loss': 19.626845040917395}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_892cedc4_9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-12-56/checkpoint_000002)... 

Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.104624271392822
  _timestamp: 1660925634
  _training_iteration: 5
  avg_loss: 19.626845040917395
  date: 2022-08-19_16-13-54
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 5
  loss: 31.665180206298828
  node_ip: 10.150.0.3
  pid: 24811
  should_checkpoint: true
  time_since_restore: 55.44525647163391
  time_this_iter_s: 5.511687994003296
  time_total_s: 55.44525647163391
  timestamp: 1660925634
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  


Done. 1.0s


Result for TorchTrainer_892cedc4:
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 247.82144165039062, 'avg_loss': 57.65927780916294}
  _time_this_iter_s: 5.481173992156982
  _timestamp: 1660925640
  _training_iteration: 6
  avg_loss: 57.65927780916294
  date: 2022-08-19_16-14-00
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 6
  loss: 247.82144165039062
  node_ip: 10.150.0.3
  pid: 24811
  time_since_restore: 60.47847580909729
  time_this_iter_s: 5.033219337463379
  time_total_s: 60.47847580909729
  timestamp: 1660925640
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 55.60124588012695, 'avg_loss': 57.36527324787208}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_892cedc4_9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-12-56/checkpoint_000003)... 

Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.445033311843872
  _timestamp: 1660925645
  _training_iteration: 7
  avg_loss: 57.36527324787208
  date: 2022-08-19_16-14-05
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 7
  loss: 55.60124588012695
  node_ip: 10.150.0.3
  pid: 24811
  should_checkpoint: true
  time_since_restore: 66.37118744850159
  time_this_iter_s: 5.892711639404297
  time_total_s: 66.37118744850159
  timestamp: 1660925645
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  


Done. 1.1s


Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.526931047439575
  _timestamp: 1660925650
  _training_iteration: 8
  avg_loss: 54.62261692620814
  date: 2022-08-19_16-14-10
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 8
  loss: 35.42402267456055
  node_ip: 10.150.0.3
  pid: 24811
  time_since_restore: 71.4339644908905
  time_this_iter_s: 5.062777042388916
  time_total_s: 71.4339644908905
  timestamp: 1660925650
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 35.42402267456055, 'avg_loss': 54.62261692620814}
[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 23.427366256713867, 'avg_loss': 51.156477962931}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_892cedc4_9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-12-56/checkpoint_000004)... 

Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 4.954676389694214
  _timestamp: 1660925655
  _training_iteration: 9
  avg_loss: 51.156477962931
  date: 2022-08-19_16-14-16
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 9
  loss: 23.427366256713867
  node_ip: 10.150.0.3
  pid: 24811
  should_checkpoint: true
  time_since_restore: 76.92026042938232
  time_this_iter_s: 5.486295938491821
  time_total_s: 76.92026042938232
  timestamp: 1660925656
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=24884)[0m {'loss': 240.05776977539062, 'avg_loss': 70.04660714417696}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_892cedc4_9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-12-56/checkpoint_000005)... 

Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.844266176223755
  _timestamp: 1660925661
  _training_iteration: 10
  avg_loss: 70.04660714417696
  date: 2022-08-19_16-14-22
  done: false
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  hostname: anish-l5-kit
  iterations_since_restore: 10
  loss: 240.05776977539062
  node_ip: 10.150.0.3
  pid: 24811
  should_checkpoint: true
  time_since_restore: 82.71842885017395
  time_this_iter_s: 5.798168420791626
  time_total_s: 82.71842885017395
  timestamp: 1660925662
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: 892cedc4
  warmup_time: 0.004983186721801758
  


Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_892cedc4_9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,_2022-08-19_16-12-56/checkpoint_000005)... Done. 0.3s


VBox(children=(Label(value='545.043 MB of 545.043 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

0,1
avg_loss,▁▃▂▃▃▇▇▆▆█
iterations_since_restore,▁▂▃▃▄▅▆▆▇█
loss,▁▂▁▂▂█▃▂▂█
time_since_restore,▁▂▃▃▄▅▆▆▇█
time_this_iter_s,█▁▁▁▁▁▁▁▁▁
time_total_s,▁▂▃▃▄▅▆▆▇█
timestamp,▁▂▃▃▄▅▆▆▇█
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▂▃▃▄▅▆▆▇█
warmup_time,▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,70.04661
iterations_since_restore,10.0
loss,240.05777
time_since_restore,82.71843
time_this_iter_s,5.79817
time_total_s,82.71843
timestamp,1660925662.0
timesteps_since_restore,0.0
training_iteration,10.0
warmup_time,0.00498


Result for TorchTrainer_892cedc4:
  _time_this_iter_s: 5.844266176223755
  _timestamp: 1660925661
  _training_iteration: 10
  avg_loss: 70.04660714417696
  date: 2022-08-19_16-14-22
  done: true
  experiment_id: d0d0d560b8014ef3bacf51cf0b6bcd05
  experiment_tag: 9_batch_size=15.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0020,max_num_steps=10.0000,num_workers=16,shuffle=True
  host

[2m[36m(RayTrainWorker pid=25956)[0m 2022-08-19 16:14:38,489	INFO config.py:72 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=25957)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=25957)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=25958)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=25958)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=25956)[0m   cpuset_checked))
[2m[36m(RayTrainWorker pid=25956)[0m   f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
[2m[36m(RayTrainWorker pid=25956)[0m 2022-08-19 16:14:47,898	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=25956)[0m 2022-08-19 16:14:47,900	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 23.291645050048828, 'avg_loss': 23.291645050048828}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_c9513f7c_10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50_2022-08-19_16-14-31/checkpoint_000000)... 

Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 36.42748141288757
  _timestamp: 1660925716
  _training_iteration: 1
  avg_loss: 23.291645050048828
  date: 2022-08-19_16-15-17
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 1
  loss: 23.291645050048828
  node_ip: 10.150.0.3
  pid: 25888
  should_checkpoint: true
  time_since_restore: 42.61508226394653
  time_this_iter_s: 42.61508226394653
  time_total_s: 42.61508226394653
  timestamp: 1660925717
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  


Done. 1.1s


[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 8.079105377197266, 'avg_loss': 15.685375213623047}
Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 8.261768817901611
  _timestamp: 1660925724
  _training_iteration: 2
  avg_loss: 15.685375213623047
  date: 2022-08-19_16-15-24
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 2
  loss: 8.079105377197266
  node_ip: 10.150.0.3
  pid: 25888
  time_since_restore: 50.374101638793945
  time_this_iter_s: 7.759019374847412
  time_total_s: 50.374101638793945
  timestamp: 1660925724
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  
[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 11.271661758422852, 'avg_loss': 14.214137395222982}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_c9513f7c_10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50_2022-08-19_16-14-31/checkpoint_000001)... 

Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 7.724950551986694
  _timestamp: 1660925732
  _training_iteration: 3
  avg_loss: 14.214137395222982
  date: 2022-08-19_16-15-33
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 3
  loss: 11.271661758422852
  node_ip: 10.150.0.3
  pid: 25888
  should_checkpoint: true
  time_since_restore: 58.56332540512085
  time_this_iter_s: 8.189223766326904
  time_total_s: 58.56332540512085
  timestamp: 1660925733
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  


Done. 1.1s


Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 8.115289449691772
  _timestamp: 1660925740
  _training_iteration: 4
  avg_loss: 53.23197603225708
  date: 2022-08-19_16-15-40
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 4
  loss: 170.28549194335938
  node_ip: 10.150.0.3
  pid: 25888
  time_since_restore: 66.11527276039124
  time_this_iter_s: 7.551947355270386
  time_total_s: 66.11527276039124
  timestamp: 1660925740
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  
[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 170.28549194335938, 'avg_loss': 53.23197603225708}


[2m[36m(RayTrainWorker pid=25957)[0m E0819 16:15:47.340480127   26018 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"
[2m[36m(RayTrainWorker pid=25958)[0m E0819 16:15:47.472209301   26043 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"
[2m[36m(RayTrainWorker pid=25956)[0m E0819 16:15:48.006606789   26016 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 61.7700080871582, 'avg_loss': 54.939582443237306}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_c9513f7c_10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50_2022-08-19_16-14-31/checkpoint_000002)... 

Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 7.705145359039307
  _timestamp: 1660925748
  _training_iteration: 5
  avg_loss: 54.939582443237306
  date: 2022-08-19_16-15-48
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 5
  loss: 61.7700080871582
  node_ip: 10.150.0.3
  pid: 25888
  should_checkpoint: true
  time_since_restore: 74.234046459198
  time_this_iter_s: 8.118773698806763
  time_total_s: 74.234046459198
  timestamp: 1660925748
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  


Done. 1.1s


Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 8.309581995010376
  _timestamp: 1660925756
  _training_iteration: 6
  avg_loss: 63.538872718811035
  date: 2022-08-19_16-15-56
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 6
  loss: 106.53532409667969
  node_ip: 10.150.0.3
  pid: 25888
  time_since_restore: 82.13007926940918
  time_this_iter_s: 7.896032810211182
  time_total_s: 82.13007926940918
  timestamp: 1660925756
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  
[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 106.53532409667969, 'avg_loss': 63.538872718811035}
[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 162.86679077148438, 'avg_loss': 77.72857529776437}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_c9513f7c_10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50_2022-08-19_16-14-31/checkpoint_000003)... 

Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 8.185299634933472
  _timestamp: 1660925764
  _training_iteration: 7
  avg_loss: 77.72857529776437
  date: 2022-08-19_16-16-05
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 7
  loss: 162.86679077148438
  node_ip: 10.150.0.3
  pid: 25888
  should_checkpoint: true
  time_since_restore: 90.77018618583679
  time_this_iter_s: 8.640106916427612
  time_total_s: 90.77018618583679
  timestamp: 1660925765
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 232.46937561035156, 'avg_loss': 97.07117533683777}
Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 7.895900011062622
  _timestamp: 1660925772
  _training_iteration: 8
  avg_loss: 97.07117533683777
  date: 2022-08-19_16-16-12
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 8
  loss: 232.46937561035156
  node_ip: 10.150.0.3
  pid: 25888
  time_since_restore: 98.2962417602539
  time_this_iter_s: 7.526055574417114
  time_total_s: 98.2962417602539
  timestamp: 1660925772
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  
[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 21.60836410522461, 'avg_loss': 88.6864185333252}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_c9513f7c_10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50_2022-08-19_16-14-31/checkpoint_000004)... 

Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 7.551017999649048
  _timestamp: 1660925780
  _training_iteration: 9
  avg_loss: 88.6864185333252
  date: 2022-08-19_16-16-20
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 9
  loss: 21.60836410522461
  node_ip: 10.150.0.3
  pid: 25888
  should_checkpoint: true
  time_since_restore: 106.22802925109863
  time_this_iter_s: 7.931787490844727
  time_total_s: 106.22802925109863
  timestamp: 1660925780
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  


Done. 1.0s


[2m[36m(RayTrainWorker pid=25956)[0m {'loss': 36.51075744628906, 'avg_loss': 83.46885242462159}


[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_c9513f7c_10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50_2022-08-19_16-14-31/checkpoint_000005)... 

Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 8.360788345336914
  _timestamp: 1660925788
  _training_iteration: 10
  avg_loss: 83.46885242462159
  date: 2022-08-19_16-16-29
  done: false
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  hostname: anish-l5-kit
  iterations_since_restore: 10
  loss: 36.51075744628906
  node_ip: 10.150.0.3
  pid: 25888
  should_checkpoint: true
  time_since_restore: 114.68820333480835
  time_this_iter_s: 8.460174083709717
  time_total_s: 114.68820333480835
  timestamp: 1660925789
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: c9513f7c
  warmup_time: 0.005364418029785156
  


Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (/home/jupyter/ray_results/TorchTrainer_2022-08-19_15-45-58/TorchTrainer_c9513f7c_10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50_2022-08-19_16-14-31/checkpoint_000005)... Done. 0.3s


Result for TorchTrainer_c9513f7c:
  _time_this_iter_s: 8.360788345336914
  _timestamp: 1660925788
  _training_iteration: 10
  avg_loss: 83.46885242462159
  date: 2022-08-19_16-16-29
  done: true
  experiment_id: 33253c1dec9943b3abbfa53eda24348c
  experiment_tag: 10_batch_size=24.0000,format_version=4,future_num_frames=50,history_num_frames=0,model_architecture=resnet50,render_ego_history=True,step_time=0.1000,dataset_meta_key=meta_json,disable_traffic_light_faces=False,ego_center=0_25_0_5,filter_agents_threshold=0.5000,map_type=py_semantic,pixel_size=0_5_0_5,raster_size=224_224,satellite_map_key=aerial_map_aerial_map_png,semantic_map_key=semantic_map_semantic_map_pb,set_origin_to_bottom=True,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=True,checkpoint_every_n_steps=10000,eval_every_n_steps=10000,max_num_steps=5,batch_size=12,key=scenes_sample_zarr,num_workers=16,shuffle=False,dataset_key=scenes_sample_zarr,lr=0.0017,max_num_steps=10.0000,num_workers=16,shuffle=True
  hos

2022-08-19 16:16:39,717	INFO tune.py:759 -- Total run time: 1841.64 seconds (1841.45 seconds for the tuning loop).


VBox(children=(Label(value='545.068 MB of 545.068 MB uploaded (90.796 MB deduped)\r'), FloatProgress(value=1.0…

0,1
avg_loss,▂▁▁▄▄▅▆█▇▇
iterations_since_restore,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▆▃▄▆█▁▂
time_since_restore,▁▂▃▃▄▅▆▆▇█
time_this_iter_s,█▁▁▁▁▁▁▁▁▁
time_total_s,▁▂▃▃▄▅▆▆▇█
timestamp,▁▂▃▃▄▅▆▆▇█
timesteps_since_restore,▁▁▁▁▁▁▁▁▁▁
training_iteration,▁▂▃▃▄▅▆▆▇█
warmup_time,▁▁▁▁▁▁▁▁▁▁

0,1
avg_loss,83.46885
iterations_since_restore,10.0
loss,36.51076
time_since_restore,114.6882
time_this_iter_s,8.46017
time_total_s,114.6882
timestamp,1660925789.0
timesteps_since_restore,0.0
training_iteration,10.0
warmup_time,0.00536


In [34]:
import time

In [35]:
# Block the notebook for 30 seconds.
# NOTE(review): the original author was unsure whether this wait helps at all;
# its purpose is not evident from this cell -- confirm it is needed or remove it.
time.sleep(30)

In [32]:
# Collect the tuning results into a DataFrame: per the rendered output, one row
# per trial holding its reported metrics plus the flattened config/... columns.
# NOTE(review): `analysis` is presumably the result object of the Ray Tune run
# executed earlier in the notebook -- confirm.
analysis_df = analysis.get_dataframe()

In [33]:
# Bare expression so the notebook renders the DataFrame as this cell's output.
analysis_df

Unnamed: 0,loss,avg_loss,_timestamp,_time_this_iter_s,_training_iteration,time_this_iter_s,should_checkpoint,done,timesteps_total,episodes_total,...,config/train_loop_config/cfg/train_data_loader/num_workers,config/train_loop_config/cfg/train_data_loader/shuffle,config/train_loop_config/cfg/train_params/checkpoint_every_n_steps,config/train_loop_config/cfg/train_params/eval_every_n_steps,config/train_loop_config/cfg/train_params/max_num_steps,config/train_loop_config/cfg/val_data_loader/batch_size,config/train_loop_config/cfg/val_data_loader/key,config/train_loop_config/cfg/val_data_loader/num_workers,config/train_loop_config/cfg/val_data_loader/shuffle,logdir
0,41.008888,94.537867,1660924095,3.218059,35,3.654188,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
1,109.761116,67.864799,1660924236,4.230319,25,4.711835,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
2,41.149384,70.176504,1660924618,7.601075,45,8.216519,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
3,94.941132,92.686515,1660924787,3.154223,40,3.595659,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
4,106.942398,82.296616,1660924977,5.03687,30,5.481915,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
5,24.701918,90.69437,1660925174,3.202257,50,3.554569,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
6,40.301899,70.757032,1660925455,7.444295,30,7.902245,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
7,75.253006,69.9153,1660925564,3.349678,20,3.707979,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
8,240.05777,70.046607,1660925661,5.844266,10,5.798168,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...
9,36.510757,83.468852,1660925788,8.360788,10,8.460174,True,False,,,...,16,True,10000,10000,5,12,scenes/sample.zarr,16,False,/home/jupyter/ray_results/TorchTrainer_2022-08...


In [36]:
# analysis.get_best_result().metrics
# best_result = 

In [37]:
# Weights & Biases run metadata: the entity/project the run is filed under,
# plus its display name, job type, free-text notes and tags.
tags = ["train", "prediction"]
run_type = "train"
run_name = "train-prediction-model"
run_description = "\nTrain prediction model\n"
project_name = "l5-prediction"
wandb_entity = "l5-demo"

In [38]:
# Start the W&B run using the run-information variables defined above and
# attach the experiment configuration `cfg` to it.
run = wandb.init(
    config=cfg,
    tags=tags,
    notes=run_description,
    name=run_name,
    job_type=run_type,
    project=project_name,
    entity=wandb_entity,
)

In [None]:
# BUG workaround: reference the artifact from this run explicitly so that the
# run is connected to it on the lineage graph.
artifact_ref = f"{artifact_entity}/{artifact_project}/{artifact_name}:{artifact_alias}"
artifact = run.use_artifact(artifact_ref, type=artifact_type)

In [39]:
# Wrap the results DataFrame in a wandb.Table so it can be logged to the run.
analysis_table = wandb.Table(dataframe=analysis_df)

In [40]:
# BUG: the run gets lost after the tune job because the working directory
# changes, so the results are logged from a second, separate run.
# Guard: refuse to log a table that contains no rows.
if len(analysis_table.data) == 0:
    raise ValueError("bad table for some reason")

# Upload the table under the "analysis_table" key, then close the run.
run.log({"analysis_table": analysis_table})
run.finish()

VBox(children=(Label(value='0.018 MB of 0.028 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.636284…