In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
model_name = "EleutherAI/gpt-j-6B"
use_gpu = True
num_workers = 2
cpus_per_worker = 4

In [3]:
import ray

ray.init(
    runtime_env={
        "pip": [
            "datasets",
            "evaluate",
            # Latest combination of accelerate==0.19.0 and transformers==4.29.0
            # seems to have issues with DeepSpeed process group initialization,
            # and will result in a batch_size validation problem.
            # TODO(jungong) : get rid of the pins once the issue is fixed.
            "accelerate==0.16.0",
            "transformers==4.26.0",
            "torch>=1.12.0",
            "deepspeed==0.12.3",
        ],
    },
)

2024-01-24 09:33:48,733	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.9.1


In [4]:
from datasets import load_dataset

print("Loading tiny_shakespeare dataset")
current_dataset = load_dataset("tiny_shakespeare")
current_dataset

Loading tiny_shakespeare dataset


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [5]:
import ray.data

ray_datasets = {
    "train": ray.data.from_huggingface(current_dataset["train"]),
    "validation": ray.data.from_huggingface(current_dataset["validation"])
}

ray_datasets

{'train': MaterializedDataset(num_blocks=1, num_rows=1, schema={text: string}),
 'validation': MaterializedDataset(num_blocks=1, num_rows=1, schema={text: string})}

In [6]:
block_size = 512

In [7]:
from transformers import AutoTokenizer

def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    text = list(batch["text"])
    flat_text = "".join(text)
    split_text = [
        x.strip()
        for x in flat_text.split("\n")
        if x.strip() and not x.strip()[-1] == ":"
    ]
    return pd.DataFrame(split_text, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["text"]),
        truncation=True,
        max_length=block_size,
        padding="max_length",
        return_tensors="np",
    )
    ret["labels"] = ret["input_ids"].copy()
    return dict(ret)

processed_datasets = {
    key: ds.map_batches(split_text, batch_format="pandas").map_batches(tokenize, batch_format="pandas").random_shuffle(seed=42)
    for key, ds in ray_datasets.items()
}
processed_datasets

{'train': RandomShuffle
 +- MapBatches(tokenize)
    +- MapBatches(split_text)
       +- Dataset(num_blocks=1, num_rows=1, schema={text: string}),
 'validation': RandomShuffle
 +- MapBatches(tokenize)
    +- MapBatches(split_text)
       +- Dataset(num_blocks=1, num_rows=1, schema={text: string})}

In [8]:
import evaluate
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    GPTJForCausalLM,
    AutoTokenizer,
    default_data_collator,
)
from transformers.utils.logging import disable_progress_bar, enable_progress_bar

from ray import train
from ray.train.huggingface.transformers import (
    prepare_trainer,
    RayTrainReportCallback
)


def train_func(config):
    # Use the actual number of CPUs assigned by Ray
    os.environ["OMP_NUM_THREADS"] = str(
        train.get_context().get_trial_resources().bundles[-1].get("CPU", 1)
    )
    # Enable tf32 for better performance
    torch.backends.cuda.matmul.allow_tf32 = True

    batch_size = config.get("batch_size", 4)
    epochs = config.get("epochs", 2)
    warmup_steps = config.get("warmup_steps", 0)
    learning_rate = config.get("learning_rate", 0.00002)
    weight_decay = config.get("weight_decay", 0.01)
    steps_per_epoch = config.get("steps_per_epoch")

    deepspeed = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 8,
        },
        "bf16": {"enabled": "auto"},
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
        },
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "offload_param": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    print("Preparing training arguments")
    training_args = TrainingArguments(
        "output",
        logging_steps=1,
        save_strategy="steps",
        save_steps=steps_per_epoch,
        max_steps=steps_per_epoch * epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        label_names=["input_ids", "attention_mask"],
        push_to_hub=False,
        report_to="none",
        disable_tqdm=True,  # declutter the output a little
        fp16=True,
        gradient_checkpointing=True,
        deepspeed=deepspeed,
    )
    disable_progress_bar()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading model")

    model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)
    model.resize_token_embeddings(len(tokenizer))

    print("Model loaded")

    enable_progress_bar()

    metric = evaluate.load("accuracy")

    train_ds = train.get_dataset_shard("train")
    eval_ds = train.get_dataset_shard("validation")

    train_ds_iterable = train_ds.iter_torch_batches(batch_size=batch_size)
    eval_ds_iterable = eval_ds.iter_torch_batches(batch_size=batch_size)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    # Add callback to report checkpoints to Ray Train
    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

In [9]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig

batch_size = 16
train_ds_size = processed_datasets["train"].count()
steps_per_epoch = train_ds_size // (batch_size * num_workers)

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={
        "epochs": 1,
        "batch_size": batch_size,  # per device
        "steps_per_epoch": steps_per_epoch
    },
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker={"GPU": 1, "CPU": cpus_per_worker},
    ),
    datasets=processed_datasets,
    run_config=RunConfig(storage_path="/models"),
)

2024-01-24 09:34:11,896	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle]
2024-01-24 09:34:11,898	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2024-01-24 09:34:11,901	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle 1:   0%|          | 0/1 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/1 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/1 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
results = trainer.fit()

0,1
Current time:,2024-01-24 09:42:44
Running for:,00:08:19.23
Memory:,185.0/2015.5 GiB

Trial name,status,loc
TorchTrainer_c3062_00000,RUNNING,192.168.64.2:266117


[36m(TrainTrainable pid=266117)[0m [2024-01-24 09:34:31,353] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[36m(RayTrainWorker pid=266237)[0m Setting up process group for: env:// [rank=0, world_size=2]
[36m(TorchTrainer pid=266117)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=266117)[0m - (ip=192.168.64.2, pid=266237) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=266117)[0m - (ip=192.168.64.2, pid=266238) world_rank=1, local_rank=1, node_rank=0


[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:34:40,446] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[36m(RayTrainWorker pid=266238)[0m [2024-01-24 09:34:40,440] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[36m(SplitCoordinator pid=266453)[0m Auto configuring locality_with_output=['f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb', 'f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb']


[36m(RayTrainWorker pid=266237)[0m Preparing training arguments
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:34:41,633] [INFO] [comm.py:637:init_distributed] cdb=None
[36m(RayTrainWorker pid=266237)[0m Loading model
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:04,380] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 285, num_elems = 6.05B
[36m(RayTrainWorker pid=266238)[0m Preparing training arguments
[36m(RayTrainWorker pid=266238)[0m [2024-01-24 09:34:41,632] [INFO] [comm.py:637:init_distributed] cdb=None
[36m(RayTrainWorker pid=266238)[0m Loading model
[36m(RayTrainWorker pid=266237)[0m Model loaded
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:27,260] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.3, git-hash=unknown, git-branch=unknown
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:27,280] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False


[36m(RayTrainWorker pid=266237)[0m max_steps is given, it will override any value given in num_train_epochs
[36m(RayTrainWorker pid=266237)[0m Using cuda_amp half precision backend
[36m(SplitCoordinator pid=266454)[0m Auto configuring locality_with_output=['f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb', 'f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb']
[36m(RayTrainWorker pid=266237)[0m Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
[36m(RayTrainWorker pid=266237)[0m Creating extension directory /root/.cache/torch_extensions/py310_cu118/cpu_adam...
[36m(RayTrainWorker pid=266237)[0m Detected CUDA files, patching ldflags
[36m(RayTrainWorker pid=266237)[0m Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
[36m(RayTrainWorker pid=266237)[0m Building extension module cpu_adam...
[36m(RayTrainWorker pid=266237)[0m Allowing ninja to set a default number of workers... (overridable 

[36m(RayTrainWorker pid=266237)[0m [1/4] /usr/local/cuda/bin/nvcc  -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/lib/python3.10/dist-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++17 -U__CUD

[36m(RayTrainWorker pid=266237)[0m Loading extension module cpu_adam...
[36m(RayTrainWorker pid=266238)[0m Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...


[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,356] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,356] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [logging.py:96:log_dist

[36m(RayTrainWorker pid=266237)[0m ***** Running training *****
[36m(RayTrainWorker pid=266237)[0m   Num examples = 10784
[36m(RayTrainWorker pid=266237)[0m   Num Epochs = 9223372036854775807
[36m(RayTrainWorker pid=266237)[0m   Instantaneous batch size per device = 8
[36m(RayTrainWorker pid=266237)[0m   Total train batch size (w. parallel, distributed & accumulation) = 16
[36m(RayTrainWorker pid=266237)[0m   Gradient Accumulation steps = 1
[36m(RayTrainWorker pid=266237)[0m   Total optimization steps = 674
[36m(RayTrainWorker pid=266237)[0m   Number of trainable parameters = 0
[36m(RayTrainWorker pid=266238)[0m Loading extension module cpu_adam...


(pid=266453) - MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle 1:   0%|          | 0/1 [00:00<?, ?…

(pid=266453) Shuffle Map 2:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=266453) Shuffle Reduce 3:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=266453) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=266453)[0m Executing DAG InputDataBuffer[Input] -> AllToAllOperator[MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle] -> OutputSplitter[split(2, equal=True)]
[36m(SplitCoordinator pid=266453)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=9.0, gpu=2.0, object_store_memory=0.0), locality_with_output=['f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb', 'f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=266453)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[36m(RayTrainWorker pid=266237)[0m   total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])


[36m(RayTrainWorker pid=266237)[0m {'loss': 12.2031, 'learning_rate': 1.997032640949555e-05, 'epoch': 0.0}




[36m(RayTrainWorker pid=266237)[0m {'loss': 6.9883, 'learning_rate': 1.99406528189911e-05, 'epoch': 0.0}


[36m(RayTrainWorker pid=266238)[0m   total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])


[36m(RayTrainWorker pid=266237)[0m {'loss': 2.709, 'learning_rate': 1.991097922848665e-05, 'epoch': 0.0}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1581, 'learning_rate': 1.9881305637982196e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1187, 'learning_rate': 1.9851632047477747e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1158, 'learning_rate': 1.9821958456973295e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1129, 'learning_rate': 1.9792284866468846e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1031, 'learning_rate': 1.9762611275964394e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1045, 'learning_rate': 1.9732937685459942e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:38:32,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[1.9703264094955493e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:38:32,910] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=1.4892674986990257, CurrSamplesPerSec=1.4950116646745464, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0934, 'learning_rate': 1.9703264094955493e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0926, 'learning_rate': 1.967359050445104e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.091, 'learning_rate': 1.964391691394659e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0989, 'learning_rate': 1.9614243323442137e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0923, 'learning_rate': 1.9584569732937684e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0924, 'learning_rate': 1.9554896142433236e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0875, 'learning_rate': 1.9525222551928784e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0832, 'learning_rate': 1.9495548961424335e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0792, 'learning_rate': 1.9465875370919883e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0904, 'learning_rate': 1.943620178041543e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:40:24,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[1.940652818991098e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:40:24,201] [INFO] [timer.py:260:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=1.4606751628215193, CurrSamplesPerSec=1.4091414575825598, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0873, 'learning_rate': 1.940652818991098e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0904, 'learning_rate': 1.937685459940653e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0865, 'learning_rate': 1.9347181008902077e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.086, 'learning_rate': 1.931750741839763e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0857, 'learning_rate': 1.9287833827893176e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0874, 'learning_rate': 1.9258160237388724e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0836, 'learning_rate': 1.9228486646884275e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0862, 'learning_rate': 1.9198813056379823e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0844, 'learning_rate': 1.9169139465875374e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0846, 'learning_rate': 1.9139465875370922e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:42:20,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[1.910979228486647e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:42:20,855] [INFO] [timer.py:260:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=1.42790700193431, CurrSamplesPerSec=1.41897354735648, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0875, 'learning_rate': 1.910979228486647e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0824, 'learning_rate': 1.908011869436202e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0842, 'learning_rate': 1.905044510385757e-05, 'epoch': 0.05}


