In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
model_name = "EleutherAI/gpt-j-6B"
use_gpu = True
num_workers = 2
cpus_per_worker = 4

In [3]:
import ray

ray.init(
    runtime_env={
        "pip": [
            "datasets",
            "evaluate",
            # Latest combination of accelerate==0.19.0 and transformers==4.29.0
            # seems to have issues with DeepSpeed process group initialization,
            # and will result in a batch_size validation problem.
            # TODO(jungong) : get rid of the pins once the issue is fixed.
            "accelerate==0.16.0",
            "transformers==4.26.0",
            "torch>=1.12.0",
            "deepspeed==0.12.3",
        ],
    },
)

2024-01-24 09:33:48,733	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.9.1


In [4]:
from datasets import load_dataset

print("Loading tiny_shakespeare dataset")
current_dataset = load_dataset("tiny_shakespeare")
current_dataset

Loading tiny_shakespeare dataset


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [5]:
import ray.data

ray_datasets = {
    "train": ray.data.from_huggingface(current_dataset["train"]),
    "validation": ray.data.from_huggingface(current_dataset["validation"])
}

ray_datasets

{'train': MaterializedDataset(num_blocks=1, num_rows=1, schema={text: string}),
 'validation': MaterializedDataset(num_blocks=1, num_rows=1, schema={text: string})}

In [6]:
block_size = 512

In [7]:
from transformers import AutoTokenizer

def split_text(batch: pd.DataFrame) -> pd.DataFrame:
    text = list(batch["text"])
    flat_text = "".join(text)
    split_text = [
        x.strip()
        for x in flat_text.split("\n")
        if x.strip() and not x.strip()[-1] == ":"
    ]
    return pd.DataFrame(split_text, columns=["text"])


def tokenize(batch: pd.DataFrame) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    ret = tokenizer(
        list(batch["text"]),
        truncation=True,
        max_length=block_size,
        padding="max_length",
        return_tensors="np",
    )
    ret["labels"] = ret["input_ids"].copy()
    return dict(ret)

processed_datasets = {
    key: ds.map_batches(split_text, batch_format="pandas").map_batches(tokenize, batch_format="pandas").random_shuffle(seed=42)
    for key, ds in ray_datasets.items()
}
processed_datasets

{'train': RandomShuffle
 +- MapBatches(tokenize)
    +- MapBatches(split_text)
       +- Dataset(num_blocks=1, num_rows=1, schema={text: string}),
 'validation': RandomShuffle
 +- MapBatches(tokenize)
    +- MapBatches(split_text)
       +- Dataset(num_blocks=1, num_rows=1, schema={text: string})}

In [8]:
import evaluate
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    GPTJForCausalLM,
    AutoTokenizer,
    default_data_collator,
)
from transformers.utils.logging import disable_progress_bar, enable_progress_bar

from ray import train
from ray.train.huggingface.transformers import (
    prepare_trainer,
    RayTrainReportCallback
)


def train_func(config):
    # Use the actual number of CPUs assigned by Ray
    os.environ["OMP_NUM_THREADS"] = str(
        train.get_context().get_trial_resources().bundles[-1].get("CPU", 1)
    )
    # Enable tf32 for better performance
    torch.backends.cuda.matmul.allow_tf32 = True

    batch_size = config.get("batch_size", 4)
    epochs = config.get("epochs", 2)
    warmup_steps = config.get("warmup_steps", 0)
    learning_rate = config.get("learning_rate", 0.00002)
    weight_decay = config.get("weight_decay", 0.01)
    steps_per_epoch = config.get("steps_per_epoch")

    deepspeed = {
        "fp16": {
            "enabled": "auto",
            "initial_scale_power": 8,
        },
        "bf16": {"enabled": "auto"},
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
        },
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True,
            },
            "offload_param": {
                "device": "cpu",
                "pin_memory": True,
            },
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            "stage3_prefetch_bucket_size": "auto",
            "stage3_param_persistence_threshold": "auto",
            "gather_16bit_weights_on_model_save": True,
            "round_robin_gradients": True,
        },
        "gradient_accumulation_steps": "auto",
        "gradient_clipping": "auto",
        "steps_per_print": 10,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "wall_clock_breakdown": False,
    }

    print("Preparing training arguments")
    training_args = TrainingArguments(
        "output",
        logging_steps=1,
        save_strategy="steps",
        save_steps=steps_per_epoch,
        max_steps=steps_per_epoch * epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        label_names=["input_ids", "attention_mask"],
        push_to_hub=False,
        report_to="none",
        disable_tqdm=True,  # declutter the output a little
        fp16=True,
        gradient_checkpointing=True,
        deepspeed=deepspeed,
    )
    disable_progress_bar()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading model")

    model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)
    model.resize_token_embeddings(len(tokenizer))

    print("Model loaded")

    enable_progress_bar()

    metric = evaluate.load("accuracy")

    train_ds = train.get_dataset_shard("train")
    eval_ds = train.get_dataset_shard("validation")

    train_ds_iterable = train_ds.iter_torch_batches(batch_size=batch_size)
    eval_ds_iterable = eval_ds.iter_torch_batches(batch_size=batch_size)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_iterable,
        eval_dataset=eval_ds_iterable,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    # Add callback to report checkpoints to Ray Train
    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

In [9]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig

batch_size = 16
train_ds_size = processed_datasets["train"].count()
steps_per_epoch = train_ds_size // (batch_size * num_workers)

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={
        "epochs": 1,
        "batch_size": batch_size,  # per device
        "steps_per_epoch": steps_per_epoch
    },
    scaling_config=ScalingConfig(
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker={"GPU": 1, "CPU": cpus_per_worker},
    ),
    datasets=processed_datasets,
    run_config=RunConfig(storage_path="/models"),
)

2024-01-24 09:34:11,896	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle]
2024-01-24 09:34:11,898	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2024-01-24 09:34:11,901	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle 1:   0%|          | 0/1 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/1 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/1 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
results = trainer.fit()

0,1
Current time:,2024-01-24 11:56:20
Running for:,02:21:55.36
Memory:,44.8/2015.5 GiB

Trial name,status,loc,iter,total time (s),loss,learning_rate,epoch
TorchTrainer_c3062_00000,TERMINATED,192.168.64.2:266117,1,8277.92,0.0658,5.93472e-08,1


[36m(TrainTrainable pid=266117)[0m [2024-01-24 09:34:31,353] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[36m(RayTrainWorker pid=266237)[0m Setting up process group for: env:// [rank=0, world_size=2]
[36m(TorchTrainer pid=266117)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=266117)[0m - (ip=192.168.64.2, pid=266237) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=266117)[0m - (ip=192.168.64.2, pid=266238) world_rank=1, local_rank=1, node_rank=0


[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:34:40,446] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[36m(RayTrainWorker pid=266238)[0m [2024-01-24 09:34:40,440] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[36m(SplitCoordinator pid=266453)[0m Auto configuring locality_with_output=['f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb', 'f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb']


[36m(RayTrainWorker pid=266237)[0m Preparing training arguments
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:34:41,633] [INFO] [comm.py:637:init_distributed] cdb=None
[36m(RayTrainWorker pid=266237)[0m Loading model
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:04,380] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 285, num_elems = 6.05B
[36m(RayTrainWorker pid=266238)[0m Preparing training arguments
[36m(RayTrainWorker pid=266238)[0m [2024-01-24 09:34:41,632] [INFO] [comm.py:637:init_distributed] cdb=None
[36m(RayTrainWorker pid=266238)[0m Loading model
[36m(RayTrainWorker pid=266237)[0m Model loaded
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:27,260] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.12.3, git-hash=unknown, git-branch=unknown
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:27,280] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False


[36m(RayTrainWorker pid=266237)[0m max_steps is given, it will override any value given in num_train_epochs
[36m(RayTrainWorker pid=266237)[0m Using cuda_amp half precision backend
[36m(SplitCoordinator pid=266454)[0m Auto configuring locality_with_output=['f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb', 'f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb']
[36m(RayTrainWorker pid=266237)[0m Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
[36m(RayTrainWorker pid=266237)[0m Creating extension directory /root/.cache/torch_extensions/py310_cu118/cpu_adam...
[36m(RayTrainWorker pid=266237)[0m Detected CUDA files, patching ldflags
[36m(RayTrainWorker pid=266237)[0m Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
[36m(RayTrainWorker pid=266237)[0m Building extension module cpu_adam...
[36m(RayTrainWorker pid=266237)[0m Allowing ninja to set a default number of workers... (overridable 

[36m(RayTrainWorker pid=266237)[0m [1/4] /usr/local/cuda/bin/nvcc  -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/lib/python3.10/dist-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_90,code=sm_90 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++17 -U__CUD

[36m(RayTrainWorker pid=266237)[0m Loading extension module cpu_adam...
[36m(RayTrainWorker pid=266238)[0m Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...


[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,356] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,356] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:35:56,386] [INFO] [logging.py:96:log_dist

[36m(RayTrainWorker pid=266237)[0m ***** Running training *****
[36m(RayTrainWorker pid=266237)[0m   Num examples = 10784
[36m(RayTrainWorker pid=266237)[0m   Num Epochs = 9223372036854775807
[36m(RayTrainWorker pid=266237)[0m   Instantaneous batch size per device = 8
[36m(RayTrainWorker pid=266237)[0m   Total train batch size (w. parallel, distributed & accumulation) = 16
[36m(RayTrainWorker pid=266237)[0m   Gradient Accumulation steps = 1
[36m(RayTrainWorker pid=266237)[0m   Total optimization steps = 674
[36m(RayTrainWorker pid=266237)[0m   Number of trainable parameters = 0
[36m(RayTrainWorker pid=266238)[0m Loading extension module cpu_adam...


(pid=266453) - MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle 1:   0%|          | 0/1 [00:00<?, ?…

(pid=266453) Shuffle Map 2:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=266453) Shuffle Reduce 3:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=266453) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[36m(SplitCoordinator pid=266453)[0m Executing DAG InputDataBuffer[Input] -> AllToAllOperator[MapBatches(split_text)->MapBatches(tokenize)->RandomShuffle] -> OutputSplitter[split(2, equal=True)]
[36m(SplitCoordinator pid=266453)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=9.0, gpu=2.0, object_store_memory=0.0), locality_with_output=['f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb', 'f04480c4dea75141f1a311fae660b2c4899ab20f4cce2c91bfa4affb'], preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[36m(SplitCoordinator pid=266453)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[36m(RayTrainWorker pid=266237)[0m   total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])


[36m(RayTrainWorker pid=266237)[0m {'loss': 12.2031, 'learning_rate': 1.997032640949555e-05, 'epoch': 0.0}




[36m(RayTrainWorker pid=266237)[0m {'loss': 6.9883, 'learning_rate': 1.99406528189911e-05, 'epoch': 0.0}


[36m(RayTrainWorker pid=266238)[0m   total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)])


[36m(RayTrainWorker pid=266237)[0m {'loss': 2.709, 'learning_rate': 1.991097922848665e-05, 'epoch': 0.0}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1581, 'learning_rate': 1.9881305637982196e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1187, 'learning_rate': 1.9851632047477747e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1158, 'learning_rate': 1.9821958456973295e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1129, 'learning_rate': 1.9792284866468846e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1031, 'learning_rate': 1.9762611275964394e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1045, 'learning_rate': 1.9732937685459942e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:38:32,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[1.9703264094955493e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:38:32,910] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=1.4892674986990257, CurrSamplesPerSec=1.4950116646745464, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0934, 'learning_rate': 1.9703264094955493e-05, 'epoch': 0.01}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0926, 'learning_rate': 1.967359050445104e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.091, 'learning_rate': 1.964391691394659e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0989, 'learning_rate': 1.9614243323442137e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0923, 'learning_rate': 1.9584569732937684e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0924, 'learning_rate': 1.9554896142433236e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0875, 'learning_rate': 1.9525222551928784e-05, 'epoch': 0.02}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0832, 'learning_rate': 1.9495548961424335e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0792, 'learning_rate': 1.9465875370919883e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0904, 'learning_rate': 1.943620178041543e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:40:24,200] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[1.940652818991098e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:40:24,201] [INFO] [timer.py:260:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=1.4606751628215193, CurrSamplesPerSec=1.4091414575825598, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0873, 'learning_rate': 1.940652818991098e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0904, 'learning_rate': 1.937685459940653e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0865, 'learning_rate': 1.9347181008902077e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.086, 'learning_rate': 1.931750741839763e-05, 'epoch': 0.03}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0857, 'learning_rate': 1.9287833827893176e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0874, 'learning_rate': 1.9258160237388724e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0836, 'learning_rate': 1.9228486646884275e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0862, 'learning_rate': 1.9198813056379823e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0844, 'learning_rate': 1.9169139465875374e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0846, 'learning_rate': 1.9139465875370922e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:42:20,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[1.910979228486647e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:42:20,855] [INFO] [timer.py:260:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=1.42790700193431, CurrSamplesPerSec=1.41897354735648, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0875, 'learning_rate': 1.910979228486647e-05, 'epoch': 0.04}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0824, 'learning_rate': 1.908011869436202e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0842, 'learning_rate': 1.905044510385757e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0826, 'learning_rate': 1.9020771513353117e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0857, 'learning_rate': 1.8991097922848668e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0825, 'learning_rate': 1.8961424332344216e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0818, 'learning_rate': 1.8931750741839763e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:43:37,944] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 256, but hysteresis is 2. Reducing hysteresis to 1
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0844, 'learning_rate': 1.8931750741839763e-05, 'epoch': 0.05}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0809, 'learning_rate': 1.8902077151335315e-05, 'epoch': 0.06}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0853, 'learning_rate': 1.8872403560830862e-05, 'epoch': 0.06}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:44:12,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=1, lr=[1.884272997032641e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:44:12,886] [INFO] [timer.py:260:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=1.428213078654362, CurrSamplesPerSec=1.4260356182595912, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0846, 'learning_rate': 1.884272997032641e-05, 'epoch': 0.06}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.084, 'learning_rate': 1.8813056379821958e-05, 'epoch': 0.06}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0746, 'learning_rate': 1.878338278931751e-05, 'epoch': 0.06}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0819, 'learning_rate': 1.8753709198813057e-05, 'epoch': 0.06}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0826, 'learning_rate': 1.8724035608308605e-05, 'epoch': 0.07}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0814, 'learning_rate': 1.8694362017804156e-05, 'epoch': 0.07}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 1.8664688427299704e-05, 'epoch': 0.07}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0833, 'learning_rate': 1.863501483679525e-05, 'epoch': 0.07}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0775, 'learning_rate': 1.8605341246290803e-05, 'epoch': 0.07}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0938, 'learning_rate': 1.857566765578635e-05, 'epoch': 0.07}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:46:01,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=1, lr=[1.8545994065281902e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:46:01,923] [INFO] [timer.py:260:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=1.4364017221690826, CurrSamplesPerSec=1.5660310882618487, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0829, 'learning_rate': 1.8545994065281902e-05, 'epoch': 0.07}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0758, 'learning_rate': 1.851632047477745e-05, 'epoch': 0.08}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0794, 'learning_rate': 1.8486646884272997e-05, 'epoch': 0.08}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0738, 'learning_rate': 1.845697329376855e-05, 'epoch': 0.08}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0822, 'learning_rate': 1.8427299703264096e-05, 'epoch': 0.08}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:46:52,112] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 256, reducing to 128
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0856, 'learning_rate': 1.8427299703264096e-05, 'epoch': 0.08}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0845, 'learning_rate': 1.8397626112759644e-05, 'epoch': 0.08}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0768, 'learning_rate': 1.8367952522255195e-05, 'epoch': 0.08}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0799, 'learning_rate': 1.8338278931750743e-05, 'epoch': 0.09}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0769, 'learning_rate': 1.830860534124629e-05, 'epoch': 0.09}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:47:45,508] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=2, lr=[1.8278931750741842e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:47:45,509] [INFO] [timer.py:260:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=1.4540972364106726, CurrSamplesPerSec=1.5508400837962362, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0813, 'learning_rate': 1.8278931750741842e-05, 'epoch': 0.09}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0755, 'learning_rate': 1.824925816023739e-05, 'epoch': 0.09}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0842, 'learning_rate': 1.821958456973294e-05, 'epoch': 0.09}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0753, 'learning_rate': 1.818991097922849e-05, 'epoch': 0.09}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0791, 'learning_rate': 1.8160237388724037e-05, 'epoch': 0.09}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.088, 'learning_rate': 1.8130563798219588e-05, 'epoch': 0.1}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0789, 'learning_rate': 1.8100890207715136e-05, 'epoch': 0.1}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0819, 'learning_rate': 1.8071216617210684e-05, 'epoch': 0.1}




[36m(RayTrainWorker pid=266237)[0m {'loss': 1.0042, 'learning_rate': 1.804154302670623e-05, 'epoch': 0.1}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0738, 'learning_rate': 1.801186943620178e-05, 'epoch': 0.1}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:49:33,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=2, lr=[1.798219584569733e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:49:33,346] [INFO] [timer.py:260:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=1.4585198994946096, CurrSamplesPerSec=1.5101015374268496, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0833, 'learning_rate': 1.798219584569733e-05, 'epoch': 0.1}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0781, 'learning_rate': 1.7952522255192878e-05, 'epoch': 0.11}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0799, 'learning_rate': 1.792284866468843e-05, 'epoch': 0.11}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0825, 'learning_rate': 1.7893175074183977e-05, 'epoch': 0.11}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 1.7863501483679525e-05, 'epoch': 0.11}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.08, 'learning_rate': 1.7833827893175076e-05, 'epoch': 0.11}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.074, 'learning_rate': 1.7804154302670624e-05, 'epoch': 0.11}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.088, 'learning_rate': 1.7774480712166172e-05, 'epoch': 0.11}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0829, 'learning_rate': 1.7744807121661723e-05, 'epoch': 0.12}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0826, 'learning_rate': 1.771513353115727e-05, 'epoch': 0.12}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:51:20,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=2, lr=[1.7685459940652822e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:51:20,990] [INFO] [timer.py:260:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=1.4621503771993423, CurrSamplesPerSec=1.5386634307185516, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.082, 'learning_rate': 1.7685459940652822e-05, 'epoch': 0.12}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0773, 'learning_rate': 1.765578635014837e-05, 'epoch': 0.12}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0839, 'learning_rate': 1.7626112759643918e-05, 'epoch': 0.12}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0764, 'learning_rate': 1.759643916913947e-05, 'epoch': 0.12}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0782, 'learning_rate': 1.7566765578635017e-05, 'epoch': 0.12}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.077, 'learning_rate': 1.7537091988130565e-05, 'epoch': 0.13}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.083, 'learning_rate': 1.7507418397626116e-05, 'epoch': 0.13}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.2017, 'learning_rate': 1.7477744807121664e-05, 'epoch': 0.13}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0764, 'learning_rate': 1.744807121661721e-05, 'epoch': 0.13}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0793, 'learning_rate': 1.7418397626112763e-05, 'epoch': 0.13}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:53:11,171] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=2, lr=[1.738872403560831e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:53:11,172] [INFO] [timer.py:260:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=1.4611024410325115, CurrSamplesPerSec=1.5565395843808918, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0699, 'learning_rate': 1.738872403560831e-05, 'epoch': 0.13}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0771, 'learning_rate': 1.735905044510386e-05, 'epoch': 0.14}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0816, 'learning_rate': 1.732937685459941e-05, 'epoch': 0.14}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0789, 'learning_rate': 1.7299703264094957e-05, 'epoch': 0.14}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.1288, 'learning_rate': 1.7270029673590505e-05, 'epoch': 0.14}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0747, 'learning_rate': 1.7240356083086053e-05, 'epoch': 0.14}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0836, 'learning_rate': 1.7210682492581604e-05, 'epoch': 0.14}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0752, 'learning_rate': 1.7181008902077152e-05, 'epoch': 0.14}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0839, 'learning_rate': 1.71513353115727e-05, 'epoch': 0.15}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0859, 'learning_rate': 1.712166172106825e-05, 'epoch': 0.15}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:54:59,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=2, lr=[1.70919881305638e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:54:59,953] [INFO] [timer.py:260:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=1.4621711299212994, CurrSamplesPerSec=1.4185404622423308, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0862, 'learning_rate': 1.70919881305638e-05, 'epoch': 0.15}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0748, 'learning_rate': 1.706231454005935e-05, 'epoch': 0.15}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0711, 'learning_rate': 1.7032640949554898e-05, 'epoch': 0.15}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0773, 'learning_rate': 1.7002967359050445e-05, 'epoch': 0.15}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.077, 'learning_rate': 1.6973293768545997e-05, 'epoch': 0.15}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.078, 'learning_rate': 1.6943620178041544e-05, 'epoch': 0.16}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0797, 'learning_rate': 1.6913946587537092e-05, 'epoch': 0.16}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 1.6884272997032643e-05, 'epoch': 0.16}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.083, 'learning_rate': 1.685459940652819e-05, 'epoch': 0.16}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 1.682492581602374e-05, 'epoch': 0.16}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:56:51,243] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=2, lr=[1.679525222551929e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:56:51,244] [INFO] [timer.py:260:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=1.4599463107917094, CurrSamplesPerSec=1.5484347680643802, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.073, 'learning_rate': 1.679525222551929e-05, 'epoch': 0.16}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0732, 'learning_rate': 1.6765578635014838e-05, 'epoch': 0.16}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0784, 'learning_rate': 1.673590504451039e-05, 'epoch': 0.17}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0738, 'learning_rate': 1.6706231454005937e-05, 'epoch': 0.17}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0825, 'learning_rate': 1.6676557863501485e-05, 'epoch': 0.17}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.077, 'learning_rate': 1.6646884272997036e-05, 'epoch': 0.17}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0736, 'learning_rate': 1.6617210682492584e-05, 'epoch': 0.17}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0718, 'learning_rate': 1.658753709198813e-05, 'epoch': 0.17}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0802, 'learning_rate': 1.6557863501483683e-05, 'epoch': 0.18}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0819, 'learning_rate': 1.6528189910979227e-05, 'epoch': 0.18}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:58:42,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=2, lr=[1.649851632047478e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 09:58:42,441] [INFO] [timer.py:260:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=1.458204584467073, CurrSamplesPerSec=1.4998425932022346, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0733, 'learning_rate': 1.649851632047478e-05, 'epoch': 0.18}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0714, 'learning_rate': 1.6468842729970326e-05, 'epoch': 0.18}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 1.6439169139465877e-05, 'epoch': 0.18}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0846, 'learning_rate': 1.6409495548961425e-05, 'epoch': 0.18}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0721, 'learning_rate': 1.6379821958456973e-05, 'epoch': 0.18}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0804, 'learning_rate': 1.6350148367952524e-05, 'epoch': 0.19}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0814, 'learning_rate': 1.6320474777448072e-05, 'epoch': 0.19}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0758, 'learning_rate': 1.629080118694362e-05, 'epoch': 0.19}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0688, 'learning_rate': 1.626112759643917e-05, 'epoch': 0.19}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0762, 'learning_rate': 1.623145400593472e-05, 'epoch': 0.19}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:00:30,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=2, lr=[1.6201780415430267e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:00:30,129] [INFO] [timer.py:260:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=1.4603904442236388, CurrSamplesPerSec=1.5123468841171797, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0754, 'learning_rate': 1.6201780415430267e-05, 'epoch': 0.19}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0678, 'learning_rate': 1.6172106824925818e-05, 'epoch': 0.19}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0697, 'learning_rate': 1.6142433234421366e-05, 'epoch': 0.2}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0737, 'learning_rate': 1.6112759643916917e-05, 'epoch': 0.2}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0754, 'learning_rate': 1.6083086053412465e-05, 'epoch': 0.2}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0737, 'learning_rate': 1.6053412462908013e-05, 'epoch': 0.2}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0768, 'learning_rate': 1.6023738872403564e-05, 'epoch': 0.2}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0755, 'learning_rate': 1.599406528189911e-05, 'epoch': 0.2}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0819, 'learning_rate': 1.596439169139466e-05, 'epoch': 0.2}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0803, 'learning_rate': 1.593471810089021e-05, 'epoch': 0.21}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:02:26,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=2, lr=[1.590504451038576e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:02:26,659] [INFO] [timer.py:260:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=1.453742970688458, CurrSamplesPerSec=1.2991509089118924, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.067, 'learning_rate': 1.590504451038576e-05, 'epoch': 0.21}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0779, 'learning_rate': 1.5875370919881306e-05, 'epoch': 0.21}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0763, 'learning_rate': 1.5845697329376857e-05, 'epoch': 0.21}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0738, 'learning_rate': 1.5816023738872405e-05, 'epoch': 0.21}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0846, 'learning_rate': 1.5786350148367956e-05, 'epoch': 0.21}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0771, 'learning_rate': 1.57566765578635e-05, 'epoch': 0.22}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.075, 'learning_rate': 1.5727002967359052e-05, 'epoch': 0.22}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.072, 'learning_rate': 1.56973293768546e-05, 'epoch': 0.22}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0867, 'learning_rate': 1.5667655786350148e-05, 'epoch': 0.22}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0726, 'learning_rate': 1.56379821958457e-05, 'epoch': 0.22}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:04:28,734] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=2, lr=[1.5608308605341247e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:04:28,735] [INFO] [timer.py:260:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=1.443154795702591, CurrSamplesPerSec=1.3994228258005221, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0714, 'learning_rate': 1.5608308605341247e-05, 'epoch': 0.22}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0757, 'learning_rate': 1.5578635014836794e-05, 'epoch': 0.22}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0794, 'learning_rate': 1.5548961424332346e-05, 'epoch': 0.23}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0805, 'learning_rate': 1.5519287833827893e-05, 'epoch': 0.23}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0815, 'learning_rate': 1.5489614243323445e-05, 'epoch': 0.23}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0736, 'learning_rate': 1.5459940652818992e-05, 'epoch': 0.23}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0825, 'learning_rate': 1.543026706231454e-05, 'epoch': 0.23}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0703, 'learning_rate': 1.540059347181009e-05, 'epoch': 0.23}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0748, 'learning_rate': 1.537091988130564e-05, 'epoch': 0.23}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0732, 'learning_rate': 1.5341246290801187e-05, 'epoch': 0.24}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:06:24,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=2, lr=[1.5311572700296738e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:06:24,144] [INFO] [timer.py:260:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=1.4394767425182666, CurrSamplesPerSec=1.2914712515961564, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0702, 'learning_rate': 1.5311572700296738e-05, 'epoch': 0.24}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.071, 'learning_rate': 1.5281899109792286e-05, 'epoch': 0.24}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0821, 'learning_rate': 1.5252225519287836e-05, 'epoch': 0.24}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0833, 'learning_rate': 1.5222551928783385e-05, 'epoch': 0.24}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 1.5192878338278933e-05, 'epoch': 0.24}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0699, 'learning_rate': 1.5163204747774482e-05, 'epoch': 0.24}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.071, 'learning_rate': 1.5133531157270032e-05, 'epoch': 0.25}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0774, 'learning_rate': 1.5103857566765581e-05, 'epoch': 0.25}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0747, 'learning_rate': 1.507418397626113e-05, 'epoch': 0.25}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0661, 'learning_rate': 1.5044510385756679e-05, 'epoch': 0.25}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:08:23,297] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=2, lr=[1.5014836795252228e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:08:23,298] [INFO] [timer.py:260:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=1.4333905535990081, CurrSamplesPerSec=1.3257681940317414, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0867, 'learning_rate': 1.5014836795252228e-05, 'epoch': 0.25}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0748, 'learning_rate': 1.4985163204747774e-05, 'epoch': 0.25}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0758, 'learning_rate': 1.4955489614243324e-05, 'epoch': 0.26}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0756, 'learning_rate': 1.4925816023738873e-05, 'epoch': 0.26}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.071, 'learning_rate': 1.4896142433234421e-05, 'epoch': 0.26}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0684, 'learning_rate': 1.486646884272997e-05, 'epoch': 0.26}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0741, 'learning_rate': 1.483679525222552e-05, 'epoch': 0.26}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0782, 'learning_rate': 1.480712166172107e-05, 'epoch': 0.26}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 1.4777448071216617e-05, 'epoch': 0.26}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0755, 'learning_rate': 1.4747774480712167e-05, 'epoch': 0.27}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:10:20,694] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=2, lr=[1.4718100890207716e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:10:20,694] [INFO] [timer.py:260:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=1.4292820318925363, CurrSamplesPerSec=1.4941810670405709, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0725, 'learning_rate': 1.4718100890207716e-05, 'epoch': 0.27}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0807, 'learning_rate': 1.4688427299703266e-05, 'epoch': 0.27}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0748, 'learning_rate': 1.4658753709198814e-05, 'epoch': 0.27}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0763, 'learning_rate': 1.4629080118694363e-05, 'epoch': 0.27}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0707, 'learning_rate': 1.4599406528189913e-05, 'epoch': 0.27}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0697, 'learning_rate': 1.456973293768546e-05, 'epoch': 0.27}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0733, 'learning_rate': 1.454005934718101e-05, 'epoch': 0.28}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0715, 'learning_rate': 1.451038575667656e-05, 'epoch': 0.28}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0785, 'learning_rate': 1.4480712166172109e-05, 'epoch': 0.28}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0773, 'learning_rate': 1.4451038575667657e-05, 'epoch': 0.28}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:12:11,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=2, lr=[1.4421364985163206e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:12:11,337] [INFO] [timer.py:260:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=1.4302127939863158, CurrSamplesPerSec=1.4997901688531732, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0735, 'learning_rate': 1.4421364985163206e-05, 'epoch': 0.28}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0776, 'learning_rate': 1.4391691394658756e-05, 'epoch': 0.28}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0748, 'learning_rate': 1.4362017804154305e-05, 'epoch': 0.28}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0789, 'learning_rate': 1.4332344213649853e-05, 'epoch': 0.29}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0746, 'learning_rate': 1.4302670623145403e-05, 'epoch': 0.29}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0673, 'learning_rate': 1.4272997032640952e-05, 'epoch': 0.29}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0744, 'learning_rate': 1.42433234421365e-05, 'epoch': 0.29}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 1.4213649851632048e-05, 'epoch': 0.29}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0803, 'learning_rate': 1.4183976261127597e-05, 'epoch': 0.29}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0735, 'learning_rate': 1.4154302670623145e-05, 'epoch': 0.3}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:14:10,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=2, lr=[1.4124629080118695e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:14:10,329] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=1.425669896745559, CurrSamplesPerSec=1.241231910324683, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0662, 'learning_rate': 1.4124629080118695e-05, 'epoch': 0.3}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0777, 'learning_rate': 1.4094955489614244e-05, 'epoch': 0.3}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.07, 'learning_rate': 1.4065281899109794e-05, 'epoch': 0.3}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0751, 'learning_rate': 1.4035608308605341e-05, 'epoch': 0.3}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0697, 'learning_rate': 1.4005934718100891e-05, 'epoch': 0.3}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0716, 'learning_rate': 1.397626112759644e-05, 'epoch': 0.3}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0795, 'learning_rate': 1.3946587537091988e-05, 'epoch': 0.31}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0705, 'learning_rate': 1.3916913946587538e-05, 'epoch': 0.31}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0705, 'learning_rate': 1.3887240356083087e-05, 'epoch': 0.31}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0712, 'learning_rate': 1.3857566765578637e-05, 'epoch': 0.31}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:16:02,998] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=2, lr=[1.3827893175074185e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:16:02,999] [INFO] [timer.py:260:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=1.4254382213555827, CurrSamplesPerSec=1.483051053841521, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0787, 'learning_rate': 1.3827893175074185e-05, 'epoch': 0.31}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0647, 'learning_rate': 1.3798219584569734e-05, 'epoch': 0.31}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0703, 'learning_rate': 1.3768545994065284e-05, 'epoch': 0.31}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0822, 'learning_rate': 1.3738872403560833e-05, 'epoch': 0.32}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0759, 'learning_rate': 1.370919881305638e-05, 'epoch': 0.32}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0673, 'learning_rate': 1.367952522255193e-05, 'epoch': 0.32}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0685, 'learning_rate': 1.364985163204748e-05, 'epoch': 0.32}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0717, 'learning_rate': 1.3620178041543028e-05, 'epoch': 0.32}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0669, 'learning_rate': 1.3590504451038577e-05, 'epoch': 0.32}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0801, 'learning_rate': 1.3560830860534127e-05, 'epoch': 0.32}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:17:55,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=2, lr=[1.3531157270029676e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:17:55,947] [INFO] [timer.py:260:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=1.42506799315921, CurrSamplesPerSec=1.454071514662017, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0674, 'learning_rate': 1.3531157270029676e-05, 'epoch': 0.33}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0719, 'learning_rate': 1.3501483679525224e-05, 'epoch': 0.33}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0738, 'learning_rate': 1.3471810089020773e-05, 'epoch': 0.33}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0727, 'learning_rate': 1.3442136498516321e-05, 'epoch': 0.33}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0754, 'learning_rate': 1.3412462908011869e-05, 'epoch': 0.33}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0784, 'learning_rate': 1.3382789317507419e-05, 'epoch': 0.33}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0745, 'learning_rate': 1.3353115727002968e-05, 'epoch': 0.34}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0716, 'learning_rate': 1.3323442136498516e-05, 'epoch': 0.34}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0667, 'learning_rate': 1.3293768545994065e-05, 'epoch': 0.34}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0743, 'learning_rate': 1.3264094955489615e-05, 'epoch': 0.34}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:19:48,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=2, lr=[1.3234421364985164e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:19:48,281] [INFO] [timer.py:260:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=1.4250740855704207, CurrSamplesPerSec=1.3658250937232181, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0784, 'learning_rate': 1.3234421364985164e-05, 'epoch': 0.34}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0727, 'learning_rate': 1.3204747774480712e-05, 'epoch': 0.34}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0735, 'learning_rate': 1.3175074183976262e-05, 'epoch': 0.34}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0659, 'learning_rate': 1.3145400593471811e-05, 'epoch': 0.35}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0682, 'learning_rate': 1.311572700296736e-05, 'epoch': 0.35}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0759, 'learning_rate': 1.3086053412462909e-05, 'epoch': 0.35}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0743, 'learning_rate': 1.3056379821958458e-05, 'epoch': 0.35}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0753, 'learning_rate': 1.3026706231454008e-05, 'epoch': 0.35}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0784, 'learning_rate': 1.2997032640949557e-05, 'epoch': 0.35}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0649, 'learning_rate': 1.2967359050445105e-05, 'epoch': 0.35}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:21:45,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=2, lr=[1.2937685459940654e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:21:45,059] [INFO] [timer.py:260:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=1.4227129144031079, CurrSamplesPerSec=1.3564809648298413, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0682, 'learning_rate': 1.2937685459940654e-05, 'epoch': 0.36}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0692, 'learning_rate': 1.2908011869436204e-05, 'epoch': 0.36}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0746, 'learning_rate': 1.2878338278931752e-05, 'epoch': 0.36}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0736, 'learning_rate': 1.2848664688427301e-05, 'epoch': 0.36}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0751, 'learning_rate': 1.281899109792285e-05, 'epoch': 0.36}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0685, 'learning_rate': 1.27893175074184e-05, 'epoch': 0.36}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0785, 'learning_rate': 1.2759643916913948e-05, 'epoch': 0.36}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0703, 'learning_rate': 1.2729970326409497e-05, 'epoch': 0.37}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0741, 'learning_rate': 1.2700296735905047e-05, 'epoch': 0.37}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0696, 'learning_rate': 1.2670623145400593e-05, 'epoch': 0.37}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:23:38,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=2, lr=[1.2640949554896143e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:23:38,988] [INFO] [timer.py:260:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=1.4219963962865254, CurrSamplesPerSec=1.4375946952007732, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0707, 'learning_rate': 1.2640949554896143e-05, 'epoch': 0.37}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0702, 'learning_rate': 1.2611275964391692e-05, 'epoch': 0.37}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0763, 'learning_rate': 1.258160237388724e-05, 'epoch': 0.37}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0763, 'learning_rate': 1.255192878338279e-05, 'epoch': 0.38}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0687, 'learning_rate': 1.2522255192878339e-05, 'epoch': 0.38}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0736, 'learning_rate': 1.2492581602373888e-05, 'epoch': 0.38}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0756, 'learning_rate': 1.2462908011869436e-05, 'epoch': 0.38}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0782, 'learning_rate': 1.2433234421364986e-05, 'epoch': 0.38}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0711, 'learning_rate': 1.2403560830860535e-05, 'epoch': 0.38}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0728, 'learning_rate': 1.2373887240356085e-05, 'epoch': 0.38}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:25:32,211] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=2, lr=[1.2344213649851633e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:25:32,212] [INFO] [timer.py:260:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=1.4216818931347666, CurrSamplesPerSec=1.469528436524472, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0676, 'learning_rate': 1.2344213649851633e-05, 'epoch': 0.39}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0677, 'learning_rate': 1.2314540059347182e-05, 'epoch': 0.39}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0785, 'learning_rate': 1.2284866468842732e-05, 'epoch': 0.39}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0799, 'learning_rate': 1.225519287833828e-05, 'epoch': 0.39}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0685, 'learning_rate': 1.2225519287833829e-05, 'epoch': 0.39}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0718, 'learning_rate': 1.2195845697329378e-05, 'epoch': 0.39}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0732, 'learning_rate': 1.2166172106824928e-05, 'epoch': 0.39}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.073, 'learning_rate': 1.2136498516320476e-05, 'epoch': 0.4}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0755, 'learning_rate': 1.2106824925816025e-05, 'epoch': 0.4}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0734, 'learning_rate': 1.2077151335311575e-05, 'epoch': 0.4}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:27:23,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=2, lr=[1.2047477744807124e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:27:23,556] [INFO] [timer.py:260:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=1.4222797532488014, CurrSamplesPerSec=1.395411677325289, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0798, 'learning_rate': 1.2047477744807124e-05, 'epoch': 0.4}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 1.2017804154302672e-05, 'epoch': 0.4}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0727, 'learning_rate': 1.1988130563798221e-05, 'epoch': 0.4}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0769, 'learning_rate': 1.1958456973293771e-05, 'epoch': 0.41}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 1.1928783382789319e-05, 'epoch': 0.41}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.072, 'learning_rate': 1.1899109792284867e-05, 'epoch': 0.41}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0696, 'learning_rate': 1.1869436201780416e-05, 'epoch': 0.41}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0685, 'learning_rate': 1.1839762611275964e-05, 'epoch': 0.41}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0692, 'learning_rate': 1.1810089020771513e-05, 'epoch': 0.41}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0763, 'learning_rate': 1.1780415430267063e-05, 'epoch': 0.41}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:29:18,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=2, lr=[1.1750741839762612e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:29:18,377] [INFO] [timer.py:260:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=1.4212530761333448, CurrSamplesPerSec=1.392234919461031, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 1.1750741839762612e-05, 'epoch': 0.42}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0733, 'learning_rate': 1.172106824925816e-05, 'epoch': 0.42}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0776, 'learning_rate': 1.169139465875371e-05, 'epoch': 0.42}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 1.166172106824926e-05, 'epoch': 0.42}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0689, 'learning_rate': 1.1632047477744807e-05, 'epoch': 0.42}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0734, 'learning_rate': 1.1602373887240357e-05, 'epoch': 0.42}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0681, 'learning_rate': 1.1572700296735906e-05, 'epoch': 0.42}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0757, 'learning_rate': 1.1543026706231456e-05, 'epoch': 0.43}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0794, 'learning_rate': 1.1513353115727003e-05, 'epoch': 0.43}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0753, 'learning_rate': 1.1483679525222553e-05, 'epoch': 0.43}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:31:07,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=2, lr=[1.1454005934718102e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:31:07,615] [INFO] [timer.py:260:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=1.4227441513527765, CurrSamplesPerSec=1.477182417133088, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0707, 'learning_rate': 1.1454005934718102e-05, 'epoch': 0.43}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 1.1424332344213652e-05, 'epoch': 0.43}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0685, 'learning_rate': 1.13946587537092e-05, 'epoch': 0.43}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0688, 'learning_rate': 1.1364985163204749e-05, 'epoch': 0.43}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0679, 'learning_rate': 1.1335311572700299e-05, 'epoch': 0.44}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0692, 'learning_rate': 1.1305637982195846e-05, 'epoch': 0.44}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0723, 'learning_rate': 1.1275964391691396e-05, 'epoch': 0.44}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0782, 'learning_rate': 1.1246290801186945e-05, 'epoch': 0.44}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0758, 'learning_rate': 1.1216617210682495e-05, 'epoch': 0.44}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0714, 'learning_rate': 1.1186943620178043e-05, 'epoch': 0.44}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:33:16,249] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=2, lr=[1.1157270029673592e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:33:16,250] [INFO] [timer.py:260:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=1.415939168578516, CurrSamplesPerSec=1.1570177408597069, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0742, 'learning_rate': 1.1157270029673592e-05, 'epoch': 0.45}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0717, 'learning_rate': 1.112759643916914e-05, 'epoch': 0.45}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0686, 'learning_rate': 1.1068249258160237e-05, 'epoch': 0.45}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0735, 'learning_rate': 1.1038575667655787e-05, 'epoch': 0.45}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0677, 'learning_rate': 1.1008902077151335e-05, 'epoch': 0.45}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 1.0979228486646884e-05, 'epoch': 0.45}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0822, 'learning_rate': 1.0949554896142434e-05, 'epoch': 0.46}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0781, 'learning_rate': 1.0919881305637983e-05, 'epoch': 0.46}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0733, 'learning_rate': 1.0890207715133531e-05, 'epoch': 0.46}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:35:32,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=2, lr=[1.086053412462908e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:35:32,102] [INFO] [timer.py:260:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=1.4067350886798402, CurrSamplesPerSec=1.19732204961454, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0632, 'learning_rate': 1.086053412462908e-05, 'epoch': 0.46}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.071, 'learning_rate': 1.083086053412463e-05, 'epoch': 0.46}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0747, 'learning_rate': 1.080118694362018e-05, 'epoch': 0.46}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0691, 'learning_rate': 1.0771513353115727e-05, 'epoch': 0.46}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0728, 'learning_rate': 1.0741839762611277e-05, 'epoch': 0.47}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.077, 'learning_rate': 1.0712166172106826e-05, 'epoch': 0.47}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0723, 'learning_rate': 1.0652818991097924e-05, 'epoch': 0.47}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.077, 'learning_rate': 1.0623145400593473e-05, 'epoch': 0.47}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0726, 'learning_rate': 1.0593471810089023e-05, 'epoch': 0.47}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:37:48,506] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=2, lr=[1.056379821958457e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:37:48,506] [INFO] [timer.py:260:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=1.398005895709546, CurrSamplesPerSec=1.1854848062208285, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0725, 'learning_rate': 1.056379821958457e-05, 'epoch': 0.47}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0714, 'learning_rate': 1.053412462908012e-05, 'epoch': 0.48}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0728, 'learning_rate': 1.050445103857567e-05, 'epoch': 0.48}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 1.0474777448071219e-05, 'epoch': 0.48}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0735, 'learning_rate': 1.0445103857566767e-05, 'epoch': 0.48}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0724, 'learning_rate': 1.0415430267062316e-05, 'epoch': 0.48}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0711, 'learning_rate': 1.0385756676557866e-05, 'epoch': 0.48}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0714, 'learning_rate': 1.0356083086053412e-05, 'epoch': 0.49}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0691, 'learning_rate': 1.0326409495548961e-05, 'epoch': 0.49}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0749, 'learning_rate': 1.0296735905044511e-05, 'epoch': 0.49}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:40:04,187] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=2, lr=[1.0267062314540059e-05], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:40:04,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=1.3901709690913864, CurrSamplesPerSec=1.1747615220874748, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0722, 'learning_rate': 1.0267062314540059e-05, 'epoch': 0.49}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0751, 'learning_rate': 1.0237388724035608e-05, 'epoch': 0.49}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0655, 'learning_rate': 1.0207715133531158e-05, 'epoch': 0.49}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0768, 'learning_rate': 1.0178041543026707e-05, 'epoch': 0.49}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0683, 'learning_rate': 1.0148367952522255e-05, 'epoch': 0.5}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0699, 'learning_rate': 1.0118694362017805e-05, 'epoch': 0.5}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0608, 'learning_rate': 1.0089020771513354e-05, 'epoch': 0.5}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.074, 'learning_rate': 1.0059347181008904e-05, 'epoch': 0.5}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0673, 'learning_rate': 1.0029673590504451e-05, 'epoch': 0.5}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 1e-05, 'epoch': 0.5}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:42:19,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=2, lr=[9.97032640949555e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:42:19,931] [INFO] [timer.py:260:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=1.3828606602140774, CurrSamplesPerSec=1.1906250678065446, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.071, 'learning_rate': 9.97032640949555e-06, 'epoch': 0.5}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 9.940652818991098e-06, 'epoch': 0.51}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0662, 'learning_rate': 9.910979228486648e-06, 'epoch': 0.51}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0665, 'learning_rate': 9.881305637982197e-06, 'epoch': 0.51}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0696, 'learning_rate': 9.851632047477747e-06, 'epoch': 0.51}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0698, 'learning_rate': 9.821958456973294e-06, 'epoch': 0.51}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0702, 'learning_rate': 9.792284866468842e-06, 'epoch': 0.51}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0753, 'learning_rate': 9.762611275964392e-06, 'epoch': 0.51}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 9.732937685459941e-06, 'epoch': 0.52}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0696, 'learning_rate': 9.70326409495549e-06, 'epoch': 0.52}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:44:35,896] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=2, lr=[9.673590504451039e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:44:35,897] [INFO] [timer.py:260:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=1.375960044841528, CurrSamplesPerSec=1.1850680452655977, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0721, 'learning_rate': 9.673590504451039e-06, 'epoch': 0.52}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0706, 'learning_rate': 9.643916913946588e-06, 'epoch': 0.52}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 9.614243323442138e-06, 'epoch': 0.52}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0719, 'learning_rate': 9.584569732937687e-06, 'epoch': 0.52}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0685, 'learning_rate': 9.554896142433235e-06, 'epoch': 0.53}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.063, 'learning_rate': 9.525222551928784e-06, 'epoch': 0.53}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.073, 'learning_rate': 9.495548961424334e-06, 'epoch': 0.53}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0722, 'learning_rate': 9.465875370919882e-06, 'epoch': 0.53}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0668, 'learning_rate': 9.436201780415431e-06, 'epoch': 0.53}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 9.406528189910979e-06, 'epoch': 0.53}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:46:52,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=2, lr=[9.376854599406528e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:46:52,234] [INFO] [timer.py:260:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=1.3693895729391865, CurrSamplesPerSec=1.1919664118036761, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0692, 'learning_rate': 9.376854599406528e-06, 'epoch': 0.53}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0717, 'learning_rate': 9.347181008902078e-06, 'epoch': 0.54}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0733, 'learning_rate': 9.317507418397626e-06, 'epoch': 0.54}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0692, 'learning_rate': 9.287833827893175e-06, 'epoch': 0.54}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0718, 'learning_rate': 9.258160237388725e-06, 'epoch': 0.54}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0617, 'learning_rate': 9.228486646884274e-06, 'epoch': 0.54}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0717, 'learning_rate': 9.198813056379822e-06, 'epoch': 0.54}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0699, 'learning_rate': 9.169139465875372e-06, 'epoch': 0.54}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0712, 'learning_rate': 9.139465875370921e-06, 'epoch': 0.55}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0713, 'learning_rate': 9.10979228486647e-06, 'epoch': 0.55}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:48:52,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=2, lr=[9.080118694362018e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:48:52,857] [INFO] [timer.py:260:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=1.3682045892787917, CurrSamplesPerSec=1.4740112870629456, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0681, 'learning_rate': 9.080118694362018e-06, 'epoch': 0.55}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0674, 'learning_rate': 9.050445103857568e-06, 'epoch': 0.55}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0691, 'learning_rate': 9.020771513353116e-06, 'epoch': 0.55}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.068, 'learning_rate': 8.991097922848665e-06, 'epoch': 0.55}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0649, 'learning_rate': 8.961424332344215e-06, 'epoch': 0.55}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.068, 'learning_rate': 8.931750741839763e-06, 'epoch': 0.56}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0728, 'learning_rate': 8.902077151335312e-06, 'epoch': 0.56}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 8.872403560830862e-06, 'epoch': 0.56}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0646, 'learning_rate': 8.842729970326411e-06, 'epoch': 0.56}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.075, 'learning_rate': 8.813056379821959e-06, 'epoch': 0.56}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:51:05,148] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=2, lr=[8.783382789317508e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:51:05,148] [INFO] [timer.py:260:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=1.3634900852148988, CurrSamplesPerSec=1.1468876580511347, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0651, 'learning_rate': 8.783382789317508e-06, 'epoch': 0.56}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0769, 'learning_rate': 8.753709198813058e-06, 'epoch': 0.57}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0776, 'learning_rate': 8.724035608308606e-06, 'epoch': 0.57}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0717, 'learning_rate': 8.694362017804155e-06, 'epoch': 0.57}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0659, 'learning_rate': 8.664688427299705e-06, 'epoch': 0.57}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0709, 'learning_rate': 8.635014836795252e-06, 'epoch': 0.57}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0706, 'learning_rate': 8.605341246290802e-06, 'epoch': 0.57}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0663, 'learning_rate': 8.57566765578635e-06, 'epoch': 0.57}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0676, 'learning_rate': 8.5459940652819e-06, 'epoch': 0.58}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0679, 'learning_rate': 8.516320474777449e-06, 'epoch': 0.58}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:53:23,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=2, lr=[8.486646884272998e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:53:23,730] [INFO] [timer.py:260:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=1.3571831619277281, CurrSamplesPerSec=1.1715647228366306, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0698, 'learning_rate': 8.486646884272998e-06, 'epoch': 0.58}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0777, 'learning_rate': 8.456973293768546e-06, 'epoch': 0.58}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0723, 'learning_rate': 8.427299703264096e-06, 'epoch': 0.58}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0768, 'learning_rate': 8.397626112759645e-06, 'epoch': 0.58}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0687, 'learning_rate': 8.367952522255195e-06, 'epoch': 0.58}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0654, 'learning_rate': 8.338278931750742e-06, 'epoch': 0.59}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0757, 'learning_rate': 8.308605341246292e-06, 'epoch': 0.59}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0754, 'learning_rate': 8.278931750741841e-06, 'epoch': 0.59}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0667, 'learning_rate': 8.24925816023739e-06, 'epoch': 0.59}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0652, 'learning_rate': 8.219584569732939e-06, 'epoch': 0.59}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:55:19,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=2, lr=[8.189910979228487e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:55:19,237] [INFO] [timer.py:260:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=1.357892920380424, CurrSamplesPerSec=1.462658190327069, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0636, 'learning_rate': 8.189910979228487e-06, 'epoch': 0.59}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0693, 'learning_rate': 8.160237388724036e-06, 'epoch': 0.59}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0782, 'learning_rate': 8.130563798219586e-06, 'epoch': 0.6}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0706, 'learning_rate': 8.100890207715133e-06, 'epoch': 0.6}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0634, 'learning_rate': 8.071216617210683e-06, 'epoch': 0.6}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0704, 'learning_rate': 8.041543026706232e-06, 'epoch': 0.6}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0719, 'learning_rate': 8.011869436201782e-06, 'epoch': 0.6}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 7.98219584569733e-06, 'epoch': 0.6}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0746, 'learning_rate': 7.95252225519288e-06, 'epoch': 0.61}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0772, 'learning_rate': 7.922848664688429e-06, 'epoch': 0.61}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:57:18,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=2, lr=[7.893175074183978e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:57:18,990] [INFO] [timer.py:260:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=1.3573676587189123, CurrSamplesPerSec=1.465095319697681, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0639, 'learning_rate': 7.893175074183978e-06, 'epoch': 0.61}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0674, 'learning_rate': 7.863501483679526e-06, 'epoch': 0.61}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0614, 'learning_rate': 7.833827893175074e-06, 'epoch': 0.61}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0658, 'learning_rate': 7.804154302670623e-06, 'epoch': 0.61}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0746, 'learning_rate': 7.774480712166173e-06, 'epoch': 0.61}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0631, 'learning_rate': 7.744807121661722e-06, 'epoch': 0.62}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 7.71513353115727e-06, 'epoch': 0.62}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0619, 'learning_rate': 7.68545994065282e-06, 'epoch': 0.62}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0705, 'learning_rate': 7.655786350148369e-06, 'epoch': 0.62}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0684, 'learning_rate': 7.626112759643918e-06, 'epoch': 0.62}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:59:11,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=2, lr=[7.5964391691394664e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 10:59:11,341] [INFO] [timer.py:260:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=1.3589086928909901, CurrSamplesPerSec=1.446031804791581, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0795, 'learning_rate': 7.5964391691394664e-06, 'epoch': 0.62}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0677, 'learning_rate': 7.566765578635016e-06, 'epoch': 0.62}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0652, 'learning_rate': 7.537091988130565e-06, 'epoch': 0.63}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0758, 'learning_rate': 7.507418397626114e-06, 'epoch': 0.63}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0674, 'learning_rate': 7.477744807121662e-06, 'epoch': 0.63}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0701, 'learning_rate': 7.4480712166172105e-06, 'epoch': 0.63}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0673, 'learning_rate': 7.41839762611276e-06, 'epoch': 0.63}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.075, 'learning_rate': 7.388724035608309e-06, 'epoch': 0.63}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0769, 'learning_rate': 7.359050445103858e-06, 'epoch': 0.64}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.072, 'learning_rate': 7.329376854599407e-06, 'epoch': 0.64}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:01:09,448] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=2, lr=[7.299703264094956e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:01:09,449] [INFO] [timer.py:260:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=1.3588286616797223, CurrSamplesPerSec=1.295884026167993, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0738, 'learning_rate': 7.299703264094956e-06, 'epoch': 0.64}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0687, 'learning_rate': 7.270029673590505e-06, 'epoch': 0.64}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0759, 'learning_rate': 7.2403560830860545e-06, 'epoch': 0.64}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0748, 'learning_rate': 7.210682492581603e-06, 'epoch': 0.64}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.058, 'learning_rate': 7.181008902077153e-06, 'epoch': 0.64}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0698, 'learning_rate': 7.151335311572701e-06, 'epoch': 0.65}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.067, 'learning_rate': 7.12166172106825e-06, 'epoch': 0.65}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0681, 'learning_rate': 7.091988130563799e-06, 'epoch': 0.65}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0692, 'learning_rate': 7.062314540059347e-06, 'epoch': 0.65}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0702, 'learning_rate': 7.032640949554897e-06, 'epoch': 0.65}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:03:09,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=2, lr=[7.0029673590504455e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:03:09,867] [INFO] [timer.py:260:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=1.3581424531574693, CurrSamplesPerSec=1.4834367099836339, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 7.0029673590504455e-06, 'epoch': 0.65}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 6.973293768545994e-06, 'epoch': 0.65}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0687, 'learning_rate': 6.943620178041544e-06, 'epoch': 0.66}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0683, 'learning_rate': 6.913946587537092e-06, 'epoch': 0.66}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0665, 'learning_rate': 6.884272997032642e-06, 'epoch': 0.66}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0668, 'learning_rate': 6.85459940652819e-06, 'epoch': 0.66}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.069, 'learning_rate': 6.82492581602374e-06, 'epoch': 0.66}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0659, 'learning_rate': 6.795252225519289e-06, 'epoch': 0.66}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0791, 'learning_rate': 6.765578635014838e-06, 'epoch': 0.66}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 6.735905044510387e-06, 'epoch': 0.67}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:05:05,970] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=2, lr=[6.7062314540059345e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:05:05,970] [INFO] [timer.py:260:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=1.358597744805287, CurrSamplesPerSec=1.47176220376556, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 6.7062314540059345e-06, 'epoch': 0.67}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0783, 'learning_rate': 6.676557863501484e-06, 'epoch': 0.67}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 6.646884272997033e-06, 'epoch': 0.67}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0725, 'learning_rate': 6.617210682492582e-06, 'epoch': 0.67}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0627, 'learning_rate': 6.587537091988131e-06, 'epoch': 0.67}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0655, 'learning_rate': 6.55786350148368e-06, 'epoch': 0.68}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0708, 'learning_rate': 6.528189910979229e-06, 'epoch': 0.68}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0623, 'learning_rate': 6.4985163204747785e-06, 'epoch': 0.68}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0664, 'learning_rate': 6.468842729970327e-06, 'epoch': 0.68}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0675, 'learning_rate': 6.439169139465876e-06, 'epoch': 0.68}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:07:01,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=2, lr=[6.409495548961425e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:07:01,410] [INFO] [timer.py:260:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=1.3592000638128947, CurrSamplesPerSec=1.3780504976271606, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.074, 'learning_rate': 6.409495548961425e-06, 'epoch': 0.68}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 6.379821958456974e-06, 'epoch': 0.68}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0678, 'learning_rate': 6.3501483679525235e-06, 'epoch': 0.69}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0751, 'learning_rate': 6.320474777448071e-06, 'epoch': 0.69}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0668, 'learning_rate': 6.29080118694362e-06, 'epoch': 0.69}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0742, 'learning_rate': 6.2611275964391694e-06, 'epoch': 0.69}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0723, 'learning_rate': 6.231454005934718e-06, 'epoch': 0.69}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0732, 'learning_rate': 6.201780415430268e-06, 'epoch': 0.69}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0677, 'learning_rate': 6.172106824925816e-06, 'epoch': 0.69}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0724, 'learning_rate': 6.142433234421366e-06, 'epoch': 0.7}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:08:57,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=2, lr=[6.112759643916914e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:08:57,423] [INFO] [timer.py:260:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=1.35963789128355, CurrSamplesPerSec=1.3906027974569628, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0728, 'learning_rate': 6.112759643916914e-06, 'epoch': 0.7}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0681, 'learning_rate': 6.083086053412464e-06, 'epoch': 0.7}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0658, 'learning_rate': 6.0534124629080126e-06, 'epoch': 0.7}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0713, 'learning_rate': 6.023738872403562e-06, 'epoch': 0.7}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 5.994065281899111e-06, 'epoch': 0.7}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 5.964391691394659e-06, 'epoch': 0.7}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0674, 'learning_rate': 5.934718100890208e-06, 'epoch': 0.71}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0697, 'learning_rate': 5.905044510385757e-06, 'epoch': 0.71}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0714, 'learning_rate': 5.875370919881306e-06, 'epoch': 0.71}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.071, 'learning_rate': 5.845697329376855e-06, 'epoch': 0.71}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:10:48,981] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=2, lr=[5.8160237388724035e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:10:48,982] [INFO] [timer.py:260:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=1.3611335266433635, CurrSamplesPerSec=1.4633030057506973, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0689, 'learning_rate': 5.8160237388724035e-06, 'epoch': 0.71}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.068, 'learning_rate': 5.786350148367953e-06, 'epoch': 0.71}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0699, 'learning_rate': 5.756676557863502e-06, 'epoch': 0.72}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0643, 'learning_rate': 5.727002967359051e-06, 'epoch': 0.72}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0688, 'learning_rate': 5.6973293768546e-06, 'epoch': 0.72}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 5.667655786350149e-06, 'epoch': 0.72}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0687, 'learning_rate': 5.637982195845698e-06, 'epoch': 0.72}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0674, 'learning_rate': 5.6083086053412475e-06, 'epoch': 0.72}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0683, 'learning_rate': 5.578635014836796e-06, 'epoch': 0.72}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 5.548961424332344e-06, 'epoch': 0.73}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:12:46,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=2, lr=[5.5192878338278934e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:12:46,054] [INFO] [timer.py:260:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=1.3612631312730614, CurrSamplesPerSec=1.4725529994202726, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0682, 'learning_rate': 5.5192878338278934e-06, 'epoch': 0.73}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0682, 'learning_rate': 5.489614243323442e-06, 'epoch': 0.73}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.065, 'learning_rate': 5.459940652818992e-06, 'epoch': 0.73}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 5.43026706231454e-06, 'epoch': 0.73}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0677, 'learning_rate': 5.40059347181009e-06, 'epoch': 0.73}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0734, 'learning_rate': 5.370919881305638e-06, 'epoch': 0.73}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0627, 'learning_rate': 5.341246290801188e-06, 'epoch': 0.74}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0647, 'learning_rate': 5.3115727002967366e-06, 'epoch': 0.74}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0686, 'learning_rate': 5.281899109792285e-06, 'epoch': 0.74}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0687, 'learning_rate': 5.252225519287835e-06, 'epoch': 0.74}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:14:44,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=2, lr=[5.222551928783383e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:14:44,493] [INFO] [timer.py:260:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=1.361068355380878, CurrSamplesPerSec=1.3893270671848414, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0649, 'learning_rate': 5.222551928783383e-06, 'epoch': 0.74}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 5.192878338278933e-06, 'epoch': 0.74}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.064, 'learning_rate': 5.163204747774481e-06, 'epoch': 0.74}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0733, 'learning_rate': 5.133531157270029e-06, 'epoch': 0.75}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0767, 'learning_rate': 5.103857566765579e-06, 'epoch': 0.75}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0661, 'learning_rate': 5.0741839762611275e-06, 'epoch': 0.75}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0711, 'learning_rate': 5.044510385756677e-06, 'epoch': 0.75}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0697, 'learning_rate': 5.014836795252226e-06, 'epoch': 0.75}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0759, 'learning_rate': 4.985163204747775e-06, 'epoch': 0.75}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0678, 'learning_rate': 4.955489614243324e-06, 'epoch': 0.76}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:16:38,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=2, lr=[4.925816023738873e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:16:38,462] [INFO] [timer.py:260:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=1.3619009900159735, CurrSamplesPerSec=1.4955967957895262, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0637, 'learning_rate': 4.925816023738873e-06, 'epoch': 0.76}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0657, 'learning_rate': 4.896142433234421e-06, 'epoch': 0.76}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0819, 'learning_rate': 4.866468842729971e-06, 'epoch': 0.76}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0675, 'learning_rate': 4.836795252225519e-06, 'epoch': 0.76}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0659, 'learning_rate': 4.807121661721069e-06, 'epoch': 0.76}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0668, 'learning_rate': 4.7774480712166174e-06, 'epoch': 0.76}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0708, 'learning_rate': 4.747774480712167e-06, 'epoch': 0.77}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.065, 'learning_rate': 4.718100890207716e-06, 'epoch': 0.77}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0659, 'learning_rate': 4.688427299703264e-06, 'epoch': 0.77}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0729, 'learning_rate': 4.658753709198813e-06, 'epoch': 0.77}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:18:30,797] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=2, lr=[4.629080118694362e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:18:30,797] [INFO] [timer.py:260:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=1.3630673707532712, CurrSamplesPerSec=1.2977156221654895, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0665, 'learning_rate': 4.629080118694362e-06, 'epoch': 0.77}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0657, 'learning_rate': 4.599406528189911e-06, 'epoch': 0.77}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.07, 'learning_rate': 4.5697329376854606e-06, 'epoch': 0.77}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 4.540059347181009e-06, 'epoch': 0.78}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0723, 'learning_rate': 4.510385756676558e-06, 'epoch': 0.78}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0671, 'learning_rate': 4.480712166172107e-06, 'epoch': 0.78}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.061, 'learning_rate': 4.451038575667656e-06, 'epoch': 0.78}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.066, 'learning_rate': 4.4213649851632055e-06, 'epoch': 0.78}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 4.391691394658754e-06, 'epoch': 0.78}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0649, 'learning_rate': 4.362017804154303e-06, 'epoch': 0.78}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:20:28,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=2, lr=[4.332344213649852e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:20:28,039] [INFO] [timer.py:260:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=1.363112414156186, CurrSamplesPerSec=1.333315081316411, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0674, 'learning_rate': 4.332344213649852e-06, 'epoch': 0.79}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0744, 'learning_rate': 4.302670623145401e-06, 'epoch': 0.79}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0629, 'learning_rate': 4.27299703264095e-06, 'epoch': 0.79}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0699, 'learning_rate': 4.243323442136499e-06, 'epoch': 0.79}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0715, 'learning_rate': 4.213649851632048e-06, 'epoch': 0.79}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0616, 'learning_rate': 4.183976261127597e-06, 'epoch': 0.79}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0702, 'learning_rate': 4.154302670623146e-06, 'epoch': 0.8}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0677, 'learning_rate': 4.124629080118695e-06, 'epoch': 0.8}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0665, 'learning_rate': 4.094955489614243e-06, 'epoch': 0.8}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0713, 'learning_rate': 4.065281899109793e-06, 'epoch': 0.8}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:22:26,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=2, lr=[4.0356083086053414e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:22:26,571] [INFO] [timer.py:260:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=1.362878548411476, CurrSamplesPerSec=1.365324722276532, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 4.0356083086053414e-06, 'epoch': 0.8}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0592, 'learning_rate': 4.005934718100891e-06, 'epoch': 0.8}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 3.97626112759644e-06, 'epoch': 0.8}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 3.946587537091989e-06, 'epoch': 0.81}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0718, 'learning_rate': 3.916913946587537e-06, 'epoch': 0.81}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0676, 'learning_rate': 3.887240356083086e-06, 'epoch': 0.81}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0679, 'learning_rate': 3.857566765578635e-06, 'epoch': 0.81}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0697, 'learning_rate': 3.8278931750741846e-06, 'epoch': 0.81}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0671, 'learning_rate': 3.7982195845697332e-06, 'epoch': 0.81}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0682, 'learning_rate': 3.7685459940652823e-06, 'epoch': 0.81}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:24:23,244] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=2, lr=[3.738872403560831e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:24:23,245] [INFO] [timer.py:260:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=1.363044652320043, CurrSamplesPerSec=1.4663934771546407, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0617, 'learning_rate': 3.738872403560831e-06, 'epoch': 0.82}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0629, 'learning_rate': 3.70919881305638e-06, 'epoch': 0.82}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0714, 'learning_rate': 3.679525222551929e-06, 'epoch': 0.82}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0681, 'learning_rate': 3.649851632047478e-06, 'epoch': 0.82}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0662, 'learning_rate': 3.6201780415430273e-06, 'epoch': 0.82}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0689, 'learning_rate': 3.5905044510385763e-06, 'epoch': 0.82}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0681, 'learning_rate': 3.560830860534125e-06, 'epoch': 0.82}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0652, 'learning_rate': 3.5311572700296736e-06, 'epoch': 0.83}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0656, 'learning_rate': 3.5014836795252227e-06, 'epoch': 0.83}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0696, 'learning_rate': 3.471810089020772e-06, 'epoch': 0.83}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:26:21,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=2, lr=[3.442136498516321e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:26:21,993] [INFO] [timer.py:260:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=1.362774821988641, CurrSamplesPerSec=1.4368364688843096, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 3.442136498516321e-06, 'epoch': 0.83}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0722, 'learning_rate': 3.41246290801187e-06, 'epoch': 0.83}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0685, 'learning_rate': 3.382789317507419e-06, 'epoch': 0.83}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.075, 'learning_rate': 3.3531157270029673e-06, 'epoch': 0.84}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0664, 'learning_rate': 3.3234421364985163e-06, 'epoch': 0.84}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0707, 'learning_rate': 3.2937685459940654e-06, 'epoch': 0.84}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0659, 'learning_rate': 3.2640949554896145e-06, 'epoch': 0.84}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0723, 'learning_rate': 3.2344213649851636e-06, 'epoch': 0.84}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0684, 'learning_rate': 3.2047477744807127e-06, 'epoch': 0.84}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0641, 'learning_rate': 3.1750741839762617e-06, 'epoch': 0.84}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:28:15,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=2, lr=[3.14540059347181e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:28:15,594] [INFO] [timer.py:260:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=1.3635660908456864, CurrSamplesPerSec=1.4710722156335951, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0703, 'learning_rate': 3.14540059347181e-06, 'epoch': 0.85}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 3.115727002967359e-06, 'epoch': 0.85}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 3.086053412462908e-06, 'epoch': 0.85}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 3.056379821958457e-06, 'epoch': 0.85}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0686, 'learning_rate': 3.0267062314540063e-06, 'epoch': 0.85}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0772, 'learning_rate': 2.9970326409495554e-06, 'epoch': 0.85}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0643, 'learning_rate': 2.967359050445104e-06, 'epoch': 0.85}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0665, 'learning_rate': 2.937685459940653e-06, 'epoch': 0.86}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0667, 'learning_rate': 2.9080118694362018e-06, 'epoch': 0.86}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0589, 'learning_rate': 2.878338278931751e-06, 'epoch': 0.86}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:30:08,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=2, lr=[2.8486646884273e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:30:08,142] [INFO] [timer.py:260:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=1.364542072248843, CurrSamplesPerSec=1.36089066985872, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0642, 'learning_rate': 2.8486646884273e-06, 'epoch': 0.86}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0621, 'learning_rate': 2.818991097922849e-06, 'epoch': 0.86}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0669, 'learning_rate': 2.789317507418398e-06, 'epoch': 0.86}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0622, 'learning_rate': 2.7596439169139467e-06, 'epoch': 0.86}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0646, 'learning_rate': 2.729970326409496e-06, 'epoch': 0.87}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.067, 'learning_rate': 2.700296735905045e-06, 'epoch': 0.87}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0689, 'learning_rate': 2.670623145400594e-06, 'epoch': 0.87}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 2.6409495548961426e-06, 'epoch': 0.87}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.074, 'learning_rate': 2.6112759643916917e-06, 'epoch': 0.87}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0675, 'learning_rate': 2.5816023738872403e-06, 'epoch': 0.87}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:32:06,932] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=2, lr=[2.5519287833827894e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:32:06,932] [INFO] [timer.py:260:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=1.3642505988223539, CurrSamplesPerSec=1.2903822401480263, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0654, 'learning_rate': 2.5519287833827894e-06, 'epoch': 0.88}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0653, 'learning_rate': 2.5222551928783385e-06, 'epoch': 0.88}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0676, 'learning_rate': 2.4925816023738876e-06, 'epoch': 0.88}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.069, 'learning_rate': 2.4629080118694367e-06, 'epoch': 0.88}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0678, 'learning_rate': 2.4332344213649853e-06, 'epoch': 0.88}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0625, 'learning_rate': 2.4035608308605344e-06, 'epoch': 0.88}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0604, 'learning_rate': 2.3738872403560835e-06, 'epoch': 0.88}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.073, 'learning_rate': 2.344213649851632e-06, 'epoch': 0.89}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0651, 'learning_rate': 2.314540059347181e-06, 'epoch': 0.89}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0627, 'learning_rate': 2.2848664688427303e-06, 'epoch': 0.89}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:34:05,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=2, lr=[2.255192878338279e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:34:05,448] [INFO] [timer.py:260:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=1.3640238829892728, CurrSamplesPerSec=1.2919688386703465, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0683, 'learning_rate': 2.255192878338279e-06, 'epoch': 0.89}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0657, 'learning_rate': 2.225519287833828e-06, 'epoch': 0.89}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0694, 'learning_rate': 2.195845697329377e-06, 'epoch': 0.89}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0721, 'learning_rate': 2.166172106824926e-06, 'epoch': 0.89}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0678, 'learning_rate': 2.136498516320475e-06, 'epoch': 0.9}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0673, 'learning_rate': 2.106824925816024e-06, 'epoch': 0.9}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0701, 'learning_rate': 2.077151335311573e-06, 'epoch': 0.9}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0756, 'learning_rate': 2.0474777448071216e-06, 'epoch': 0.9}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0672, 'learning_rate': 2.0178041543026707e-06, 'epoch': 0.9}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0763, 'learning_rate': 1.98813056379822e-06, 'epoch': 0.9}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:36:01,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=2, lr=[1.9584569732937684e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:36:01,386] [INFO] [timer.py:260:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=1.3642953944115868, CurrSamplesPerSec=1.3425842978557379, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0651, 'learning_rate': 1.9584569732937684e-06, 'epoch': 0.91}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0732, 'learning_rate': 1.9287833827893175e-06, 'epoch': 0.91}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0636, 'learning_rate': 1.8991097922848666e-06, 'epoch': 0.91}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0661, 'learning_rate': 1.8694362017804155e-06, 'epoch': 0.91}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0683, 'learning_rate': 1.8397626112759646e-06, 'epoch': 0.91}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0754, 'learning_rate': 1.8100890207715136e-06, 'epoch': 0.91}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0646, 'learning_rate': 1.7804154302670625e-06, 'epoch': 0.91}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0597, 'learning_rate': 1.7507418397626114e-06, 'epoch': 0.92}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0569, 'learning_rate': 1.7210682492581604e-06, 'epoch': 0.92}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0623, 'learning_rate': 1.6913946587537095e-06, 'epoch': 0.92}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:37:54,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=2, lr=[1.6617210682492582e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:37:54,918] [INFO] [timer.py:260:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=1.3650152450156101, CurrSamplesPerSec=1.4735959566881496, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0668, 'learning_rate': 1.6617210682492582e-06, 'epoch': 0.92}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0661, 'learning_rate': 1.6320474777448073e-06, 'epoch': 0.92}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0645, 'learning_rate': 1.6023738872403563e-06, 'epoch': 0.92}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0657, 'learning_rate': 1.572700296735905e-06, 'epoch': 0.92}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0597, 'learning_rate': 1.543026706231454e-06, 'epoch': 0.93}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0686, 'learning_rate': 1.5133531157270031e-06, 'epoch': 0.93}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0708, 'learning_rate': 1.483679525222552e-06, 'epoch': 0.93}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0716, 'learning_rate': 1.4540059347181009e-06, 'epoch': 0.93}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0639, 'learning_rate': 1.42433234421365e-06, 'epoch': 0.93}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0692, 'learning_rate': 1.394658753709199e-06, 'epoch': 0.93}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:39:56,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=2, lr=[1.364985163204748e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:39:56,912] [INFO] [timer.py:260:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=1.3641421531614368, CurrSamplesPerSec=1.3287658338365542, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0639, 'learning_rate': 1.364985163204748e-06, 'epoch': 0.93}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0669, 'learning_rate': 1.335311572700297e-06, 'epoch': 0.94}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0667, 'learning_rate': 1.3056379821958458e-06, 'epoch': 0.94}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0679, 'learning_rate': 1.2759643916913947e-06, 'epoch': 0.94}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0697, 'learning_rate': 1.2462908011869438e-06, 'epoch': 0.94}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0664, 'learning_rate': 1.2166172106824927e-06, 'epoch': 0.94}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0718, 'learning_rate': 1.1869436201780417e-06, 'epoch': 0.94}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0608, 'learning_rate': 1.1572700296735906e-06, 'epoch': 0.95}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 1.1275964391691395e-06, 'epoch': 0.95}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0731, 'learning_rate': 1.0979228486646885e-06, 'epoch': 0.95}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:42:12,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=2, lr=[1.0682492581602374e-06], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:42:12,983] [INFO] [timer.py:260:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=1.3607408732843125, CurrSamplesPerSec=1.1940400822786008, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0719, 'learning_rate': 1.0682492581602374e-06, 'epoch': 0.95}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0719, 'learning_rate': 1.0385756676557865e-06, 'epoch': 0.95}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0613, 'learning_rate': 1.0089020771513354e-06, 'epoch': 0.95}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0648, 'learning_rate': 9.792284866468842e-07, 'epoch': 0.95}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0657, 'learning_rate': 9.495548961424333e-07, 'epoch': 0.96}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0713, 'learning_rate': 9.198813056379823e-07, 'epoch': 0.96}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0765, 'learning_rate': 8.902077151335312e-07, 'epoch': 0.96}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0706, 'learning_rate': 8.605341246290802e-07, 'epoch': 0.96}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0693, 'learning_rate': 8.308605341246291e-07, 'epoch': 0.96}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0654, 'learning_rate': 8.011869436201782e-07, 'epoch': 0.96}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:44:28,056] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=2, lr=[7.71513353115727e-07], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:44:28,057] [INFO] [timer.py:260:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=1.357637750722591, CurrSamplesPerSec=1.1858972080969576, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0629, 'learning_rate': 7.71513353115727e-07, 'epoch': 0.96}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0715, 'learning_rate': 7.41839762611276e-07, 'epoch': 0.97}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0724, 'learning_rate': 7.12166172106825e-07, 'epoch': 0.97}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0646, 'learning_rate': 6.82492581602374e-07, 'epoch': 0.97}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0669, 'learning_rate': 6.528189910979229e-07, 'epoch': 0.97}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0667, 'learning_rate': 6.231454005934719e-07, 'epoch': 0.97}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0676, 'learning_rate': 5.934718100890209e-07, 'epoch': 0.97}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0678, 'learning_rate': 5.637982195845697e-07, 'epoch': 0.97}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0689, 'learning_rate': 5.341246290801187e-07, 'epoch': 0.98}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0625, 'learning_rate': 5.044510385756677e-07, 'epoch': 0.98}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:46:22,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=2, lr=[4.7477744807121665e-07], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:46:22,189] [INFO] [timer.py:260:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=1.3583000423940665, CurrSamplesPerSec=1.3506495262965912, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0669, 'learning_rate': 4.7477744807121665e-07, 'epoch': 0.98}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0677, 'learning_rate': 4.451038575667656e-07, 'epoch': 0.98}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0708, 'learning_rate': 4.1543026706231454e-07, 'epoch': 0.98}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0734, 'learning_rate': 3.857566765578635e-07, 'epoch': 0.98}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0651, 'learning_rate': 3.560830860534125e-07, 'epoch': 0.99}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0721, 'learning_rate': 3.2640949554896146e-07, 'epoch': 0.99}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0739, 'learning_rate': 2.9673590504451043e-07, 'epoch': 0.99}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.07, 'learning_rate': 2.6706231454005935e-07, 'epoch': 0.99}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0713, 'learning_rate': 2.3738872403560833e-07, 'epoch': 0.99}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.072, 'learning_rate': 2.0771513353115727e-07, 'epoch': 0.99}




[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:48:27,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=2, lr=[1.7804154302670624e-07], mom=[[0.9, 0.999]]
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:48:27,671] [INFO] [timer.py:260:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=1.3569860653970132, CurrSamplesPerSec=1.2611468068325433, MemAllocated=0.21GB, MaxMemAllocated=8.98GB
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0649, 'learning_rate': 1.7804154302670624e-07, 'epoch': 0.99}




[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0679, 'learning_rate': 1.4836795252225522e-07, 'epoch': 1.0}
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0681, 'learning_rate': 1.1869436201780416e-07, 'epoch': 1.0}
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0695, 'learning_rate': 8.902077151335312e-08, 'epoch': 1.0}
[36m(RayTrainWorker pid=266237)[0m {'loss': 0.0658, 'learning_rate': 5.934718100890208e-08, 'epoch': 1.0}


[36m(RayTrainWorker pid=266237)[0m Saving model checkpoint to output/checkpoint-674
[36m(RayTrainWorker pid=266237)[0m Configuration saved in output/checkpoint-674/config.json
[36m(RayTrainWorker pid=266237)[0m Configuration saved in output/checkpoint-674/generation_config.json
[36m(RayTrainWorker pid=266237)[0m Model weights saved in output/checkpoint-674/pytorch_model.bin
[36m(RayTrainWorker pid=266237)[0m tokenizer config file saved in output/checkpoint-674/tokenizer_config.json
[36m(RayTrainWorker pid=266237)[0m Special tokens file saved in output/checkpoint-674/special_tokens_map.json


[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:49:23,778] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step674 is about to be saved!
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:49:23,779] [INFO] [engine.py:3528:save_16bit_model] Saving model weights to output/checkpoint-674/pytorch_model.bin, tag: global_step674
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:49:23,779] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/checkpoint-674/pytorch_model.bin...
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:49:31,619] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/checkpoint-674/pytorch_model.bin.
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:49:31,619] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step674 is ready now!
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:49:31,628] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step674 is about to be saved!
[36m(R



[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:50:02,245] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/checkpoint-674/global_step674/zero_pp_rank_0_mp_rank_00_optim_states.pt.
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:50:02,246] [INFO] [engine.py:3417:_save_zero_checkpoint] zero checkpoint saved output/checkpoint-674/global_step674/zero_pp_rank_0_mp_rank_00_optim_states.pt
[36m(RayTrainWorker pid=266237)[0m [2024-01-24 11:50:04,484] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step674 is ready now!


[36m(RayTrainWorker pid=266238)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/models/TorchTrainer_2024-01-24_09-34-24/TorchTrainer_c3062_00000_0_2024-01-24_09-34-25/checkpoint_000000)
[36m(RayTrainWorker pid=266237)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/models/TorchTrainer_2024-01-24_09-34-24/TorchTrainer_c3062_00000_0_2024-01-24_09-34-25/checkpoint_000000)


[36m(RayTrainWorker pid=266237)[0m {'train_runtime': 8178.4812, 'train_samples_per_second': 1.319, 'train_steps_per_second': 0.082, 'train_loss': 0.10651105660005564, 'epoch': 1.0}


[36m(RayTrainWorker pid=266237)[0m 
[36m(RayTrainWorker pid=266237)[0m 
[36m(RayTrainWorker pid=266237)[0m Training completed. Do not forget to share your model on huggingface.co/models =)
[36m(RayTrainWorker pid=266237)[0m 
[36m(RayTrainWorker pid=266237)[0m 


[36m(RayTrainWorker pid=266237)[0m Adam Optimizer #0 is created with AVX512 arithmetic capability.
[36m(RayTrainWorker pid=266237)[0m Config: alpha=0.000020, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1


2024-01-24 11:56:20,563	INFO tune.py:1042 -- Total run time: 8515.61 seconds (8294.29 seconds for the tuning loop).


In [11]:
checkpoint = results.checkpoint
checkpoint

Checkpoint(filesystem=local, path=/models/TorchTrainer_2024-01-24_09-34-24/TorchTrainer_c3062_00000_0_2024-01-24_09-34-25/checkpoint_000000)

In [13]:
checkpoint.to_directory('new_models')

'new_models'

In [14]:
from transformers import pipeline, AutoTokenizer, GPTJForCausalLM

model = GPTJForCausalLM.from_pretrained("new_models/checkpoint")
tokenizer = AutoTokenizer.from_pretrained("new_models/checkpoint")
pipe = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    torch_dtype=torch.float16,
    device_map="auto",
)

  return self.fget.__get__(instance, owner)()


In [15]:
for sentence in pipe(["Romeo and Juliet", "Romeo", "Juliet"], do_sample=True, min_length=20):
    print(sentence)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Romeo and Juliet doff their names for mine,--or, if you can, my office;'}]
[{'generated_text': 'Romeo is slain. What of that? he did but owe thy life, my wife,'}]
[{'generated_text': 'Julietta, my child is dead to-night, she said; that she was dead.'}]
