# Using Huggingface Transformers with Tune

# Required Packages and Variables

In [None]:
# Install the notebook's dependencies.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# and the extras are quoted so the shell never treats `[...]` as a glob pattern.
%pip install -q "transformers[torch]" "ray[tune]"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
"""
This example uses the official
huggingface transformers `hyperparameter_search` API.
"""
import os
import pandas as pd

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.examples.pbt_transformers.utils import (
    download_data,
    build_compute_metrics_fn,
)
from ray.tune.schedulers import PopulationBasedTraining
from transformers import (
    glue_tasks_num_labels,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    GlueDataset,
    GlueDataTrainingArguments,
    TrainingArguments,
)

In [None]:
# glue_tasks_num_labels = {
    # "cola": 2,        # CoLA evaluates the grammaticality of sentences by determining whether they are linguistically acceptable or not.
    #                     Example: Sentence: “She eats pizza.” Label: Acceptable ||| Sentence: “Eats she pizza.” Label: Unacceptable
    #                     Use Case: CoLA helps LLMs understand syntactic structures and grammatical correctness.
    # "mnli": 3,        # MNLI assesses textual entailment, determining whether a hypothesis can be inferred from a given premise.
    #                     Example: Premise: “The cat chased the mouse.” Hypothesis: “The mouse was caught.” Label: Neutral
    #                     Use Case: MNLI is crucial for applications like question answering and information retrieval.
    # "mrpc": 2,        # MRPC focuses on paraphrase identification, distinguishing between sentences with similar meanings.
    #                     Example: Sentence 1: “The sun rises in the east.” Sentence 2: “The east sees the sun rise.” Label: Paraphrase
    #                     Use Case: MRPC aids LLMs in understanding semantic equivalence.
    # "sst-2": 2,       # SST-2 classifies sentences as positive or negative sentiment.
    #                     Example: Sentence: “The movie was fantastic!” Label: Positive
    #                     Use Case: SST-2 enables sentiment analysis in various applications.
    # "sts-b": 1,       # STS-B quantifies the semantic similarity between sentence pairs.
    #                     Example: Sentence 1: “The cat is on the mat.” Sentence 2: “The mat holds the cat.” Similarity Score: 4.5 (on a scale of 1 to 5)
    #                     Use Case: STS-B aids in measuring semantic relatedness.
    # "qqp": 2,         # QQP identifies duplicate question pairs.
    #                     Example: Question 1: “How does photosynthesis work?” Question 2: “What is the process of photosynthesis?” Label: Duplicate
    #                     Use Case: QQP assists in question matching and duplicate detection.
    # "qnli": 2,        # QNLI is similar to MNLI but focuses on question answering.
    #                     Example: Question: “What did the cat chase?” Sentence: “The cat chased the mouse.” Label: Entailment
    #                     Use Case: QNLI supports question-answering systems.
    # "rte": 2,         # RTE determines whether a hypothesis logically follows from a given premise.
    #                     Example: Premise: “The cat chased the mouse.” Hypothesis: “The mouse escaped.” Label: Not entailment
    #                     Use Case: RTE is essential for understanding logical relationships.
    # "wnli": 2,        # WNLI involves resolving pronoun references in ambiguous sentences.
    #                     Example: Sentence: “The cat chased the dog. It was fast.” Label: Entailment
    #                     Use Case: WNLI helps LLMs handle coreference resolution.
    # }

In [None]:
# Run-level knobs read by the cells below.
smoke_test = False  # True -> use dummy data (and a tiny model) for a quick check
num_samples = 8  # number of trials handed to the hyperparameter search
gpus_per_trial = 1  # GPUs requested per trial; <= 0 disables CUDA in TrainingArguments

# Data Preparation

In [None]:
# Data root: "./data" for real runs, "./test_data" when smoke testing.
data_dir_name = "./data" if not smoke_test else "./test_data"
# abspath() already resolves relative paths against the current working directory.
data_dir = os.path.abspath(data_dir_name)
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(data_dir, mode=0o755, exist_ok=True)

# GLUE task to fine-tune on; its data lives in an upper-cased sub-directory of data_dir.
task_name = "rte"
task_data_dir = os.path.join(data_dir, task_name.upper())
# Number of output classes (labels) for the chosen task.
num_labels = glue_tasks_num_labels[task_name]

# Tokenizer & Model With Config

In [None]:
# Checkpoint to fine-tune: BERT base for real runs, a tiny checkpoint when smoke testing.
model_name = (
    "bert-base-uncased" if not smoke_test else "sshleifer/tiny-distilroberta-base"
)

# Model configuration carrying the number of labels and the fine-tuning task name.
config = AutoConfig.from_pretrained(
    model_name, num_labels=num_labels, finetuning_task=task_name
)

# Download and cache the tokenizer; this instance is reused to build the datasets.
print("Downloading and caching Tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Download and cache the pre-trained model weights. The returned model is not kept;
# the trailing `;` stops Jupyter from echoing the full model repr as cell output.
print("Downloading and caching pre-trained model")
AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
);

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading and caching Tokenizer


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading and caching pre-trained model


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def get_model():
    """Return a fresh sequence-classification model for `model_name`.

    Loads the pre-trained checkpoint with the module-level `config`.
    Passed to `Trainer` as `model_init`.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    return model

# Dataset (GLUE)

In [None]:
# Download data.
download_data(task_name, data_dir)

data_args = GlueDataTrainingArguments(task_name=task_name, data_dir=task_data_dir)

train_dataset = GlueDataset(
    data_args, tokenizer=tokenizer, mode="train", cache_dir=task_data_dir
)
eval_dataset = GlueDataset(
    data_args, tokenizer=tokenizer, mode="dev", cache_dir=task_data_dir
)


Downloading dataset.
Data already exists. Using downloaded data for task rte from /content/data


# Training

In [None]:
# Baseline training arguments. Values tagged "config" are the ones that the search
# space / PBT mutations in the tuning cell also define.
training_args = TrainingArguments(
    # --- what to run and where ---
    output_dir=".",
    logging_dir="./logs",
    do_train=True,
    do_eval=True,
    no_cuda=gpus_per_trial <= 0,
    # --- evaluate and checkpoint once per epoch, keeping the best model ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- schedule ---
    num_train_epochs=2,  # config
    max_steps=-1,
    warmup_steps=0,
    # --- optimisation ---
    learning_rate=1e-5,  # config
    weight_decay=0.1,  # config
    per_device_train_batch_size=16,  # config
    per_device_eval_batch_size=16,  # config
    # --- reporting ---
    skip_memory_metrics=True,
    report_to="none",
)

# Metric function for the chosen GLUE task, built once and handed to the Trainer.
metrics_fn = build_compute_metrics_fn(task_name)

# The Trainer gets `model_init` (not a model instance), so a model is created via get_model.
trainer = Trainer(
    model_init=get_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=metrics_fn,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Hyperparameter Tuning

In [None]:
# Defining the search space
tune_config = {
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "num_train_epochs": tune.choice([2, 3, 4, 5]),
    "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
}

# Defining the scheduler
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_acc",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "per_device_train_batch_size": [16, 32, 64],
    },
)

# Defining the reporter
# Report table tp print in output
reporter = CLIReporter(
    parameter_columns={
        "weight_decay": "w_decay",
        "learning_rate": "lr",
        "per_device_train_batch_size": "train_bs/gpu",
        "num_train_epochs": "num_epochs",
    },
    metric_columns=["eval_acc", "eval_loss", "epoch", "training_iteration"],
)

# Starting the distributed hyperparameter tuning process
trainer.hyperparameter_search(
    hp_space=lambda _: tune_config,
    backend="ray",
    n_trials=num_samples,
    resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
    scheduler=scheduler,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
    stop={"training_iteration": 1} if smoke_test else None,
    progress_reporter=reporter,
    storage_path="~/ray_results/",
    name="tune_transformer_pbt",
    log_to_file=True,
)



+----------------------------------------------------------+
| Configuration for experiment     tune_transformer_pbt    |
+----------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator   |
| Scheduler                        PopulationBasedTraining |
| Number of trials                 8                       |
+----------------------------------------------------------+

View detailed results here: /root/ray_results/tune_transformer_pbt
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/driver_artifacts`

Trial status: 8 PENDING
Current time: 2024-06-09 14:18:05. Total running time: 0s
Logical resource usage: 0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T4)
+--------------------------------------------------------+
| Trial name               status       num_train_epochs |
+------------------------------------------

[36m(_objective pid=5968)[0m 2024-06-09 14:18:18.280726: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=5968)[0m 2024-06-09 14:18:18.280795: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=5968)[0m 2024-06-09 14:18:18.287489: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=5968)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=5968)[0m You should probably TRAIN th


Trial status: 1 RUNNING | 7 PENDING
Current time: 2024-06-09 14:18:35. Total running time: 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+--------------------------------------------------------+
| Trial name               status       num_train_epochs |
+--------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4 |
| _objective_165d9_00001   PENDING                     5 |
| _objective_165d9_00002   PENDING                     4 |
| _objective_165d9_00003   PENDING                     2 |
| _objective_165d9_00004   PENDING                     3 |
| _objective_165d9_00005   PENDING                     3 |
| _objective_165d9_00006   PENDING                     2 |
| _objective_165d9_00007   PENDING                     2 |
+--------------------------------------------------------+


  5%|▌         | 16/312 [00:11<03:02,  1.62it/s]
  5%|▌         | 17/312 [00:12<03:02,  1.62it/s]
  6%|▌         | 18/312 [00:12<03:00,  1.63it/s]
  6%|▌         | 19/312 [00:13<02:59,  1.63it/s]
  6%|▋         | 20/312 [00:14<02:58,  1.64it/s]
  7%|▋         | 21/312 [00:14<02:59,  1.62it/s]
  7%|▋         | 22/312 [00:15<02:58,  1.63it/s]
  7%|▋         | 23/312 [00:15<02:58,  1.62it/s]
  8%|▊         | 24/312 [00:16<02:57,  1.62it/s]
  8%|▊         | 25/312 [00:17<02:57,  1.62it/s]
  8%|▊         | 26/312 [00:17<02:56,  1.62it/s]
  9%|▊         | 27/312 [00:18<02:56,  1.62it/s]
  9%|▉         | 28/312 [00:19<02:56,  1.61it/s]
  9%|▉         | 29/312 [00:19<02:55,  1.61it/s]
 10%|▉         | 30/312 [00:20<02:55,  1.61it/s]
 10%|▉         | 31/312 [00:20<02:54,  1.61it/s]
 10%|█         | 32/312 [00:21<02:54,  1.61it/s]
 11%|█         | 33/312 [00:22<02:54,  1.60it/s]
 11%|█         | 34/312 [00:22<02:53,  1.61it/s]
 11%|█         | 35/312 [00:23<02:53,  1.60it/s]
 12%|█▏        | 36/

Trial status: 1 RUNNING | 7 PENDING
Current time: 2024-06-09 14:19:05. Total running time: 1min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+--------------------------------------------------------+
| Trial name               status       num_train_epochs |
+--------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4 |
| _objective_165d9_00001   PENDING                     5 |
| _objective_165d9_00002   PENDING                     4 |
| _objective_165d9_00003   PENDING                     2 |
| _objective_165d9_00004   PENDING                     3 |
| _objective_165d9_00005   PENDING                     3 |
| _objective_165d9_00006   PENDING                     2 |
| _objective_165d9_00007   PENDING                     2 |
+--------------------------------------------------------+


 20%|██        | 63/312 [00:41<02:43,  1.52it/s]
 21%|██        | 64/312 [00:42<02:43,  1.52it/s]
 21%|██        | 65/312 [00:42<02:41,  1.53it/s]
 21%|██        | 66/312 [00:43<02:40,  1.53it/s]
 21%|██▏       | 67/312 [00:44<02:40,  1.53it/s]
 22%|██▏       | 68/312 [00:44<02:39,  1.53it/s]
 22%|██▏       | 69/312 [00:45<02:39,  1.52it/s]
 22%|██▏       | 70/312 [00:46<02:39,  1.52it/s]
 23%|██▎       | 71/312 [00:46<02:39,  1.51it/s]
 23%|██▎       | 72/312 [00:47<02:39,  1.51it/s]
 23%|██▎       | 73/312 [00:48<02:38,  1.51it/s]
 24%|██▎       | 74/312 [00:48<02:38,  1.50it/s]
 24%|██▍       | 75/312 [00:49<02:37,  1.50it/s]
 24%|██▍       | 76/312 [00:50<02:37,  1.50it/s]
 25%|██▍       | 77/312 [00:50<02:36,  1.50it/s]
 25%|██▌       | 78/312 [00:51<02:27,  1.59it/s]
[36m(_objective pid=5968)[0m 
  0%|          | 0/9 [00:00<?, ?it/s][A
[36m(_objective pid=5968)[0m 
 22%|██▏       | 2/9 [00:00<00:00,  9.23it/s][A
[36m(_objective pid=5968)[0m 
 33%|███▎      | 3/9 [00:00<00

[36m(_objective pid=5968)[0m {'eval_loss': 0.691601037979126, 'eval_acc': 0.5487364620938628, 'eval_runtime': 1.9856, 'eval_samples_per_second': 139.501, 'eval_steps_per_second': 4.533, 'epoch': 1.0}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00000 finished iteration 1 at 2024-06-09 14:19:27. Total running time: 1min 22s
+-----------------------------------------------------------+
| Trial _objective_165d9_00000 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                 70.96711 |
| time_total_s                                     70.96711 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_acc                                          0.54874 |
| eval_loss                                          0.6916 |
| eval_runtime                                       1.9856 |
| eval_samples_per_second                           139.501 |
| eval_steps_per_second                               4.533 |
| objective                                         0.54874 |
+---------------------------

[36m(_objective pid=5968)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000000)



Trial _objective_165d9_00001 started with configuration:
+------------------------------------------------+
| Trial _objective_165d9_00001 config            |
+------------------------------------------------+
| learning_rate                            2e-05 |
| max_steps                                   -1 |
| num_train_epochs                             5 |
| per_device_eval_batch_size                  32 |
| per_device_train_batch_size                 32 |
| weight_decay                            0.1796 |
+------------------------------------------------+


[36m(_objective pid=6369)[0m 2024-06-09 14:19:34.643375: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=6369)[0m 2024-06-09 14:19:34.643428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=6369)[0m 2024-06-09 14:19:34.644760: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Trial status: 1 PAUSED | 1 RUNNING | 6 PENDING
Current time: 2024-06-09 14:19:35. Total running time: 1min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00002   PEND

[36m(_objective pid=6369)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=6369)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/390 [00:00<?, ?it/s]
  0%|          | 1/390 [00:01<08:52,  1.37s/it]
  1%|          | 2/390 [00:01<06:00,  1.08it/s]
  1%|          | 3/390 [00:02<05:17,  1.22it/s]
  1%|          | 4/390 [00:03<04:53,  1.32it/s]
  1%|▏         | 5/390 [00:04<04:41,  1.37it/s]
  2%|▏         | 6/390 [00:04<04:34,  1.40it/s]
  2%|▏         | 7/390 [00:05<04:29,  1.42it/s]
  2%|▏         | 8/390 [00:06<04:25,  1.44it/s]
  2%|▏         | 9/390 [00:06<04:23,  1.45it/s]
  3%|▎         | 10/390 [00:07<04:20,  1.46it/s]
  3%|▎         | 11/390 [00:08<04:19,  1.46it/s]
  3%|▎         | 12/390 [00:08<04:18,  1.46it/s]
  3%|▎         | 13/390 [0

Trial status: 1 PAUSED | 1 RUNNING | 6 PENDING
Current time: 2024-06-09 14:20:05. Total running time: 2min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00002   PENDIN

 10%|▉         | 38/390 [00:27<04:13,  1.39it/s]
 10%|█         | 39/390 [00:27<04:12,  1.39it/s]
 10%|█         | 40/390 [00:28<04:12,  1.39it/s]
 11%|█         | 41/390 [00:29<04:12,  1.38it/s]
 11%|█         | 42/390 [00:30<04:10,  1.39it/s]
 11%|█         | 43/390 [00:30<04:10,  1.39it/s]
 11%|█▏        | 44/390 [00:31<04:10,  1.38it/s]
 12%|█▏        | 45/390 [00:32<04:08,  1.39it/s]
 12%|█▏        | 46/390 [00:32<04:09,  1.38it/s]
 12%|█▏        | 47/390 [00:33<04:08,  1.38it/s]
 12%|█▏        | 48/390 [00:34<04:08,  1.38it/s]
 13%|█▎        | 49/390 [00:35<04:06,  1.38it/s]
 13%|█▎        | 50/390 [00:35<04:05,  1.38it/s]
 13%|█▎        | 51/390 [00:36<04:05,  1.38it/s]
 13%|█▎        | 52/390 [00:37<04:02,  1.39it/s]
 14%|█▎        | 53/390 [00:38<04:02,  1.39it/s]
 14%|█▍        | 54/390 [00:38<04:00,  1.40it/s]
 14%|█▍        | 55/390 [00:39<03:59,  1.40it/s]
 14%|█▍        | 56/390 [00:40<03:58,  1.40it/s]
 15%|█▍        | 57/390 [00:40<03:56,  1.41it/s]
 15%|█▍        | 58/

Trial status: 1 PAUSED | 1 RUNNING | 6 PENDING
Current time: 2024-06-09 14:20:35. Total running time: 2min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00002   PENDI

[36m(_objective pid=6369)[0m 
 89%|████████▉ | 8/9 [00:01<00:00,  4.59it/s][A
[36m(_objective pid=6369)[0m 
                                                
 20%|██        | 78/390 [00:57<03:22,  1.54it/s]
100%|██████████| 9/9 [00:01<00:00,  4.95it/s][A
                                             [A


[36m(_objective pid=6369)[0m {'eval_loss': 0.6899483799934387, 'eval_acc': 0.5054151624548736, 'eval_runtime': 2.0134, 'eval_samples_per_second': 137.579, 'eval_steps_per_second': 4.47, 'epoch': 1.0}


2024-06-09 14:20:46,455	INFO pbt.py:716 -- [pbt]: no checkpoint for trial _objective_165d9_00000. Skip exploit for Trial _objective_165d9_00001
[36m(_objective pid=6369)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000000)



Trial _objective_165d9_00001 finished iteration 1 at 2024-06-09 14:20:46. Total running time: 2min 41s
+-----------------------------------------------------------+
| Trial _objective_165d9_00001 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                 72.85864 |
| time_total_s                                     72.85864 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_acc                                          0.50542 |
| eval_loss                                         0.68995 |
| eval_runtime                                       2.0134 |
| eval_samples_per_second                           137.579 |
| eval_steps_per_second                                4.47 |
| objective                                         0.50542 |
+---------------------------

[36m(_objective pid=6369)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
[36m(_objective pid=6369)[0m  20%|██        | 78/390 [01:07<04:31,  1.15it/s]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r 20%|██        | 78/390 [01:07<04:31,  1.15it/s]',) kwargs: {}
[36m(_objective pid=6369)[0m ValueError when calling 'flush' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_0


Trial _objective_165d9_00002 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00002 config             |
+-------------------------------------------------+
| learning_rate                             3e-05 |
| max_steps                                    -1 |
| num_train_epochs                              4 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.02999 |
+-------------------------------------------------+


[36m(_objective pid=6758)[0m 2024-06-09 14:20:53.243096: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=6758)[0m 2024-06-09 14:20:53.243141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=6758)[0m 2024-06-09 14:20:53.244557: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=6758)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=6758)[0m You should probably TRAIN th


Trial status: 2 PAUSED | 1 RUNNING | 5 PENDING
Current time: 2024-06-09 14:21:05. Total running time: 3min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSE

  4%|▍         | 12/312 [00:09<03:20,  1.49it/s]
  4%|▍         | 13/312 [00:09<03:20,  1.49it/s]
  4%|▍         | 14/312 [00:10<03:19,  1.49it/s]
  5%|▍         | 15/312 [00:11<03:19,  1.49it/s]
  5%|▌         | 16/312 [00:11<03:18,  1.49it/s]
  5%|▌         | 17/312 [00:12<03:18,  1.49it/s]
  6%|▌         | 18/312 [00:13<03:18,  1.48it/s]
  6%|▌         | 19/312 [00:13<03:17,  1.48it/s]
  6%|▋         | 20/312 [00:14<03:16,  1.48it/s]
  7%|▋         | 21/312 [00:15<03:17,  1.47it/s]
  7%|▋         | 22/312 [00:15<03:17,  1.47it/s]
  7%|▋         | 23/312 [00:16<03:17,  1.46it/s]
  8%|▊         | 24/312 [00:17<03:17,  1.46it/s]
  8%|▊         | 25/312 [00:17<03:16,  1.46it/s]
  8%|▊         | 26/312 [00:18<03:16,  1.46it/s]
  9%|▊         | 27/312 [00:19<03:16,  1.45it/s]
  9%|▉         | 28/312 [00:19<03:16,  1.44it/s]
  9%|▉         | 29/312 [00:20<03:25,  1.38it/s]
 10%|▉         | 30/312 [00:21<03:26,  1.37it/s]
 10%|▉         | 31/312 [00:22<03:23,  1.38it/s]
 10%|█         | 32/

Trial status: 2 PAUSED | 1 RUNNING | 5 PENDING
Current time: 2024-06-09 14:21:35. Total running time: 3min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSE

 18%|█▊        | 55/312 [00:39<02:59,  1.43it/s]
 18%|█▊        | 56/312 [00:39<02:58,  1.43it/s]
 18%|█▊        | 57/312 [00:40<02:57,  1.43it/s]
 19%|█▊        | 58/312 [00:41<02:57,  1.43it/s]
 19%|█▉        | 59/312 [00:41<02:55,  1.44it/s]
 19%|█▉        | 60/312 [00:42<02:54,  1.45it/s]
 20%|█▉        | 61/312 [00:43<02:54,  1.44it/s]
 20%|█▉        | 62/312 [00:43<02:53,  1.44it/s]
 20%|██        | 63/312 [00:44<02:52,  1.44it/s]
 21%|██        | 64/312 [00:45<02:52,  1.44it/s]
 21%|██        | 65/312 [00:45<02:50,  1.45it/s]
 21%|██        | 66/312 [00:46<02:50,  1.44it/s]
 21%|██▏       | 67/312 [00:47<02:49,  1.45it/s]
 22%|██▏       | 68/312 [00:48<02:48,  1.44it/s]
 22%|██▏       | 69/312 [00:48<02:48,  1.44it/s]
 22%|██▏       | 70/312 [00:49<02:47,  1.45it/s]
 23%|██▎       | 71/312 [00:50<02:45,  1.45it/s]
 23%|██▎       | 72/312 [00:50<02:44,  1.46it/s]
 23%|██▎       | 73/312 [00:51<02:43,  1.46it/s]
 24%|██▎       | 74/312 [00:52<02:42,  1.46it/s]
 24%|██▍       | 75/

[36m(_objective pid=6758)[0m {'eval_loss': 0.6723822951316833, 'eval_acc': 0.5812274368231047, 'eval_runtime': 1.989, 'eval_samples_per_second': 139.269, 'eval_steps_per_second': 4.525, 'epoch': 1.0}


[36m(_objective pid=6758)[0m 
[36m(_objective pid=6758)[0m                                                 
[36m(_objective pid=6758)[0m                                              [A 25%|██▌       | 78/312 [00:56<02:31,  1.55it/s]
[36m(_objective pid=6758)[0m 100%|██████████| 9/9 [00:01<00:00,  4.98it/s][A
[36m(_objective pid=6758)[0m                                              [A


Trial status: 2 PAUSED | 1 RUNNING | 5 PENDING
Current time: 2024-06-09 14:22:05. Total running time: 4min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSED

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00002 finished iteration 1 at 2024-06-09 14:22:18. Total running time: 4min 12s
+-----------------------------------------------------------+
| Trial _objective_165d9_00002 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                   85.983 |
| time_total_s                                       85.983 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_acc                                          0.58123 |
| eval_loss                                         0.67238 |
| eval_runtime                                        1.989 |
| eval_samples_per_second                           139.269 |
| eval_steps_per_second                               4.525 |
| objective                                         0.58123 |
+---------------------------





Trial _objective_165d9_00003 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00003 config             |
+-------------------------------------------------+
| learning_rate                             4e-05 |
| max_steps                                    -1 |
| num_train_epochs                              2 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.18033 |
+-------------------------------------------------+


[36m(_objective pid=7183)[0m 2024-06-09 14:22:23.282115: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=7183)[0m 2024-06-09 14:22:23.282167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=7183)[0m 2024-06-09 14:22:23.283542: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=7183)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=7183)[0m You should probably TRAIN th


Trial status: 3 PAUSED | 1 RUNNING | 4 PENDING
Current time: 2024-06-09 14:22:36. Total running time: 4min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00003   RUNNING                     2                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUS

  8%|▊         | 12/156 [00:08<01:32,  1.56it/s]
  8%|▊         | 13/156 [00:09<01:31,  1.56it/s]
  9%|▉         | 14/156 [00:10<01:31,  1.56it/s]
 10%|▉         | 15/156 [00:10<01:30,  1.55it/s]
 10%|█         | 16/156 [00:11<01:30,  1.55it/s]
 11%|█         | 17/156 [00:11<01:29,  1.55it/s]
 12%|█▏        | 18/156 [00:12<01:29,  1.55it/s]
 12%|█▏        | 19/156 [00:13<01:28,  1.55it/s]
 13%|█▎        | 20/156 [00:13<01:28,  1.55it/s]
 13%|█▎        | 21/156 [00:14<01:27,  1.54it/s]
 14%|█▍        | 22/156 [00:15<01:27,  1.54it/s]
 15%|█▍        | 23/156 [00:15<01:26,  1.54it/s]
 15%|█▌        | 24/156 [00:16<01:25,  1.54it/s]
 16%|█▌        | 25/156 [00:17<01:25,  1.54it/s]
 17%|█▋        | 26/156 [00:17<01:24,  1.54it/s]
 17%|█▋        | 27/156 [00:18<01:23,  1.54it/s]
 18%|█▊        | 28/156 [00:19<01:23,  1.53it/s]
 19%|█▊        | 29/156 [00:19<01:23,  1.53it/s]
 19%|█▉        | 30/156 [00:20<01:22,  1.53it/s]
 20%|█▉        | 31/156 [00:21<01:21,  1.53it/s]
 21%|██        | 32/

Trial status: 3 PAUSED | 1 RUNNING | 4 PENDING
Current time: 2024-06-09 14:23:06. Total running time: 5min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00003   RUNNING                     2                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSED

 37%|███▋      | 58/156 [00:38<01:04,  1.51it/s]
 38%|███▊      | 59/156 [00:39<01:04,  1.51it/s]
 38%|███▊      | 60/156 [00:40<01:03,  1.51it/s]
 39%|███▉      | 61/156 [00:40<01:03,  1.51it/s]
 40%|███▉      | 62/156 [00:41<01:02,  1.51it/s]
 40%|████      | 63/156 [00:42<01:01,  1.52it/s]
 41%|████      | 64/156 [00:42<01:00,  1.53it/s]
 42%|████▏     | 65/156 [00:43<00:59,  1.53it/s]
 42%|████▏     | 66/156 [00:44<00:59,  1.52it/s]
 43%|████▎     | 67/156 [00:44<00:58,  1.52it/s]
 44%|████▎     | 68/156 [00:45<00:57,  1.52it/s]
 44%|████▍     | 69/156 [00:46<00:57,  1.53it/s]
 45%|████▍     | 70/156 [00:46<00:56,  1.52it/s]
 46%|████▌     | 71/156 [00:47<00:55,  1.52it/s]
 46%|████▌     | 72/156 [00:48<00:55,  1.52it/s]
 47%|████▋     | 73/156 [00:48<00:54,  1.52it/s]
 47%|████▋     | 74/156 [00:49<00:53,  1.52it/s]
 48%|████▊     | 75/156 [00:50<00:53,  1.52it/s]
 49%|████▊     | 76/156 [00:50<00:52,  1.52it/s]
 49%|████▉     | 77/156 [00:51<00:51,  1.52it/s]
 50%|█████     | 78/

[36m(_objective pid=7183)[0m {'eval_loss': 0.6840927004814148, 'eval_acc': 0.5342960288808665, 'eval_runtime': 1.9279, 'eval_samples_per_second': 143.681, 'eval_steps_per_second': 4.668, 'epoch': 1.0}
Trial status: 3 PAUSED | 1 RUNNING | 4 PENDING
Current time: 2024-06-09 14:23:36. Total running time: 5min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00003   RUNNING                     2                                                                                     

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00003 finished iteration 1 at 2024-06-09 14:23:40. Total running time: 5min 34s
+-----------------------------------------------------------+
| Trial _objective_165d9_00003 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                 77.83854 |
| time_total_s                                     77.83854 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_acc                                           0.5343 |
| eval_loss                                         0.68409 |
| eval_runtime                                       1.9279 |
| eval_samples_per_second                           143.681 |
| eval_steps_per_second                               4.668 |
| objective                                          0.5343 |
+---------------------------

[36m(_objective pid=7183)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00003_3_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000000)



Trial _objective_165d9_00004 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00004 config             |
+-------------------------------------------------+
| learning_rate                             4e-05 |
| max_steps                                    -1 |
| num_train_epochs                              3 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.01692 |
+-------------------------------------------------+


[36m(_objective pid=7578)[0m 2024-06-09 14:23:47.158225: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=7578)[0m 2024-06-09 14:23:47.158286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=7578)[0m 2024-06-09 14:23:47.160094: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=7578)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=7578)[0m You should probably TRAIN th


Trial status: 4 PAUSED | 1 RUNNING | 3 PENDING
Current time: 2024-06-09 14:24:06. Total running time: 6min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                     3                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSE

 10%|▉         | 23/234 [00:15<02:14,  1.56it/s]
 10%|█         | 24/234 [00:15<02:14,  1.56it/s]
 11%|█         | 25/234 [00:16<02:13,  1.57it/s]
 11%|█         | 26/234 [00:17<02:12,  1.57it/s]
 12%|█▏        | 27/234 [00:17<02:12,  1.56it/s]
 12%|█▏        | 28/234 [00:18<02:11,  1.56it/s]
 12%|█▏        | 29/234 [00:19<02:12,  1.55it/s]
 13%|█▎        | 30/234 [00:19<02:11,  1.56it/s]
 13%|█▎        | 31/234 [00:20<02:10,  1.55it/s]
 14%|█▎        | 32/234 [00:21<02:10,  1.55it/s]
 14%|█▍        | 33/234 [00:21<02:10,  1.54it/s]
 15%|█▍        | 34/234 [00:22<02:09,  1.55it/s]
 15%|█▍        | 35/234 [00:23<02:09,  1.54it/s]
 15%|█▌        | 36/234 [00:23<02:08,  1.54it/s]
 16%|█▌        | 37/234 [00:24<02:08,  1.54it/s]
 16%|█▌        | 38/234 [00:25<02:07,  1.54it/s]
 17%|█▋        | 39/234 [00:25<02:06,  1.54it/s]
 17%|█▋        | 40/234 [00:26<02:06,  1.53it/s]
 18%|█▊        | 41/234 [00:26<02:05,  1.53it/s]
 18%|█▊        | 42/234 [00:27<02:04,  1.54it/s]
 18%|█▊        | 43/

Trial status: 4 PAUSED | 1 RUNNING | 3 PENDING
Current time: 2024-06-09 14:24:36. Total running time: 6min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                     3                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSE

 29%|██▉       | 69/234 [00:45<01:48,  1.53it/s]
 30%|██▉       | 70/234 [00:45<01:47,  1.52it/s]
 30%|███       | 71/234 [00:46<01:47,  1.52it/s]
 31%|███       | 72/234 [00:47<01:46,  1.53it/s]
 31%|███       | 73/234 [00:47<01:45,  1.53it/s]
 32%|███▏      | 74/234 [00:48<01:44,  1.53it/s]
 32%|███▏      | 75/234 [00:49<01:44,  1.52it/s]
 32%|███▏      | 76/234 [00:49<01:44,  1.52it/s]
 33%|███▎      | 77/234 [00:50<01:43,  1.52it/s]
 33%|███▎      | 78/234 [00:51<01:37,  1.60it/s]
[36m(_objective pid=7578)[0m 
  0%|          | 0/9 [00:00<?, ?it/s][A
[36m(_objective pid=7578)[0m 
 22%|██▏       | 2/9 [00:00<00:00,  8.93it/s][A
[36m(_objective pid=7578)[0m 
 33%|███▎      | 3/9 [00:00<00:00,  6.39it/s][A
[36m(_objective pid=7578)[0m 
 44%|████▍     | 4/9 [00:00<00:00,  5.59it/s][A
[36m(_objective pid=7578)[0m 
 56%|█████▌    | 5/9 [00:00<00:00,  5.19it/s][A
[36m(_objective pid=7578)[0m 
 67%|██████▋   | 6/9 [00:01<00:00,  4.95it/s][A
[36m(_objective pid=7578)[0m 


[36m(_objective pid=7578)[0m {'eval_loss': 0.6398476362228394, 'eval_acc': 0.6678700361010831, 'eval_runtime': 1.9445, 'eval_samples_per_second': 142.451, 'eval_steps_per_second': 4.628, 'epoch': 1.0}
Trial status: 4 PAUSED | 1 RUNNING | 3 PENDING
Current time: 2024-06-09 14:25:06. Total running time: 7min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                     3                                                                                      

[36m(_objective pid=7578)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000000)



Trial _objective_165d9_00005 started with configuration:
+------------------------------------------------+
| Trial _objective_165d9_00005 config            |
+------------------------------------------------+
| learning_rate                            2e-05 |
| max_steps                                   -1 |
| num_train_epochs                             3 |
| per_device_eval_batch_size                  32 |
| per_device_train_batch_size                 32 |
| weight_decay                            0.0637 |
+------------------------------------------------+


[36m(_objective pid=8042)[0m 2024-06-09 14:25:26.747065: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=8042)[0m 2024-06-09 14:25:26.747119: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=8042)[0m 2024-06-09 14:25:26.749103: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=8042)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=8042)[0m You should probably TRAIN th


Trial status: 5 PAUSED | 1 RUNNING | 2 PENDING
Current time: 2024-06-09 14:25:36. Total running time: 7min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUS

  3%|▎         | 7/234 [00:05<02:26,  1.55it/s]
  3%|▎         | 8/234 [00:05<02:23,  1.58it/s]
  4%|▍         | 9/234 [00:06<02:22,  1.58it/s]
  4%|▍         | 10/234 [00:06<02:20,  1.60it/s]
  5%|▍         | 11/234 [00:07<02:19,  1.60it/s]
  5%|▌         | 12/234 [00:08<02:18,  1.60it/s]
  6%|▌         | 13/234 [00:08<02:18,  1.60it/s]
  6%|▌         | 14/234 [00:09<02:17,  1.60it/s]
  6%|▋         | 15/234 [00:09<02:17,  1.60it/s]
  7%|▋         | 16/234 [00:10<02:16,  1.60it/s]
  7%|▋         | 17/234 [00:11<02:15,  1.60it/s]
  8%|▊         | 18/234 [00:11<02:15,  1.60it/s]
  8%|▊         | 19/234 [00:12<02:15,  1.59it/s]
  9%|▊         | 20/234 [00:13<02:14,  1.59it/s]
  9%|▉         | 21/234 [00:13<02:13,  1.60it/s]
  9%|▉         | 22/234 [00:14<02:12,  1.60it/s]
 10%|▉         | 23/234 [00:15<02:12,  1.60it/s]
 10%|█         | 24/234 [00:15<02:11,  1.60it/s]
 11%|█         | 25/234 [00:16<02:11,  1.59it/s]
 11%|█         | 26/234 [00:16<02:09,  1.60it/s]
 12%|█▏        | 27/234

Trial status: 5 PAUSED | 1 RUNNING | 2 PENDING
Current time: 2024-06-09 14:26:06. Total running time: 8min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSED

 23%|██▎       | 54/234 [00:34<01:55,  1.55it/s]
 24%|██▎       | 55/234 [00:35<01:55,  1.55it/s]
 24%|██▍       | 56/234 [00:35<01:54,  1.56it/s]
 24%|██▍       | 57/234 [00:36<01:54,  1.55it/s]
 25%|██▍       | 58/234 [00:37<01:53,  1.55it/s]
 25%|██▌       | 59/234 [00:37<01:52,  1.55it/s]
 26%|██▌       | 60/234 [00:38<01:51,  1.56it/s]
 26%|██▌       | 61/234 [00:39<01:51,  1.55it/s]
 26%|██▋       | 62/234 [00:39<01:50,  1.55it/s]
 27%|██▋       | 63/234 [00:40<01:50,  1.55it/s]
 27%|██▋       | 64/234 [00:41<01:49,  1.55it/s]
 28%|██▊       | 65/234 [00:41<01:49,  1.55it/s]
 28%|██▊       | 66/234 [00:42<01:48,  1.54it/s]
 29%|██▊       | 67/234 [00:43<01:48,  1.54it/s]
 29%|██▉       | 68/234 [00:43<01:47,  1.55it/s]
 29%|██▉       | 69/234 [00:44<01:46,  1.55it/s]
 30%|██▉       | 70/234 [00:45<01:46,  1.55it/s]
 30%|███       | 71/234 [00:45<01:45,  1.55it/s]
 31%|███       | 72/234 [00:46<01:44,  1.55it/s]
 31%|███       | 73/234 [00:46<01:43,  1.55it/s]
 32%|███▏      | 74/

[36m(_objective pid=8042)[0m {'eval_loss': 0.6776441335678101, 'eval_acc': 0.5812274368231047, 'eval_runtime': 1.9146, 'eval_samples_per_second': 144.674, 'eval_steps_per_second': 4.701, 'epoch': 1.0}




Trial status: 5 PAUSED | 1 RUNNING | 2 PENDING
Current time: 2024-06-09 14:26:36. Total running time: 8min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSE



Trial status: 5 PAUSED | 1 RUNNING | 2 PENDING
Current time: 2024-06-09 14:27:06. Total running time: 9min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSED



Trial status: 5 PAUSED | 1 RUNNING | 2 PENDING
Current time: 2024-06-09 14:27:36. Total running time: 9min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSE

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00005 finished iteration 1 at 2024-06-09 14:27:53. Total running time: 9min 48s
+-----------------------------------------------------------+
| Trial _objective_165d9_00005 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                148.07364 |
| time_total_s                                    148.07364 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_acc                                          0.58123 |
| eval_loss                                         0.67764 |
| eval_runtime                                       1.9146 |
| eval_samples_per_second                           144.674 |
| eval_steps_per_second                               4.701 |
| objective                                         0.58123 |
+---------------------------

[36m(_objective pid=8042)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00005_5_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000000)



Trial _objective_165d9_00006 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00006 config             |
+-------------------------------------------------+
| learning_rate                             3e-05 |
| max_steps                                    -1 |
| num_train_epochs                              2 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.18524 |
+-------------------------------------------------+


[36m(_objective pid=8731)[0m 2024-06-09 14:28:01.107924: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=8731)[0m 2024-06-09 14:28:01.107978: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=8731)[0m 2024-06-09 14:28:01.109440: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=8731)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=8731)[0m You should probably TRAIN th


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:28:06. Total running time: 10min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                     2                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUS

  1%|▏         | 2/156 [00:01<02:15,  1.14it/s]
  2%|▏         | 3/156 [00:02<01:55,  1.32it/s]
  3%|▎         | 4/156 [00:03<01:45,  1.45it/s]
  3%|▎         | 5/156 [00:03<01:39,  1.51it/s]
  4%|▍         | 6/156 [00:04<01:36,  1.55it/s]
  4%|▍         | 7/156 [00:04<01:34,  1.57it/s]
  5%|▌         | 8/156 [00:05<01:32,  1.60it/s]
  6%|▌         | 9/156 [00:06<01:31,  1.60it/s]
  6%|▋         | 10/156 [00:06<01:30,  1.61it/s]
  7%|▋         | 11/156 [00:07<01:29,  1.62it/s]
  8%|▊         | 12/156 [00:08<01:29,  1.62it/s]
  8%|▊         | 13/156 [00:08<01:28,  1.62it/s]
  9%|▉         | 14/156 [00:09<01:27,  1.62it/s]
 10%|▉         | 15/156 [00:09<01:26,  1.62it/s]
 10%|█         | 16/156 [00:10<01:26,  1.63it/s]
 11%|█         | 17/156 [00:11<01:25,  1.63it/s]
 12%|█▏        | 18/156 [00:11<01:24,  1.63it/s]
 12%|█▏        | 19/156 [00:12<01:23,  1.63it/s]
 13%|█▎        | 20/156 [00:12<01:23,  1.63it/s]
 13%|█▎        | 21/156 [00:13<01:22,  1.64it/s]
 14%|█▍        | 22/156 [00:

Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:28:36. Total running time: 10min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                     2                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUS

 33%|███▎      | 51/156 [00:32<01:05,  1.61it/s]
 33%|███▎      | 52/156 [00:32<01:04,  1.61it/s]
 34%|███▍      | 53/156 [00:33<01:04,  1.61it/s]
 35%|███▍      | 54/156 [00:33<01:03,  1.60it/s]
 35%|███▌      | 55/156 [00:34<01:02,  1.61it/s]
 36%|███▌      | 56/156 [00:35<01:02,  1.60it/s]
 37%|███▋      | 57/156 [00:35<01:02,  1.60it/s]
 37%|███▋      | 58/156 [00:36<01:01,  1.59it/s]
 38%|███▊      | 59/156 [00:37<01:00,  1.60it/s]
 38%|███▊      | 60/156 [00:37<01:00,  1.60it/s]
 39%|███▉      | 61/156 [00:38<00:59,  1.59it/s]
 40%|███▉      | 62/156 [00:38<00:59,  1.59it/s]
 40%|████      | 63/156 [00:39<00:58,  1.59it/s]
 41%|████      | 64/156 [00:40<00:58,  1.58it/s]
 42%|████▏     | 65/156 [00:40<00:57,  1.58it/s]
 42%|████▏     | 66/156 [00:41<00:56,  1.59it/s]
 43%|████▎     | 67/156 [00:42<00:56,  1.59it/s]
 44%|████▎     | 68/156 [00:42<00:55,  1.59it/s]
 44%|████▍     | 69/156 [00:43<00:54,  1.59it/s]
 45%|████▍     | 70/156 [00:43<00:54,  1.59it/s]
 46%|████▌     | 71/

[36m(_objective pid=8731)[0m {'eval_loss': 0.6703218221664429, 'eval_acc': 0.5812274368231047, 'eval_runtime': 1.8513, 'eval_samples_per_second': 149.628, 'eval_steps_per_second': 4.862, 'epoch': 1.0}


[36m(_objective pid=8731)[0m 
[36m(_objective pid=8731)[0m                                                 
[36m(_objective pid=8731)[0m                                              [A 50%|█████     | 78/156 [00:50<00:46,  1.67it/s]
[36m(_objective pid=8731)[0m 100%|██████████| 9/9 [00:01<00:00,  5.36it/s][A
[36m(_objective pid=8731)[0m                                              [A


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:29:06. Total running time: 11min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                     2                                                                                                  |
| _objective_165d9_00000   PAUSED                      4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00001   PAUSE

[36m(_objective pid=8731)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00006_6_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000000)



Trial _objective_165d9_00007 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00007 config             |
+-------------------------------------------------+
| learning_rate                             2e-05 |
| max_steps                                    -1 |
| num_train_epochs                              2 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.12958 |
+-------------------------------------------------+

Trial status: 1 PENDING | 6 PAUSED | 1 RUNNING
Current time: 2024-06-09 14:29:36. Total running time: 11min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total 

[36m(_objective pid=9185)[0m 2024-06-09 14:29:37.673711: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=9185)[0m 2024-06-09 14:29:37.673782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=9185)[0m 2024-06-09 14:29:37.675574: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=9185)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=9185)[0m You should probably TRAIN th

Trial status: 1 PENDING | 6 PAUSED | 1 RUNNING
Current time: 2024-06-09 14:30:06. Total running time: 12min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00007   RUNNING                     2                                                                                                  |
| _objective_165d9_00001   PAUSED                      5        1            72.8586      0.689948     0.505415           2.0134                  137.579 |
| _objective_165d9_00002   PAUSE

 23%|██▎       | 36/156 [00:23<01:19,  1.51it/s]
 24%|██▎       | 37/156 [00:24<01:19,  1.50it/s]
 24%|██▍       | 38/156 [00:25<01:18,  1.50it/s]
 25%|██▌       | 39/156 [00:25<01:18,  1.50it/s]
 26%|██▌       | 40/156 [00:26<01:17,  1.49it/s]
 26%|██▋       | 41/156 [00:27<01:17,  1.48it/s]
 27%|██▋       | 42/156 [00:28<01:16,  1.48it/s]
 28%|██▊       | 43/156 [00:28<01:16,  1.48it/s]
 28%|██▊       | 44/156 [00:29<01:15,  1.48it/s]
 29%|██▉       | 45/156 [00:30<01:15,  1.47it/s]
 29%|██▉       | 46/156 [00:30<01:15,  1.46it/s]
 30%|███       | 47/156 [00:31<01:14,  1.46it/s]
 31%|███       | 48/156 [00:32<01:14,  1.46it/s]
 31%|███▏      | 49/156 [00:32<01:13,  1.46it/s]
 32%|███▏      | 50/156 [00:33<01:12,  1.46it/s]
 33%|███▎      | 51/156 [00:34<01:11,  1.47it/s]
 33%|███▎      | 52/156 [00:34<01:11,  1.46it/s]
 34%|███▍      | 53/156 [00:35<01:10,  1.45it/s]
 35%|███▍      | 54/156 [00:36<01:10,  1.45it/s]
 35%|███▌      | 55/156 [00:36<01:09,  1.45it/s]
 36%|███▌      | 56/

Trial status: 1 PENDING | 6 PAUSED | 1 RUNNING
Current time: 2024-06-09 14:30:36. Total running time: 12min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00007   RUNNING                     2                                                                                                  |
| _objective_165d9_00001   PAUSED                      5        1            72.8586      0.689948     0.505415           2.0134                  137.579 |
| _objective_165d9_00002   PAUS

[36m(_objective pid=9185)[0m 
 44%|████▍     | 4/9 [00:00<00:00,  5.06it/s][A
[36m(_objective pid=9185)[0m 
 56%|█████▌    | 5/9 [00:00<00:00,  4.69it/s][A
[36m(_objective pid=9185)[0m 
 67%|██████▋   | 6/9 [00:01<00:00,  4.48it/s][A
[36m(_objective pid=9185)[0m 
 78%|███████▊  | 7/9 [00:01<00:00,  4.35it/s][A
[36m(_objective pid=9185)[0m 
 89%|████████▉ | 8/9 [00:01<00:00,  4.29it/s][A


[36m(_objective pid=9185)[0m {'eval_loss': 0.6817957758903503, 'eval_acc': 0.5090252707581228, 'eval_runtime': 2.1442, 'eval_samples_per_second': 129.185, 'eval_steps_per_second': 4.197, 'epoch': 1.0}


[36m(_objective pid=9185)[0m 
[36m(_objective pid=9185)[0m                                                 
[36m(_objective pid=9185)[0m                                              [A 50%|█████     | 78/156 [00:55<00:53,  1.46it/s]
[36m(_objective pid=9185)[0m 100%|██████████| 9/9 [00:01<00:00,  4.64it/s][A
[36m(_objective pid=9185)[0m                                              [A


Trial status: 1 PENDING | 6 PAUSED | 1 RUNNING
Current time: 2024-06-09 14:31:10. Total running time: 13min 4s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00007   RUNNING                     2                                                                                                  |
| _objective_165d9_00001   PAUSED                      5        1            72.8586      0.689948     0.505415           2.0134                  137.579 |
| _objective_165d9_00002   PAUSE

2024-06-09 14:31:15,847	INFO pbt.py:878 -- 

[PopulationBasedTraining] [Exploit] Cloning trial 165d9_00006 (score = 0.581227) into trial 165d9_00007 (score = 0.509025)

2024-06-09 14:31:15,849	INFO pbt.py:905 -- 

[PopulationBasedTraining] [Explore] Perturbed the hyperparameter config of trial165d9_00007:
per_device_train_batch_size : 32 --- (shift right) --> 64
weight_decay : 0.18524445288831495 --- (* 0.8) --> 0.14819556231065198
learning_rate : 3.446612641953124e-05 --- (resample) --> 3.4474115788895184e-05

[36m(_objective pid=9185)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00007_7_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000000)



Trial _objective_165d9_00007 finished iteration 1 at 2024-06-09 14:31:15. Total running time: 13min 10s
+-----------------------------------------------------------+
| Trial _objective_165d9_00007 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000000 |
| time_this_iter_s                                 99.41303 |
| time_total_s                                     99.41303 |
| training_iteration                                      1 |
| epoch                                                  1. |
| eval_acc                                          0.50903 |
| eval_loss                                          0.6818 |
| eval_runtime                                       2.1442 |
| eval_samples_per_second                           129.185 |
| eval_steps_per_second                               4.197 |
| objective                                         0.50903 |
+--------------------------

[36m(_objective pid=9185)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00007_7_num_train_epochs=2_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
[36m(_objective pid=9185)[0m  50%|█████     | 78/156 [01:33<01:33,  1.20s/it]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00007_7_num_train_epochs=2_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r 50%|█████     | 78/156 [01:33<01:33,  1.20s/it]',) kwargs: {}
[36m(_objective pid=9185)[0m ValueError when calling 'flush' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_0


Trial _objective_165d9_00000 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00000 config             |
+-------------------------------------------------+
| learning_rate                             2e-05 |
| max_steps                                    -1 |
| num_train_epochs                              4 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.23896 |
+-------------------------------------------------+


[36m(_objective pid=9664)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000000)
[36m(_objective pid=9664)[0m 2024-06-09 14:31:23.754497: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=9664)[0m 2024-06-09 14:31:23.754556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=9664)[0m 2024-06-09 14:31:23.756376: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=9664)[0m Some weights 


Trial status: 1 RUNNING | 1 PENDING | 6 PAUSED
Current time: 2024-06-09 14:31:40. Total running time: 13min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00002   PAUSED                      4        1            85.983       0.672382     0.581227           1.989                   139.269 |
| _objective_165d9_00003   PAU

 28%|██▊       | 87/312 [00:07<00:26,  8.62it/s]
 29%|██▊       | 89/312 [00:08<00:33,  6.66it/s]
 29%|██▉       | 90/312 [00:09<00:38,  5.81it/s]
 29%|██▉       | 91/312 [00:09<00:44,  4.99it/s]
 29%|██▉       | 92/312 [00:10<00:52,  4.23it/s]
 30%|██▉       | 93/312 [00:11<01:01,  3.56it/s]
 30%|███       | 94/312 [00:11<01:12,  3.03it/s]
 30%|███       | 95/312 [00:12<01:23,  2.61it/s]
 31%|███       | 96/312 [00:13<01:34,  2.28it/s]
 31%|███       | 97/312 [00:14<01:45,  2.04it/s]
 31%|███▏      | 98/312 [00:14<01:55,  1.86it/s]
 32%|███▏      | 99/312 [00:15<02:03,  1.73it/s]
 32%|███▏      | 100/312 [00:16<02:10,  1.62it/s]
 32%|███▏      | 101/312 [00:16<02:15,  1.56it/s]
 33%|███▎      | 102/312 [00:17<02:19,  1.51it/s]
 33%|███▎      | 103/312 [00:18<02:22,  1.47it/s]
 33%|███▎      | 104/312 [00:19<02:24,  1.44it/s]
 34%|███▎      | 105/312 [00:19<02:25,  1.42it/s]
 34%|███▍      | 106/312 [00:20<02:26,  1.40it/s]
 34%|███▍      | 107/312 [00:21<02:27,  1.39it/s]
 35%|███▍   

Trial status: 1 RUNNING | 1 PENDING | 6 PAUSED
Current time: 2024-06-09 14:32:10. Total running time: 14min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00002   PAUSED                      4        1            85.983       0.672382     0.581227           1.989                   139.269 |
| _objective_165d9_00003   PAUSE

 41%|████      | 128/312 [00:36<02:12,  1.39it/s]
 41%|████▏     | 129/312 [00:37<02:11,  1.39it/s]
 42%|████▏     | 130/312 [00:38<02:10,  1.39it/s]
 42%|████▏     | 131/312 [00:38<02:09,  1.40it/s]
 42%|████▏     | 132/312 [00:39<02:08,  1.40it/s]
 43%|████▎     | 133/312 [00:40<02:07,  1.41it/s]
 43%|████▎     | 134/312 [00:40<02:05,  1.42it/s]
 43%|████▎     | 135/312 [00:41<02:04,  1.42it/s]
 44%|████▎     | 136/312 [00:42<02:04,  1.42it/s]
 44%|████▍     | 137/312 [00:42<02:02,  1.42it/s]
 44%|████▍     | 138/312 [00:43<02:02,  1.42it/s]
 45%|████▍     | 139/312 [00:44<02:01,  1.42it/s]
 45%|████▍     | 140/312 [00:45<02:00,  1.43it/s]
 45%|████▌     | 141/312 [00:45<01:59,  1.43it/s]
 46%|████▌     | 142/312 [00:46<01:59,  1.42it/s]
 46%|████▌     | 143/312 [00:47<01:58,  1.43it/s]
 46%|████▌     | 144/312 [00:47<01:56,  1.44it/s]
 46%|████▋     | 145/312 [00:48<01:56,  1.44it/s]
 47%|████▋     | 146/312 [00:49<01:55,  1.44it/s]
 47%|████▋     | 147/312 [00:49<01:54,  1.44it/s]


[36m(_objective pid=9664)[0m {'eval_loss': 0.6589797139167786, 'eval_acc': 0.6137184115523465, 'eval_runtime': 2.0216, 'eval_samples_per_second': 137.022, 'eval_steps_per_second': 4.452, 'epoch': 2.0}


[36m(_objective pid=9664)[0m 
[36m(_objective pid=9664)[0m                                                  
[36m(_objective pid=9664)[0m                                              [A 50%|█████     | 156/312 [00:58<01:41,  1.54it/s]
[36m(_objective pid=9664)[0m 100%|██████████| 9/9 [00:01<00:00,  4.93it/s][A
[36m(_objective pid=9664)[0m                                              [A


Trial status: 1 RUNNING | 1 PENDING | 6 PAUSED
Current time: 2024-06-09 14:32:40. Total running time: 14min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4        1            70.9671      0.691601     0.548736           1.9856                  139.501 |
| _objective_165d9_00002   PAUSED                      4        1            85.983       0.672382     0.581227           1.989                   139.269 |
| _objective_165d9_00003   PAUS

[36m(_objective pid=9664)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=9664)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
 50%|█████     | 156/312 [01:34<01:34,  1.66it/s]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r 50%|█████     | 156/312 [01:34<01:34,  1.66it/s]',) kwargs: {}
[36m(_objective pid=9664)[0m Value


Trial status: 7 PAUSED | 1 PENDING
Current time: 2024-06-09 14:33:10. Total running time: 15min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00002   PAUSED                      4        1            85.983       0.672382     0.581227           1.989                   139.269 |
| _objective_165d9_00003   PAUSED          

[36m(_objective pid=10173)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000000)
[36m(_objective pid=10173)[0m 2024-06-09 14:33:14.297911: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=10173)[0m 2024-06-09 14:33:14.297959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=10173)[0m 2024-06-09 14:33:14.299378: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=10173)[0m Some wei


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:33:40. Total running time: 15min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5        1            72.8586      0.689948     0.505415           2.0134                  137.579 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00003   PAU

 26%|██▌       | 100/390 [00:15<02:44,  1.76it/s]
 26%|██▌       | 101/390 [00:16<02:52,  1.68it/s]
 26%|██▌       | 102/390 [00:16<02:59,  1.60it/s]
 26%|██▋       | 103/390 [00:17<03:04,  1.55it/s]
 27%|██▋       | 104/390 [00:18<03:08,  1.52it/s]
 27%|██▋       | 105/390 [00:19<03:12,  1.48it/s]
 27%|██▋       | 106/390 [00:19<03:14,  1.46it/s]
 27%|██▋       | 107/390 [00:20<03:15,  1.44it/s]
 28%|██▊       | 108/390 [00:21<03:17,  1.43it/s]
 28%|██▊       | 109/390 [00:21<03:17,  1.43it/s]
 28%|██▊       | 110/390 [00:22<03:17,  1.42it/s]
 28%|██▊       | 111/390 [00:23<03:17,  1.41it/s]
 29%|██▊       | 112/390 [00:24<03:17,  1.41it/s]
 29%|██▉       | 113/390 [00:24<03:16,  1.41it/s]
 29%|██▉       | 114/390 [00:25<03:17,  1.40it/s]
 29%|██▉       | 115/390 [00:26<03:16,  1.40it/s]
 30%|██▉       | 116/390 [00:26<03:17,  1.39it/s]
 30%|███       | 117/390 [00:27<03:16,  1.39it/s]
 30%|███       | 118/390 [00:28<03:15,  1.39it/s]
 31%|███       | 119/390 [00:29<03:16,  1.38it/s]


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:34:10. Total running time: 16min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5        1            72.8586      0.689948     0.505415           2.0134                  137.579 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00003   PAUSE

 36%|███▋      | 142/390 [00:45<02:56,  1.40it/s]
 37%|███▋      | 143/390 [00:46<02:55,  1.41it/s]
 37%|███▋      | 144/390 [00:47<02:54,  1.41it/s]
 37%|███▋      | 145/390 [00:47<02:53,  1.41it/s]
 37%|███▋      | 146/390 [00:48<02:52,  1.41it/s]
 38%|███▊      | 147/390 [00:49<02:51,  1.41it/s]
 38%|███▊      | 148/390 [00:50<02:51,  1.41it/s]
 38%|███▊      | 149/390 [00:50<02:49,  1.42it/s]
 38%|███▊      | 150/390 [00:51<02:48,  1.43it/s]
 39%|███▊      | 151/390 [00:52<02:47,  1.42it/s]
 39%|███▉      | 152/390 [00:52<02:46,  1.43it/s]
 39%|███▉      | 153/390 [00:53<02:45,  1.43it/s]
 39%|███▉      | 154/390 [00:54<02:45,  1.43it/s]
 40%|███▉      | 155/390 [00:54<02:44,  1.43it/s]
 40%|████      | 156/390 [00:55<02:34,  1.51it/s]
[36m(_objective pid=10173)[0m 
  0%|          | 0/9 [00:00<?, ?it/s][A
[36m(_objective pid=10173)[0m 
 22%|██▏       | 2/9 [00:00<00:00,  8.66it/s][A
[36m(_objective pid=10173)[0m 
 33%|███▎      | 3/9 [00:00<00:00,  6.03it/s][A
[36m(_objec

[36m(_objective pid=10173)[0m {'eval_loss': 0.648919939994812, 'eval_acc': 0.631768953068592, 'eval_runtime': 2.0251, 'eval_samples_per_second': 136.787, 'eval_steps_per_second': 4.444, 'epoch': 2.0}
Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:34:40. Total running time: 16min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5        1            72.8586      0.689948     0.505415           2.0134             

[36m(_objective pid=10173)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=10173)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
 40%|████      | 156/390 [01:34<02:21,  1.66it/s]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r 40%|████      | 156/390 [01:34<02:21,  1.66it/s]',) kwargs: {}
[36m(_objective pid=10173)[0m Va


Trial _objective_165d9_00002 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00002 config             |
+-------------------------------------------------+
| learning_rate                             3e-05 |
| max_steps                                    -1 |
| num_train_epochs                              4 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.02999 |
+-------------------------------------------------+


[36m(_objective pid=10682)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000000)
[36m(_objective pid=10682)[0m 2024-06-09 14:35:06.601024: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=10682)[0m 2024-06-09 14:35:06.601076: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=10682)[0m 2024-06-09 14:35:06.602464: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=10682)[0m Some wei


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:35:10. Total running time: 17min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4        1            85.983       0.672382     0.581227           1.989                   139.269 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUS

  0%|          | 0/312 [00:00<?, ?it/s]
 25%|██▌       | 79/312 [00:01<00:05, 46.58it/s]
 27%|██▋       | 84/312 [00:04<00:16, 13.43it/s]
 28%|██▊       | 87/312 [00:07<00:25,  8.78it/s]
 29%|██▊       | 89/312 [00:08<00:32,  6.80it/s]
 29%|██▉       | 90/312 [00:09<00:37,  5.93it/s]
 29%|██▉       | 91/312 [00:09<00:43,  5.09it/s]
 29%|██▉       | 92/312 [00:10<00:51,  4.31it/s]
 30%|██▉       | 93/312 [00:11<01:00,  3.65it/s]
 30%|███       | 94/312 [00:11<01:10,  3.10it/s]
 30%|███       | 95/312 [00:12<01:21,  2.66it/s]
 31%|███       | 96/312 [00:13<01:32,  2.33it/s]
 31%|███       | 97/312 [00:13<01:43,  2.08it/s]
 31%|███▏      | 98/312 [00:14<01:53,  1.89it/s]
 32%|███▏      | 99/312 [00:15<02:00,  1.76it/s]
 32%|███▏      | 100/312 [00:15<02:06,  1.67it/s]
 32%|███▏      | 101/312 [00:16<02:11,  1.61it/s]
 33%|███▎      | 102/312 [00:17<02:15,  1.55it/s]
 33%|███▎      | 103/312 [00:17<02:17,  1.52it/s]
 33%|███▎      | 104/312 [00:18<02:20,  1.48it/s]
 34%|███▎      | 105/312

Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:35:40. Total running time: 17min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4        1            85.983       0.672382     0.581227           1.989                   139.269 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUS

 36%|███▌      | 112/312 [00:24<02:21,  1.42it/s]
 36%|███▌      | 113/312 [00:25<02:20,  1.41it/s]
 37%|███▋      | 114/312 [00:25<02:20,  1.41it/s]
 37%|███▋      | 115/312 [00:26<02:20,  1.40it/s]
 37%|███▋      | 116/312 [00:27<02:20,  1.39it/s]
 38%|███▊      | 117/312 [00:27<02:20,  1.39it/s]
 38%|███▊      | 118/312 [00:28<02:19,  1.39it/s]
 38%|███▊      | 119/312 [00:29<02:19,  1.38it/s]
 38%|███▊      | 120/312 [00:30<02:20,  1.37it/s]
 39%|███▉      | 121/312 [00:30<02:19,  1.37it/s]
 39%|███▉      | 122/312 [00:31<02:18,  1.37it/s]
 39%|███▉      | 123/312 [00:32<02:18,  1.37it/s]
 40%|███▉      | 124/312 [00:33<02:16,  1.38it/s]
 40%|████      | 125/312 [00:33<02:16,  1.37it/s]
 40%|████      | 126/312 [00:34<02:15,  1.37it/s]
 41%|████      | 127/312 [00:35<02:16,  1.36it/s]
 41%|████      | 128/312 [00:36<02:15,  1.36it/s]
 41%|████▏     | 129/312 [00:36<02:14,  1.36it/s]
 42%|████▏     | 130/312 [00:37<02:13,  1.36it/s]
 42%|████▏     | 131/312 [00:38<02:12,  1.37it/s]


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:36:10. Total running time: 18min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4        1            85.983       0.672382     0.581227           1.989                   139.269 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUSE

 49%|████▉     | 154/312 [00:54<01:50,  1.43it/s]
 50%|████▉     | 155/312 [00:55<01:49,  1.44it/s]
 50%|█████     | 156/312 [00:55<01:42,  1.52it/s]
[36m(_objective pid=10682)[0m 
  0%|          | 0/9 [00:00<?, ?it/s][A
[36m(_objective pid=10682)[0m 
 22%|██▏       | 2/9 [00:00<00:00,  8.71it/s][A
[36m(_objective pid=10682)[0m 
 33%|███▎      | 3/9 [00:00<00:00,  6.11it/s][A
[36m(_objective pid=10682)[0m 
 44%|████▍     | 4/9 [00:00<00:00,  5.25it/s][A
[36m(_objective pid=10682)[0m 
 56%|█████▌    | 5/9 [00:00<00:00,  4.94it/s][A
[36m(_objective pid=10682)[0m 
 67%|██████▋   | 6/9 [00:01<00:00,  4.69it/s][A
[36m(_objective pid=10682)[0m 
 78%|███████▊  | 7/9 [00:01<00:00,  4.56it/s][A
[36m(_objective pid=10682)[0m 
 89%|████████▉ | 8/9 [00:01<00:00,  4.48it/s][A


[36m(_objective pid=10682)[0m {'eval_loss': 0.6292964220046997, 'eval_acc': 0.6606498194945848, 'eval_runtime': 2.0424, 'eval_samples_per_second': 135.626, 'eval_steps_per_second': 4.407, 'epoch': 2.0}


[36m(_objective pid=10682)[0m 
[36m(_objective pid=10682)[0m                                                  
[36m(_objective pid=10682)[0m                                              [A 50%|█████     | 156/312 [00:57<01:42,  1.52it/s]
[36m(_objective pid=10682)[0m 100%|██████████| 9/9 [00:01<00:00,  4.87it/s][A
[36m(_objective pid=10682)[0m                                              [A
[36m(_objective pid=10682)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000001)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00002 finished iteration 2 at 2024-06-09 14:36:40. Total running time: 18min 35s
+-----------------------------------------------------------+
| Trial _objective_165d9_00002 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000001 |
| time_this_iter_s                                 94.90498 |
| time_total_s                                    180.88798 |
| training_iteration                                      2 |
| epoch                                                  2. |
| eval_acc                                          0.66065 |
| eval_loss                                          0.6293 |
| eval_runtime                                       2.0424 |
| eval_samples_per_second                           135.626 |
| eval_steps_per_second                               4.407 |
| objective                                         0.66065 |
+--------------------------

[36m(_objective pid=10682)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
[36m(_objective pid=10682)[0m  50%|█████     | 156/312 [01:24<01:24,  1.85it/s]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r 50%|█████     | 156/312 [01:24<01:24,  1.85it/s]',) kwargs: {}
[36m(_objective pid=10682)[0m ValueError when calling 'flush' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_16


Trial status: 7 PAUSED | 1 PENDING
Current time: 2024-06-09 14:36:40. Total running time: 18min 35s
Logical resource usage: 0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUSED                      5        2           178.333       0.64892      0.631769           2.0251                  136.787 |
| _objective_165d9_00002   PAUSED             

[36m(_objective pid=10682)[0m Exception ignored in atexit callback: <function shutdown at 0x79b862348a60>
[36m(_objective pid=10682)[0m Traceback (most recent call last):
[36m(_objective pid=10682)[0m   File "/usr/lib/python3.10/logging/__init__.py", line 2183, in shutdown
[36m(_objective pid=10682)[0m     h.close()
[36m(_objective pid=10682)[0m   File "/usr/local/lib/python3.10/dist-packages/absl/logging/__init__.py", line 944, in close
[36m(_objective pid=10682)[0m     self.stream.close()
[36m(_objective pid=10682)[0m AttributeError: 'Tee' object has no attribute 'close'



Trial _objective_165d9_00003 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00003 config             |
+-------------------------------------------------+
| learning_rate                             4e-05 |
| max_steps                                    -1 |
| num_train_epochs                              2 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.18033 |
+-------------------------------------------------+


[36m(_objective pid=11158)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00003_3_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000000)
[36m(_objective pid=11158)[0m 2024-06-09 14:36:49.021829: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=11158)[0m 2024-06-09 14:36:49.021886: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=11158)[0m 2024-06-09 14:36:49.023178: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=11158)[0m Some wei


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:37:10. Total running time: 19min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00003   RUNNING                     2        1            77.8385      0.684093     0.534296           1.9279                  143.681 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUS

 61%|██████    | 95/156 [00:12<00:25,  2.36it/s]
 62%|██████▏   | 96/156 [00:13<00:28,  2.11it/s]
 62%|██████▏   | 97/156 [00:13<00:30,  1.92it/s]
 63%|██████▎   | 98/156 [00:14<00:32,  1.78it/s]
 63%|██████▎   | 99/156 [00:15<00:33,  1.68it/s]
 64%|██████▍   | 100/156 [00:15<00:34,  1.60it/s]
 65%|██████▍   | 101/156 [00:16<00:35,  1.56it/s]
 65%|██████▌   | 102/156 [00:17<00:35,  1.52it/s]
 66%|██████▌   | 103/156 [00:17<00:35,  1.49it/s]
 67%|██████▋   | 104/156 [00:18<00:35,  1.47it/s]
 67%|██████▋   | 105/156 [00:19<00:35,  1.46it/s]
 68%|██████▊   | 106/156 [00:20<00:34,  1.45it/s]
 69%|██████▊   | 107/156 [00:20<00:34,  1.43it/s]
 69%|██████▉   | 108/156 [00:21<00:33,  1.43it/s]
 70%|██████▉   | 109/156 [00:22<00:32,  1.42it/s]
 71%|███████   | 110/156 [00:22<00:32,  1.42it/s]
 71%|███████   | 111/156 [00:23<00:31,  1.42it/s]
 72%|███████▏  | 112/156 [00:24<00:31,  1.42it/s]
 72%|███████▏  | 113/156 [00:25<00:30,  1.41it/s]
 73%|███████▎  | 114/156 [00:25<00:29,  1.41it/s]
 74%|

Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:37:41. Total running time: 19min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00003   RUNNING                     2        1            77.8385      0.684093     0.534296           1.9279                  143.681 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUS

 88%|████████▊ | 137/156 [00:42<00:13,  1.38it/s]
 88%|████████▊ | 138/156 [00:43<00:13,  1.38it/s]
 89%|████████▉ | 139/156 [00:43<00:12,  1.39it/s]
 90%|████████▉ | 140/156 [00:44<00:11,  1.39it/s]
 90%|█████████ | 141/156 [00:45<00:10,  1.39it/s]
 91%|█████████ | 142/156 [00:46<00:10,  1.39it/s]
 92%|█████████▏| 143/156 [00:46<00:09,  1.39it/s]
 92%|█████████▏| 144/156 [00:47<00:08,  1.40it/s]
 93%|█████████▎| 145/156 [00:48<00:07,  1.40it/s]
 94%|█████████▎| 146/156 [00:48<00:07,  1.40it/s]
 94%|█████████▍| 147/156 [00:49<00:06,  1.41it/s]
 95%|█████████▍| 148/156 [00:50<00:05,  1.41it/s]
 96%|█████████▌| 149/156 [00:51<00:04,  1.42it/s]
 96%|█████████▌| 150/156 [00:51<00:04,  1.43it/s]
 97%|█████████▋| 151/156 [00:52<00:03,  1.43it/s]
 97%|█████████▋| 152/156 [00:53<00:02,  1.43it/s]
 98%|█████████▊| 153/156 [00:53<00:02,  1.43it/s]
 99%|█████████▊| 154/156 [00:54<00:01,  1.43it/s]
 99%|█████████▉| 155/156 [00:55<00:00,  1.44it/s]
100%|██████████| 156/156 [00:55<00:00,  1.52it/s]


[36m(_objective pid=11158)[0m {'eval_loss': 0.6392884254455566, 'eval_acc': 0.6245487364620939, 'eval_runtime': 2.0475, 'eval_samples_per_second': 135.29, 'eval_steps_per_second': 4.396, 'epoch': 2.0}


[36m(_objective pid=11158)[0m 
[36m(_objective pid=11158)[0m                                                  
[36m(_objective pid=11158)[0m                                              [A100%|██████████| 156/156 [00:57<00:00,  1.52it/s]
[36m(_objective pid=11158)[0m 100%|██████████| 9/9 [00:01<00:00,  4.86it/s][A
[36m(_objective pid=11158)[0m                                              [A


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:38:11. Total running time: 20min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00003   RUNNING                     2        1            77.8385      0.684093     0.534296           1.9279                  143.681 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUSE

[36m(_objective pid=11158)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00003_3_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=11158)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00003_3_num_train_epochs=2_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
100%|██████████| 156/156 [01:30<00:00,  1.72it/s]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00003_3_num_train_epochs=2_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r100%|██████████| 156/156 [01:30<00:00,  1.72it/s]',) kwargs: {}
[36m(_objective pid=11158)[0m Va


Trial _objective_165d9_00004 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00004 config             |
+-------------------------------------------------+
| learning_rate                             4e-05 |
| max_steps                                    -1 |
| num_train_epochs                              3 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.01692 |
+-------------------------------------------------+


[36m(_objective pid=11664)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000000)
[36m(_objective pid=11664)[0m 2024-06-09 14:38:36.214962: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=11664)[0m 2024-06-09 14:38:36.215028: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=11664)[0m 2024-06-09 14:38:36.216834: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=11664)[0m Some wei


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:38:41. Total running time: 20min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                     3        1            95.5237      0.639848     0.66787            1.9445                  142.451 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAU

  0%|          | 0/234 [00:00<?, ?it/s]
 34%|███▍      | 79/234 [00:01<00:02, 58.11it/s]
 36%|███▋      | 85/234 [00:05<00:11, 12.52it/s]
 38%|███▊      | 88/234 [00:07<00:17,  8.44it/s]
 38%|███▊      | 90/234 [00:08<00:21,  6.64it/s]
 39%|███▉      | 91/234 [00:09<00:24,  5.83it/s]
 39%|███▉      | 92/234 [00:10<00:28,  5.01it/s]
 40%|███▉      | 93/234 [00:10<00:33,  4.27it/s]
 40%|████      | 94/234 [00:11<00:38,  3.62it/s]
 41%|████      | 95/234 [00:12<00:44,  3.09it/s]
 41%|████      | 96/234 [00:12<00:51,  2.66it/s]
 41%|████▏     | 97/234 [00:13<00:58,  2.33it/s]
 42%|████▏     | 98/234 [00:14<01:05,  2.09it/s]
 42%|████▏     | 99/234 [00:14<01:10,  1.90it/s]
 43%|████▎     | 100/234 [00:15<01:16,  1.76it/s]
 43%|████▎     | 101/234 [00:16<01:19,  1.67it/s]
 44%|████▎     | 102/234 [00:16<01:22,  1.61it/s]
 44%|████▍     | 103/234 [00:17<01:24,  1.56it/s]
 44%|████▍     | 104/234 [00:18<01:25,  1.51it/s]
 45%|████▍     | 105/234 [00:19<01:26,  1.48it/s]
 45%|████▌     | 106/23

Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:39:11. Total running time: 21min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                     3        1            95.5237      0.639848     0.66787            1.9445                  142.451 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUSE

 48%|████▊     | 113/234 [00:24<01:25,  1.41it/s]
 49%|████▊     | 114/234 [00:25<01:25,  1.41it/s]
 49%|████▉     | 115/234 [00:26<01:25,  1.40it/s]
 50%|████▉     | 116/234 [00:26<01:24,  1.39it/s]
 50%|█████     | 117/234 [00:27<01:24,  1.39it/s]
 50%|█████     | 118/234 [00:28<01:23,  1.39it/s]
 51%|█████     | 119/234 [00:29<01:23,  1.38it/s]
 51%|█████▏    | 120/234 [00:29<01:22,  1.38it/s]
 52%|█████▏    | 121/234 [00:30<01:21,  1.38it/s]
 52%|█████▏    | 122/234 [00:31<01:21,  1.38it/s]
 53%|█████▎    | 123/234 [00:31<01:20,  1.37it/s]
 53%|█████▎    | 124/234 [00:32<01:19,  1.38it/s]
 53%|█████▎    | 125/234 [00:33<01:19,  1.38it/s]
 54%|█████▍    | 126/234 [00:34<01:18,  1.38it/s]
 54%|█████▍    | 127/234 [00:34<01:17,  1.37it/s]
 55%|█████▍    | 128/234 [00:35<01:17,  1.37it/s]
 55%|█████▌    | 129/234 [00:36<01:16,  1.37it/s]
 56%|█████▌    | 130/234 [00:37<01:16,  1.37it/s]
 56%|█████▌    | 131/234 [00:37<01:15,  1.37it/s]
 56%|█████▋    | 132/234 [00:38<01:14,  1.37it/s]


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:39:41. Total running time: 21min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                     3        1            95.5237      0.639848     0.66787            1.9445                  142.451 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUS

 66%|██████▌   | 155/234 [00:54<00:55,  1.42it/s]
 67%|██████▋   | 156/234 [00:55<00:51,  1.51it/s]
[36m(_objective pid=11664)[0m 
  0%|          | 0/9 [00:00<?, ?it/s][A
[36m(_objective pid=11664)[0m 
 22%|██▏       | 2/9 [00:00<00:00,  8.62it/s][A
[36m(_objective pid=11664)[0m 
 33%|███▎      | 3/9 [00:00<00:01,  5.98it/s][A
[36m(_objective pid=11664)[0m 
 44%|████▍     | 4/9 [00:00<00:00,  5.25it/s][A
[36m(_objective pid=11664)[0m 
 56%|█████▌    | 5/9 [00:00<00:00,  4.92it/s][A
[36m(_objective pid=11664)[0m 
 67%|██████▋   | 6/9 [00:01<00:00,  4.71it/s][A
[36m(_objective pid=11664)[0m 
 78%|███████▊  | 7/9 [00:01<00:00,  4.57it/s][A
[36m(_objective pid=11664)[0m 
 89%|████████▉ | 8/9 [00:01<00:00,  4.50it/s][A
[36m(_objective pid=11664)[0m 
                                                 
 67%|██████▋   | 156/234 [00:57<00:51,  1.51it/s]
100%|██████████| 9/9 [00:01<00:00,  4.88it/s][A
                                             [A


[36m(_objective pid=11664)[0m {'eval_loss': 0.6226010322570801, 'eval_acc': 0.6859205776173285, 'eval_runtime': 2.0448, 'eval_samples_per_second': 135.465, 'eval_steps_per_second': 4.401, 'epoch': 2.0}
Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:40:11. Total running time: 22min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                     3        1            95.5237      0.639848     0.66787            1.9445            

[36m(_objective pid=11664)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000001)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00004 finished iteration 2 at 2024-06-09 14:40:14. Total running time: 22min 9s
+-----------------------------------------------------------+
| Trial _objective_165d9_00004 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000001 |
| time_this_iter_s                                 99.12694 |
| time_total_s                                     194.6506 |
| training_iteration                                      2 |
| epoch                                                  2. |
| eval_acc                                          0.68592 |
| eval_loss                                          0.6226 |
| eval_runtime                                       2.0448 |
| eval_samples_per_second                           135.465 |
| eval_steps_per_second                               4.401 |
| objective                                         0.68592 |
+---------------------------

[36m(_objective pid=11664)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
[36m(_objective pid=11664)[0m  67%|██████▋   | 156/234 [01:28<00:44,  1.77it/s]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r 67%|██████▋   | 156/234 [01:28<00:44,  1.77it/s]',) kwargs: {}
[36m(_objective pid=11664)[0m ValueError when calling 'flush' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_16


Trial _objective_165d9_00005 started with configuration:
+------------------------------------------------+
| Trial _objective_165d9_00005 config            |
+------------------------------------------------+
| learning_rate                            2e-05 |
| max_steps                                   -1 |
| num_train_epochs                             3 |
| per_device_eval_batch_size                  32 |
| per_device_train_batch_size                 32 |
| weight_decay                            0.0637 |
+------------------------------------------------+


[36m(_objective pid=12148)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00005_5_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000000)
[36m(_objective pid=12148)[0m 2024-06-09 14:40:20.426974: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=12148)[0m 2024-06-09 14:40:20.427029: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=12148)[0m 2024-06-09 14:40:20.428893: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=12148)[0m Some wei


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:40:41. Total running time: 22min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3        1           148.074       0.677644     0.581227           1.9146                  144.674 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAU

 40%|████      | 94/234 [00:11<00:38,  3.61it/s]
 41%|████      | 95/234 [00:12<00:44,  3.09it/s]
 41%|████      | 96/234 [00:12<00:51,  2.67it/s]
 41%|████▏     | 97/234 [00:13<00:58,  2.33it/s]
 42%|████▏     | 98/234 [00:14<01:05,  2.09it/s]
 42%|████▏     | 99/234 [00:14<01:10,  1.90it/s]
 43%|████▎     | 100/234 [00:15<01:15,  1.76it/s]
 43%|████▎     | 101/234 [00:16<01:19,  1.68it/s]
 44%|████▎     | 102/234 [00:16<01:22,  1.60it/s]
 44%|████▍     | 103/234 [00:17<01:24,  1.56it/s]
 44%|████▍     | 104/234 [00:18<01:25,  1.52it/s]
 45%|████▍     | 105/234 [00:18<01:26,  1.49it/s]
 45%|████▌     | 106/234 [00:19<01:26,  1.47it/s]
 46%|████▌     | 107/234 [00:20<01:27,  1.45it/s]
 46%|████▌     | 108/234 [00:21<01:27,  1.44it/s]
 47%|████▋     | 109/234 [00:21<01:27,  1.43it/s]
 47%|████▋     | 110/234 [00:22<01:27,  1.42it/s]
 47%|████▋     | 111/234 [00:23<01:26,  1.42it/s]
 48%|████▊     | 112/234 [00:23<01:26,  1.41it/s]
 48%|████▊     | 113/234 [00:24<01:25,  1.41it/s]
 49%|█

Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:41:11. Total running time: 23min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3        1           148.074       0.677644     0.581227           1.9146                  144.674 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUSE

 58%|█████▊    | 136/234 [00:41<01:11,  1.37it/s]
 59%|█████▊    | 137/234 [00:42<01:10,  1.38it/s]
 59%|█████▉    | 138/234 [00:42<01:09,  1.39it/s]
 59%|█████▉    | 139/234 [00:43<01:08,  1.39it/s]
 60%|█████▉    | 140/234 [00:44<01:07,  1.39it/s]
 60%|██████    | 141/234 [00:44<01:06,  1.40it/s]
 61%|██████    | 142/234 [00:45<01:05,  1.40it/s]
 61%|██████    | 143/234 [00:46<01:04,  1.40it/s]
 62%|██████▏   | 144/234 [00:47<01:03,  1.41it/s]
 62%|██████▏   | 145/234 [00:47<01:03,  1.41it/s]
 62%|██████▏   | 146/234 [00:48<01:02,  1.41it/s]
 63%|██████▎   | 147/234 [00:49<01:01,  1.41it/s]
 63%|██████▎   | 148/234 [00:49<01:00,  1.41it/s]
 64%|██████▎   | 149/234 [00:50<00:59,  1.42it/s]
 64%|██████▍   | 150/234 [00:51<00:59,  1.42it/s]
 65%|██████▍   | 151/234 [00:52<00:58,  1.42it/s]
 65%|██████▍   | 152/234 [00:52<00:57,  1.42it/s]
 65%|██████▌   | 153/234 [00:53<00:56,  1.42it/s]
 66%|██████▌   | 154/234 [00:54<00:56,  1.42it/s]
 66%|██████▌   | 155/234 [00:54<00:55,  1.43it/s]


[36m(_objective pid=12148)[0m {'eval_loss': 0.6496552228927612, 'eval_acc': 0.6028880866425993, 'eval_runtime': 2.0548, 'eval_samples_per_second': 134.806, 'eval_steps_per_second': 4.38, 'epoch': 2.0}
Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:41:41. Total running time: 23min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                     3        1           148.074       0.677644     0.581227           1.9146            

[36m(_objective pid=12148)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00005_5_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000001)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00005 finished iteration 2 at 2024-06-09 14:41:58. Total running time: 23min 52s
+-----------------------------------------------------------+
| Trial _objective_165d9_00005 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000001 |
| time_this_iter_s                                 98.67944 |
| time_total_s                                    246.75308 |
| training_iteration                                      2 |
| epoch                                                  2. |
| eval_acc                                          0.60289 |
| eval_loss                                         0.64966 |
| eval_runtime                                       2.0548 |
| eval_samples_per_second                           134.806 |
| eval_steps_per_second                                4.38 |
| objective                                         0.60289 |
+--------------------------

[36m(_objective pid=12633)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00006_6_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000000)
[36m(_objective pid=12633)[0m 2024-06-09 14:42:04.711750: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=12633)[0m 2024-06-09 14:42:04.711806: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=12633)[0m 2024-06-09 14:42:04.713719: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=12633)[0m Some wei


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:42:11. Total running time: 24min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                     2        1            91.3687      0.670322     0.581227           1.8513                  149.628 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUS

  0%|          | 0/156 [00:00<?, ?it/s]
 51%|█████     | 79/156 [00:01<00:01, 59.44it/s]
 54%|█████▍    | 85/156 [00:05<00:05, 12.40it/s]
 56%|█████▋    | 88/156 [00:07<00:08,  8.37it/s]
 58%|█████▊    | 90/156 [00:08<00:10,  6.59it/s]
 58%|█████▊    | 91/156 [00:09<00:11,  5.80it/s]
 59%|█████▉    | 92/156 [00:10<00:12,  4.99it/s]
 60%|█████▉    | 93/156 [00:10<00:14,  4.26it/s]
 60%|██████    | 94/156 [00:11<00:17,  3.62it/s]
 61%|██████    | 95/156 [00:12<00:19,  3.09it/s]
 62%|██████▏   | 96/156 [00:12<00:22,  2.67it/s]
 62%|██████▏   | 97/156 [00:13<00:25,  2.33it/s]
 63%|██████▎   | 98/156 [00:14<00:27,  2.09it/s]
 63%|██████▎   | 99/156 [00:14<00:29,  1.90it/s]
 64%|██████▍   | 100/156 [00:15<00:31,  1.76it/s]
 65%|██████▍   | 101/156 [00:16<00:32,  1.67it/s]
 65%|██████▌   | 102/156 [00:16<00:33,  1.60it/s]
 66%|██████▌   | 103/156 [00:17<00:34,  1.55it/s]
 67%|██████▋   | 104/156 [00:18<00:34,  1.51it/s]
 67%|██████▋   | 105/156 [00:19<00:34,  1.48it/s]
 68%|██████▊   | 106/15

Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:42:41. Total running time: 24min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                     2        1            91.3687      0.670322     0.581227           1.8513                  149.628 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUS

 74%|███████▎  | 115/156 [00:26<00:29,  1.41it/s]
 74%|███████▍  | 116/156 [00:26<00:28,  1.40it/s]
 75%|███████▌  | 117/156 [00:27<00:27,  1.39it/s]
 76%|███████▌  | 118/156 [00:28<00:27,  1.39it/s]
 76%|███████▋  | 119/156 [00:29<00:26,  1.39it/s]
 77%|███████▋  | 120/156 [00:29<00:26,  1.38it/s]
 78%|███████▊  | 121/156 [00:30<00:25,  1.38it/s]
 78%|███████▊  | 122/156 [00:31<00:24,  1.37it/s]
 79%|███████▉  | 123/156 [00:31<00:24,  1.37it/s]
 79%|███████▉  | 124/156 [00:32<00:23,  1.38it/s]
 80%|████████  | 125/156 [00:33<00:22,  1.38it/s]
 81%|████████  | 126/156 [00:34<00:21,  1.37it/s]
 81%|████████▏ | 127/156 [00:34<00:21,  1.37it/s]
 82%|████████▏ | 128/156 [00:35<00:20,  1.36it/s]
 83%|████████▎ | 129/156 [00:36<00:19,  1.36it/s]
 83%|████████▎ | 130/156 [00:37<00:19,  1.35it/s]
 84%|████████▍ | 131/156 [00:37<00:18,  1.36it/s]
 85%|████████▍ | 132/156 [00:38<00:17,  1.36it/s]
 85%|████████▌ | 133/156 [00:39<00:16,  1.37it/s]
 86%|████████▌ | 134/156 [00:40<00:16,  1.37it/s]


Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:43:11. Total running time: 25min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                     2        1            91.3687      0.670322     0.581227           1.8513                  149.628 |
| _objective_165d9_00000   PAUSED                      4        2           176.382       0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00001   PAUSE

[36m(_objective pid=12633)[0m 
 22%|██▏       | 2/9 [00:00<00:00,  8.69it/s][A
[36m(_objective pid=12633)[0m 
 33%|███▎      | 3/9 [00:00<00:01,  5.91it/s][A
[36m(_objective pid=12633)[0m 
 44%|████▍     | 4/9 [00:00<00:00,  5.18it/s][A
[36m(_objective pid=12633)[0m 
 56%|█████▌    | 5/9 [00:00<00:00,  4.93it/s][A
[36m(_objective pid=12633)[0m 
 67%|██████▋   | 6/9 [00:01<00:00,  4.74it/s][A
[36m(_objective pid=12633)[0m 
 78%|███████▊  | 7/9 [00:01<00:00,  4.59it/s][A
[36m(_objective pid=12633)[0m 
 89%|████████▉ | 8/9 [00:01<00:00,  4.51it/s][A
[36m(_objective pid=12633)[0m 
                                                 
100%|██████████| 156/156 [00:57<00:00,  1.52it/s]
100%|██████████| 9/9 [00:01<00:00,  4.89it/s][A
                                             [A


[36m(_objective pid=12633)[0m {'eval_loss': 0.6409385800361633, 'eval_acc': 0.6209386281588448, 'eval_runtime': 2.0437, 'eval_samples_per_second': 135.538, 'eval_steps_per_second': 4.404, 'epoch': 2.0}
Trial status: 6 PAUSED | 1 RUNNING | 1 PENDING
Current time: 2024-06-09 14:43:41. Total running time: 25min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                     2        1            91.3687      0.670322     0.581227           1.8513           

[36m(_objective pid=12633)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00006_6_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000001)
2024-06-09 14:43:52,165	ERROR tune_controller.py:1331 -- Trial task failed for trial _objective_165d9_00007
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2613, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/pytho


Trial _objective_165d9_00007 started with configuration:
+------------------------------------------------+
| Trial _objective_165d9_00007 config            |
+------------------------------------------------+
| learning_rate                            3e-05 |
| max_steps                                   -1 |
| num_train_epochs                             2 |
| per_device_eval_batch_size                  32 |
| per_device_train_batch_size                 64 |
| weight_decay                            0.1482 |
+------------------------------------------------+

Trial _objective_165d9_00007 errored after 1 iterations at 2024-06-09 14:43:52. Total running time: 25min 46s
Error file: /tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/driver_artifacts/_objective_165d9_00007_7_num_train_epochs=2_2024-06-09_14-18-05/error.txt

Trial _objective_165d9_00000 started with configuration:
+-------------------------------------------------+
| Trial _

[36m(_objective pid=13186)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=13186)[0m 2024-06-09 14:43:57.610064: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=13186)[0m 2024-06-09 14:43:57.610114: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=13186)[0m 2024-06-09 14:43:57.611460: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=13186)[0m Some wei


Trial status: 1 RUNNING | 1 PENDING | 5 PAUSED | 1 ERROR
Current time: 2024-06-09 14:44:11. Total running time: 26min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4        2            176.382      0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00002   PAUSED                      4        2            180.888      0.629296     0.66065            2.0424                  135.626 |
| _objective_165d9_00

 54%|█████▍    | 170/312 [00:09<00:11, 12.83it/s] 
 56%|█████▋    | 176/312 [00:14<00:15,  8.51it/s]
 57%|█████▋    | 179/312 [00:16<00:19,  6.99it/s]
 58%|█████▊    | 181/312 [00:17<00:21,  6.05it/s]
 59%|█████▊    | 183/312 [00:18<00:25,  5.14it/s]
 59%|█████▉    | 184/312 [00:19<00:27,  4.68it/s]
 59%|█████▉    | 185/312 [00:20<00:30,  4.19it/s]
 60%|█████▉    | 186/312 [00:21<00:34,  3.70it/s]
 60%|█████▉    | 187/312 [00:21<00:38,  3.23it/s]
 60%|██████    | 188/312 [00:22<00:43,  2.83it/s]
 61%|██████    | 189/312 [00:23<00:49,  2.48it/s]
 61%|██████    | 190/312 [00:23<00:55,  2.20it/s]
 61%|██████    | 191/312 [00:24<01:00,  1.98it/s]
 62%|██████▏   | 192/312 [00:25<01:05,  1.82it/s]
 62%|██████▏   | 193/312 [00:26<01:09,  1.70it/s]
 62%|██████▏   | 194/312 [00:26<01:13,  1.61it/s]
 62%|██████▎   | 195/312 [00:27<01:15,  1.55it/s]
 63%|██████▎   | 196/312 [00:28<01:17,  1.50it/s]
 63%|██████▎   | 197/312 [00:28<01:18,  1.46it/s]
 63%|██████▎   | 198/312 [00:29<01:19,  1.43it/s]

Trial status: 1 RUNNING | 1 PENDING | 5 PAUSED | 1 ERROR
Current time: 2024-06-09 14:44:42. Total running time: 26min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4        2            176.382      0.65898      0.613718           2.0216                  137.022 |
| _objective_165d9_00002   PAUSED                      4        2            180.888      0.629296     0.66065            2.0424                  135.626 |
| _objective_165d9_00

 66%|██████▌   | 205/312 [00:34<01:17,  1.38it/s]
 66%|██████▌   | 206/312 [00:35<01:17,  1.37it/s]
 66%|██████▋   | 207/312 [00:36<01:16,  1.37it/s]
 67%|██████▋   | 208/312 [00:36<01:16,  1.36it/s]
 67%|██████▋   | 209/312 [00:37<01:15,  1.37it/s]
 67%|██████▋   | 210/312 [00:38<01:14,  1.37it/s]
 68%|██████▊   | 211/312 [00:39<01:14,  1.36it/s]
 68%|██████▊   | 212/312 [00:39<01:13,  1.37it/s]
 68%|██████▊   | 213/312 [00:40<01:12,  1.37it/s]
 69%|██████▊   | 214/312 [00:41<01:11,  1.37it/s]
 69%|██████▉   | 215/312 [00:42<01:10,  1.37it/s]
 69%|██████▉   | 216/312 [00:42<01:09,  1.38it/s]
 70%|██████▉   | 217/312 [00:43<01:08,  1.38it/s]
 70%|██████▉   | 218/312 [00:44<01:08,  1.38it/s]
 70%|███████   | 219/312 [00:44<01:07,  1.39it/s]
 71%|███████   | 220/312 [00:45<01:06,  1.39it/s]
 71%|███████   | 221/312 [00:46<01:05,  1.40it/s]
 71%|███████   | 222/312 [00:47<01:04,  1.40it/s]
 71%|███████▏  | 223/312 [00:47<01:03,  1.40it/s]
 72%|███████▏  | 224/312 [00:48<01:02,  1.41it/s]


[36m(_objective pid=13186)[0m {'eval_loss': 0.6581993103027344, 'eval_acc': 0.6570397111913358, 'eval_runtime': 2.047, 'eval_samples_per_second': 135.317, 'eval_steps_per_second': 4.397, 'epoch': 3.0}
Trial status: 1 RUNNING | 1 PENDING | 5 PAUSED | 1 ERROR
Current time: 2024-06-09 14:45:12. Total running time: 27min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                     4        2            176.382      0.65898      0.613718           2.0216   

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00000 finished iteration 3 at 2024-06-09 14:45:26. Total running time: 27min 21s
+-----------------------------------------------------------+
| Trial _objective_165d9_00000 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000002 |
| time_this_iter_s                                 89.88703 |
| time_total_s                                    266.26859 |
| training_iteration                                      3 |
| epoch                                                  3. |
| eval_acc                                          0.65704 |
| eval_loss                                          0.6582 |
| eval_runtime                                        2.047 |
| eval_samples_per_second                           135.317 |
| eval_steps_per_second                               4.397 |
| objective                                         0.65704 |
+--------------------------

[36m(_objective pid=13186)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000002)



Trial _objective_165d9_00001 started with configuration:
+------------------------------------------------+
| Trial _objective_165d9_00001 config            |
+------------------------------------------------+
| learning_rate                            2e-05 |
| max_steps                                   -1 |
| num_train_epochs                             5 |
| per_device_eval_batch_size                  32 |
| per_device_train_batch_size                 32 |
| weight_decay                            0.1796 |
+------------------------------------------------+


[36m(_objective pid=13629)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=13629)[0m 2024-06-09 14:45:31.817460: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=13629)[0m 2024-06-09 14:45:31.817508: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=13629)[0m 2024-06-09 14:45:31.818840: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=13629)[0m Some wei


Trial status: 5 PAUSED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:45:42. Total running time: 27min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5        2            178.333      0.64892      0.631769           2.0251                  136.787 |
| _objective_165d9_00000   PAUSED                      4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_0

  0%|          | 0/390 [00:00<?, ?it/s]
 40%|████      | 157/390 [00:01<00:01, 118.47it/s]
 43%|████▎     | 169/390 [00:09<00:16, 13.59it/s] 
 45%|████▍     | 174/390 [00:12<00:23,  9.32it/s]
 45%|████▌     | 177/390 [00:14<00:28,  7.49it/s]
 46%|████▌     | 179/390 [00:16<00:33,  6.39it/s]
 46%|████▋     | 181/390 [00:17<00:38,  5.37it/s]
 47%|████▋     | 182/390 [00:18<00:42,  4.86it/s]
 47%|████▋     | 183/390 [00:18<00:47,  4.33it/s]
 47%|████▋     | 184/390 [00:19<00:54,  3.80it/s]
 47%|████▋     | 185/390 [00:20<01:02,  3.29it/s]
 48%|████▊     | 186/390 [00:21<01:11,  2.86it/s]
 48%|████▊     | 187/390 [00:21<01:21,  2.50it/s]
 48%|████▊     | 188/390 [00:22<01:31,  2.22it/s]
 48%|████▊     | 189/390 [00:23<01:40,  2.00it/s]
 49%|████▊     | 190/390 [00:23<01:49,  1.83it/s]
 49%|████▉     | 191/390 [00:24<01:56,  1.71it/s]
 49%|████▉     | 192/390 [00:25<02:01,  1.63it/s]
 49%|████▉     | 193/390 [00:26<02:06,  1.56it/s]
 50%|████▉     | 194/390 [00:26<02:09,  1.52it/s]
 50%|███

Trial status: 5 PAUSED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:46:12. Total running time: 28min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5        2            178.333      0.64892      0.631769           2.0251                  136.787 |
| _objective_165d9_00000   PAUSED                      4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_000

 51%|█████     | 198/390 [00:29<02:14,  1.42it/s]
 51%|█████     | 199/390 [00:30<02:15,  1.41it/s]
 51%|█████▏    | 200/390 [00:31<02:14,  1.41it/s]
 52%|█████▏    | 201/390 [00:31<02:15,  1.40it/s]
 52%|█████▏    | 202/390 [00:32<02:15,  1.39it/s]
 52%|█████▏    | 203/390 [00:33<02:15,  1.38it/s]
 52%|█████▏    | 204/390 [00:33<02:14,  1.38it/s]
 53%|█████▎    | 205/390 [00:34<02:14,  1.38it/s]
 53%|█████▎    | 206/390 [00:35<02:14,  1.37it/s]
 53%|█████▎    | 207/390 [00:36<02:13,  1.37it/s]
 53%|█████▎    | 208/390 [00:36<02:13,  1.36it/s]
 54%|█████▎    | 209/390 [00:37<02:12,  1.36it/s]
 54%|█████▍    | 210/390 [00:38<02:12,  1.36it/s]
 54%|█████▍    | 211/390 [00:39<02:11,  1.36it/s]
 54%|█████▍    | 212/390 [00:39<02:10,  1.36it/s]
 55%|█████▍    | 213/390 [00:40<02:09,  1.36it/s]
 55%|█████▍    | 214/390 [00:41<02:08,  1.37it/s]
 55%|█████▌    | 215/390 [00:42<02:07,  1.37it/s]
 55%|█████▌    | 216/390 [00:42<02:06,  1.38it/s]
 56%|█████▌    | 217/390 [00:43<02:05,  1.38it/s]


[36m(_objective pid=13629)[0m {'eval_loss': 0.6645092368125916, 'eval_acc': 0.6389891696750902, 'eval_runtime': 2.0506, 'eval_samples_per_second': 135.08, 'eval_steps_per_second': 4.389, 'epoch': 3.0}
Trial status: 5 PAUSED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:46:42. Total running time: 28min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                     5        2            178.333      0.64892      0.631769           2.0251  

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(_objective pid=13629)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000002)



Trial _objective_165d9_00001 finished iteration 3 at 2024-06-09 14:46:57. Total running time: 28min 52s
+-----------------------------------------------------------+
| Trial _objective_165d9_00001 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000002 |
| time_this_iter_s                                 86.75413 |
| time_total_s                                    265.08685 |
| training_iteration                                      3 |
| epoch                                                  3. |
| eval_acc                                          0.63899 |
| eval_loss                                         0.66451 |
| eval_runtime                                       2.0506 |
| eval_samples_per_second                            135.08 |
| eval_steps_per_second                               4.389 |
| objective                                         0.63899 |
+--------------------------

[36m(_objective pid=14058)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=14058)[0m 2024-06-09 14:47:04.359447: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=14058)[0m 2024-06-09 14:47:04.359505: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=14058)[0m 2024-06-09 14:47:04.361248: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=14058)[0m Some wei


Trial status: 5 PAUSED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:47:12. Total running time: 29min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4        2            180.888      0.629296     0.66065            2.0424                  135.626 |
| _objective_165d9_00000   PAUSED                      4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_00

  0%|          | 0/312 [00:00<?, ?it/s]
 50%|█████     | 157/312 [00:01<00:01, 118.53it/s]
 54%|█████▍    | 169/312 [00:09<00:10, 13.58it/s] 
 56%|█████▌    | 174/312 [00:12<00:14,  9.33it/s]
 57%|█████▋    | 177/312 [00:14<00:18,  7.49it/s]
 57%|█████▋    | 179/312 [00:16<00:20,  6.38it/s]
 58%|█████▊    | 181/312 [00:17<00:24,  5.34it/s]
 58%|█████▊    | 182/312 [00:18<00:26,  4.83it/s]
 59%|█████▊    | 183/312 [00:19<00:30,  4.29it/s]
 59%|█████▉    | 184/312 [00:19<00:34,  3.76it/s]
 59%|█████▉    | 185/312 [00:20<00:38,  3.27it/s]
 60%|█████▉    | 186/312 [00:21<00:44,  2.85it/s]
 60%|█████▉    | 187/312 [00:21<00:50,  2.48it/s]
 60%|██████    | 188/312 [00:22<00:56,  2.21it/s]
 61%|██████    | 189/312 [00:23<01:01,  2.00it/s]
 61%|██████    | 190/312 [00:24<01:06,  1.83it/s]
 61%|██████    | 191/312 [00:24<01:10,  1.71it/s]
 62%|██████▏   | 192/312 [00:25<01:13,  1.63it/s]
 62%|██████▏   | 193/312 [00:26<01:16,  1.56it/s]
 62%|██████▏   | 194/312 [00:26<01:17,  1.51it/s]


Trial status: 5 PAUSED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:47:42. Total running time: 29min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4        2            180.888      0.629296     0.66065            2.0424                  135.626 |
| _objective_165d9_00000   PAUSED                      4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_00

 62%|██████▎   | 195/312 [00:27<01:19,  1.48it/s]
 63%|██████▎   | 196/312 [00:28<01:19,  1.45it/s]
 63%|██████▎   | 197/312 [00:29<01:20,  1.43it/s]
 63%|██████▎   | 198/312 [00:29<01:20,  1.41it/s]
 64%|██████▍   | 199/312 [00:30<01:20,  1.40it/s]
 64%|██████▍   | 200/312 [00:31<01:20,  1.40it/s]
 64%|██████▍   | 201/312 [00:31<01:19,  1.39it/s]
 65%|██████▍   | 202/312 [00:32<01:19,  1.38it/s]
 65%|██████▌   | 203/312 [00:33<01:19,  1.38it/s]
 65%|██████▌   | 204/312 [00:34<01:18,  1.37it/s]
 66%|██████▌   | 205/312 [00:34<01:18,  1.37it/s]
 66%|██████▌   | 206/312 [00:35<01:17,  1.36it/s]
 66%|██████▋   | 207/312 [00:36<01:17,  1.35it/s]
 67%|██████▋   | 208/312 [00:37<01:16,  1.36it/s]
 67%|██████▋   | 209/312 [00:37<01:15,  1.36it/s]
 67%|██████▋   | 210/312 [00:38<01:14,  1.36it/s]
 68%|██████▊   | 211/312 [00:39<01:14,  1.36it/s]
 68%|██████▊   | 212/312 [00:40<01:13,  1.36it/s]
 68%|██████▊   | 213/312 [00:40<01:12,  1.36it/s]
 69%|██████▊   | 214/312 [00:41<01:11,  1.37it/s]


Trial status: 5 PAUSED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:48:12. Total running time: 30min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                     4        2            180.888      0.629296     0.66065            2.0424                  135.626 |
| _objective_165d9_00000   PAUSED                      4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_000

[36m(_objective pid=14058)[0m 
[36m(_objective pid=14058)[0m  89%|████████▉ | 8/9 [00:01<00:00,  4.48it/s][A
[36m(_objective pid=14058)[0m 
[36m(_objective pid=14058)[0m                                                  
[36m(_objective pid=14058)[0m                                              [A 75%|███████▌  | 234/312 [00:57<00:51,  1.51it/s]
[36m(_objective pid=14058)[0m 100%|██████████| 9/9 [00:01<00:00,  4.84it/s][A
[36m(_objective pid=14058)[0m                                              [A



Trial _objective_165d9_00002 finished iteration 3 at 2024-06-09 14:48:33. Total running time: 30min 28s
+-----------------------------------------------------------+
| Trial _objective_165d9_00002 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000002 |
| time_this_iter_s                                 90.70175 |
| time_total_s                                    271.58973 |
| training_iteration                                      3 |
| epoch                                                  3. |
| eval_acc                                          0.67148 |
| eval_loss                                         0.69701 |
| eval_runtime                                       2.0547 |
| eval_samples_per_second                           134.815 |
| eval_steps_per_second                                4.38 |
| objective                                         0.67148 |
+--------------------------

[36m(_objective pid=14058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000002)



Trial _objective_165d9_00003 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00003 config             |
+-------------------------------------------------+
| learning_rate                             4e-05 |
| max_steps                                    -1 |
| num_train_epochs                              2 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.18033 |
+-------------------------------------------------+


[36m(_objective pid=14512)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00003_3_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=14512)[0m 2024-06-09 14:48:40.241596: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=14512)[0m 2024-06-09 14:48:40.241648: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=14512)[0m 2024-06-09 14:48:40.242939: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Trial status: 5 PAUSED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:48:42. Total running time: 30min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00003   RUNNING                     2        2            178.896      0.639288     0.624549           2.0475                  135.29  |
| _objective_165d9_00000   PAUSED                      4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_0

[36m(_objective pid=14512)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
[36m(_objective pid=14512)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Trial _objective_165d9_00003 completed after 2 iterations at 2024-06-09 14:48:50. Total running time: 30min 45s
[36m(_objective pid=14512)[0m {'train_runtime': 0.0043, 'train_samples_per_second': 1164110.456, 'train_steps_per_second': 36466.111, 'train_loss': 0.0, 'epoch': 2.0}


[36m(_objective pid=14512)[0m   0%|          | 0/156 [00:00<?, ?it/s]                                         0%|          | 0/156 [00:00<?, ?it/s]  0%|          | 0/156 [00:00<?, ?it/s]



Trial _objective_165d9_00004 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00004 config             |
+-------------------------------------------------+
| learning_rate                             4e-05 |
| max_steps                                    -1 |
| num_train_epochs                              3 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.01692 |
+-------------------------------------------------+


[36m(_objective pid=14614)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=14614)[0m 2024-06-09 14:48:55.603649: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=14614)[0m 2024-06-09 14:48:55.603712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=14614)[0m 2024-06-09 14:48:55.605225: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=14614)[0m Some wei


Trial status: 4 PAUSED | 1 TERMINATED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:49:12. Total running time: 31min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                       3        2            194.651      0.622601     0.685921           2.0448                  135.465 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 72%|███████▏  | 169/234 [00:09<00:04, 13.62it/s] 
 74%|███████▍  | 174/234 [00:12<00:06,  9.31it/s]
 76%|███████▌  | 177/234 [00:14<00:07,  7.48it/s]
 76%|███████▋  | 179/234 [00:16<00:08,  6.37it/s]
 77%|███████▋  | 181/234 [00:17<00:09,  5.35it/s]
 78%|███████▊  | 182/234 [00:18<00:10,  4.83it/s]
 78%|███████▊  | 183/234 [00:19<00:11,  4.30it/s]
 79%|███████▊  | 184/234 [00:19<00:13,  3.77it/s]
 79%|███████▉  | 185/234 [00:20<00:14,  3.28it/s]
 79%|███████▉  | 186/234 [00:21<00:16,  2.85it/s]
 80%|███████▉  | 187/234 [00:21<00:18,  2.49it/s]
 80%|████████  | 188/234 [00:22<00:20,  2.21it/s]
 81%|████████  | 189/234 [00:23<00:22,  1.99it/s]
 81%|████████  | 190/234 [00:24<00:24,  1.82it/s]
 82%|████████▏ | 191/234 [00:24<00:25,  1.70it/s]
 82%|████████▏ | 192/234 [00:25<00:26,  1.61it/s]
 82%|████████▏ | 193/234 [00:26<00:26,  1.54it/s]
 83%|████████▎ | 194/234 [00:26<00:26,  1.49it/s]
 83%|████████▎ | 195/234 [00:27<00:26,  1.46it/s]
 84%|████████▍ | 196/234 [00:28<00:26,  1.43it/s]

Trial status: 4 PAUSED | 1 TERMINATED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:49:42. Total running time: 31min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                       3        2            194.651      0.622601     0.685921           2.0448                  135.465 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 89%|████████▉ | 208/234 [00:37<00:19,  1.36it/s]
 89%|████████▉ | 209/234 [00:37<00:18,  1.37it/s]
 90%|████████▉ | 210/234 [00:38<00:17,  1.37it/s]
 90%|█████████ | 211/234 [00:39<00:16,  1.37it/s]
 91%|█████████ | 212/234 [00:40<00:16,  1.37it/s]
 91%|█████████ | 213/234 [00:40<00:15,  1.37it/s]
 91%|█████████▏| 214/234 [00:41<00:14,  1.37it/s]
 92%|█████████▏| 215/234 [00:42<00:13,  1.37it/s]
 92%|█████████▏| 216/234 [00:42<00:13,  1.37it/s]
 93%|█████████▎| 217/234 [00:43<00:12,  1.38it/s]
 93%|█████████▎| 218/234 [00:44<00:11,  1.38it/s]
 94%|█████████▎| 219/234 [00:45<00:10,  1.38it/s]
 94%|█████████▍| 220/234 [00:45<00:10,  1.38it/s]
 94%|█████████▍| 221/234 [00:46<00:09,  1.39it/s]
 95%|█████████▍| 222/234 [00:47<00:08,  1.40it/s]
 95%|█████████▌| 223/234 [00:47<00:07,  1.41it/s]
 96%|█████████▌| 224/234 [00:48<00:07,  1.42it/s]
 96%|█████████▌| 225/234 [00:49<00:06,  1.43it/s]
 97%|█████████▋| 226/234 [00:50<00:05,  1.43it/s]
 97%|█████████▋| 227/234 [00:50<00:04,  1.43it/s]


[36m(_objective pid=14614)[0m {'eval_loss': 0.6531721353530884, 'eval_acc': 0.6967509025270758, 'eval_runtime': 2.0428, 'eval_samples_per_second': 135.596, 'eval_steps_per_second': 4.406, 'epoch': 3.0}


[36m(_objective pid=14614)[0m 
[36m(_objective pid=14614)[0m                                                  
[36m(_objective pid=14614)[0m                                              [A100%|██████████| 234/234 [00:57<00:00,  1.52it/s]
[36m(_objective pid=14614)[0m 100%|██████████| 9/9 [00:01<00:00,  4.85it/s][A
[36m(_objective pid=14614)[0m                                              [A


Trial status: 4 PAUSED | 1 TERMINATED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:50:12. Total running time: 32min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                       3        2            194.651      0.622601     0.685921           2.0448                  135.465 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29 

[36m(_objective pid=14614)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000002)



Trial _objective_165d9_00004 finished iteration 3 at 2024-06-09 14:50:40. Total running time: 32min 34s
+-----------------------------------------------------------+
| Trial _objective_165d9_00004 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000002 |
| time_this_iter_s                                105.44416 |
| time_total_s                                    300.09476 |
| training_iteration                                      3 |
| epoch                                                  3. |
| eval_acc                                          0.69675 |
| eval_loss                                         0.65317 |
| eval_runtime                                       2.0428 |
| eval_samples_per_second                           135.596 |
| eval_steps_per_second                               4.406 |
| objective                                         0.69675 |
+--------------------------

[36m(_objective pid=15117)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00005_5_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=15117)[0m 2024-06-09 14:50:46.381995: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=15117)[0m 2024-06-09 14:50:46.382059: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=15117)[0m 2024-06-09 14:50:46.387456: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=15117)[0m Some wei


Trial status: 4 PAUSED | 1 TERMINATED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:51:12. Total running time: 33min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                       3        2            246.753      0.649655     0.602888           2.0548                  134.806 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 76%|███████▋  | 179/234 [00:16<00:08,  6.39it/s]
 77%|███████▋  | 181/234 [00:17<00:09,  5.36it/s]
 78%|███████▊  | 182/234 [00:18<00:10,  4.84it/s]
 78%|███████▊  | 183/234 [00:18<00:11,  4.31it/s]
 79%|███████▊  | 184/234 [00:19<00:13,  3.78it/s]
 79%|███████▉  | 185/234 [00:20<00:14,  3.28it/s]
 79%|███████▉  | 186/234 [00:21<00:16,  2.85it/s]
 80%|███████▉  | 187/234 [00:21<00:18,  2.49it/s]
 80%|████████  | 188/234 [00:22<00:20,  2.21it/s]
 81%|████████  | 189/234 [00:23<00:22,  1.99it/s]
 81%|████████  | 190/234 [00:23<00:24,  1.83it/s]
 82%|████████▏ | 191/234 [00:24<00:25,  1.71it/s]
 82%|████████▏ | 192/234 [00:25<00:25,  1.62it/s]
 82%|████████▏ | 193/234 [00:26<00:26,  1.55it/s]
 83%|████████▎ | 194/234 [00:26<00:26,  1.50it/s]
 83%|████████▎ | 195/234 [00:27<00:26,  1.47it/s]
 84%|████████▍ | 196/234 [00:28<00:26,  1.45it/s]
 84%|████████▍ | 197/234 [00:28<00:26,  1.42it/s]
 85%|████████▍ | 198/234 [00:29<00:25,  1.40it/s]
 85%|████████▌ | 199/234 [00:30<00:25,  1.40it/s]


Trial status: 4 PAUSED | 1 TERMINATED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:51:42. Total running time: 33min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                       3        2            246.753      0.649655     0.602888           2.0548                  134.806 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 94%|█████████▍| 220/234 [00:45<00:10,  1.40it/s]
 94%|█████████▍| 221/234 [00:46<00:09,  1.41it/s]
 95%|█████████▍| 222/234 [00:47<00:08,  1.40it/s]
 95%|█████████▌| 223/234 [00:47<00:07,  1.40it/s]
 96%|█████████▌| 224/234 [00:48<00:07,  1.42it/s]
 96%|█████████▌| 225/234 [00:49<00:06,  1.42it/s]
 97%|█████████▋| 226/234 [00:49<00:05,  1.41it/s]
 97%|█████████▋| 227/234 [00:50<00:04,  1.42it/s]
 97%|█████████▋| 228/234 [00:51<00:04,  1.42it/s]
 98%|█████████▊| 229/234 [00:52<00:03,  1.43it/s]
 98%|█████████▊| 230/234 [00:52<00:02,  1.43it/s]
 99%|█████████▊| 231/234 [00:53<00:02,  1.43it/s]
 99%|█████████▉| 232/234 [00:54<00:01,  1.43it/s]
100%|█████████▉| 233/234 [00:54<00:00,  1.44it/s]
100%|██████████| 234/234 [00:55<00:00,  1.52it/s]
[36m(_objective pid=15117)[0m 
  0%|          | 0/9 [00:00<?, ?it/s][A
[36m(_objective pid=15117)[0m 
 22%|██▏       | 2/9 [00:00<00:00,  8.51it/s][A
[36m(_objective pid=15117)[0m 
 33%|███▎      | 3/9 [00:00<00:01,  5.93it/s][A
[36m(_objec

[36m(_objective pid=15117)[0m {'eval_loss': 0.6421403288841248, 'eval_acc': 0.6353790613718412, 'eval_runtime': 2.0472, 'eval_samples_per_second': 135.304, 'eval_steps_per_second': 4.396, 'epoch': 3.0}
Trial status: 4 PAUSED | 1 TERMINATED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 14:52:13. Total running time: 34min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00005   RUNNING                       3        2            246.753      0.649655     0.60

[36m(_objective pid=15117)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00005_5_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000002)
2024-06-09 14:52:27,340	INFO pbt.py:878 -- 

[PopulationBasedTraining] [Exploit] Cloning trial 165d9_00002 (score = 0.671480) into trial 165d9_00005 (score = 0.635379)

2024-06-09 14:52:27,342	INFO pbt.py:905 -- 

[PopulationBasedTraining] [Explore] Perturbed the hyperparameter config of trial165d9_00005:
per_device_train_batch_size : 32 --- (shift left) --> 16
weight_decay : 0.029992474745400864 --- (* 1.2) --> 0.03599096969448103
learning_rate : 2.836995567863469e-05 --- (* 0.8) --> 2.2695964542907755e-05




Trial _objective_165d9_00005 finished iteration 3 at 2024-06-09 14:52:27. Total running time: 34min 22s
+-----------------------------------------------------------+
| Trial _objective_165d9_00005 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000002 |
| time_this_iter_s                                102.30624 |
| time_total_s                                    349.05932 |
| training_iteration                                      3 |
| epoch                                                  3. |
| eval_acc                                          0.63538 |
| eval_loss                                         0.64214 |
| eval_runtime                                       2.0472 |
| eval_samples_per_second                           135.304 |
| eval_steps_per_second                               4.396 |
| objective                                         0.63538 |
+--------------------------

[36m(_objective pid=15613)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00006_6_num_train_epochs=2_2024-06-09_14-18-05/checkpoint_000001)
[36m(_objective pid=15613)[0m 2024-06-09 14:52:34.659273: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=15613)[0m 2024-06-09 14:52:34.659337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=15613)[0m 2024-06-09 14:52:34.663615: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=15613)[0m Some wei


Trial status: 1 PENDING | 4 PAUSED | 1 TERMINATED | 1 RUNNING | 1 ERROR
Current time: 2024-06-09 14:52:43. Total running time: 34min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00006   RUNNING                       2        2            193.239      0.640939     0.620939           2.0437                  135.538 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.2

[36m(_objective pid=15613)[0m   0%|          | 0/156 [00:00<?, ?it/s]                                         0%|          | 0/156 [00:00<?, ?it/s]  0%|          | 0/156 [00:00<?, ?it/s]
[36m(_objective pid=15613)[0m Exception ignored in atexit callback: <function shutdown at 0x7f6569c0ca60>
[36m(_objective pid=15613)[0m Traceback (most recent call last):
[36m(_objective pid=15613)[0m   File "/usr/lib/python3.10/logging/__init__.py", line 2183, in shutdown
[36m(_objective pid=15613)[0m     h.close()
[36m(_objective pid=15613)[0m   File "/usr/local/lib/python3.10/dist-packages/absl/logging/__init__.py", line 944, in close
[36m(_objective pid=15613)[0m     self.stream.close()
[36m(_objective pid=15613)[0m AttributeError: 'Tee' object has no attribute 'close'



Trial _objective_165d9_00000 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00000 config             |
+-------------------------------------------------+
| learning_rate                             2e-05 |
| max_steps                                    -1 |
| num_train_epochs                              4 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.23896 |
+-------------------------------------------------+


[36m(_objective pid=15727)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000002)
[36m(_objective pid=15727)[0m 2024-06-09 14:52:52.190266: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=15727)[0m 2024-06-09 14:52:52.190337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=15727)[0m 2024-06-09 14:52:52.191701: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=15727)[0m Some wei


Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:53:13. Total running time: 35min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 81%|████████  | 253/312 [00:13<00:04, 13.93it/s] 
 81%|████████▏ | 254/312 [00:14<00:04, 13.06it/s]
 84%|████████▍ | 262/312 [00:19<00:06,  7.83it/s]
 86%|████████▌ | 267/312 [00:23<00:07,  5.96it/s]
 87%|████████▋ | 270/312 [00:25<00:08,  5.04it/s]
 87%|████████▋ | 272/312 [00:27<00:09,  4.44it/s]
 88%|████████▊ | 274/312 [00:28<00:09,  3.87it/s]
 88%|████████▊ | 275/312 [00:29<00:10,  3.57it/s]
 88%|████████▊ | 276/312 [00:29<00:11,  3.25it/s]
 89%|████████▉ | 277/312 [00:30<00:11,  2.92it/s]
 89%|████████▉ | 278/312 [00:31<00:12,  2.62it/s]
 89%|████████▉ | 279/312 [00:32<00:14,  2.34it/s]
 90%|████████▉ | 280/312 [00:32<00:15,  2.11it/s]
 90%|█████████ | 281/312 [00:33<00:16,  1.91it/s]
 90%|█████████ | 282/312 [00:34<00:17,  1.76it/s]
 91%|█████████ | 283/312 [00:35<00:17,  1.66it/s]
 91%|█████████ | 284/312 [00:35<00:17,  1.57it/s]
 91%|█████████▏| 285/312 [00:36<00:17,  1.52it/s]
 92%|█████████▏| 286/312 [00:37<00:17,  1.48it/s]
 92%|█████████▏| 287/312 [00:38<00:17,  1.45it/s]

Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:53:43. Total running time: 35min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 93%|█████████▎| 290/312 [00:40<00:15,  1.41it/s]
 93%|█████████▎| 291/312 [00:40<00:14,  1.40it/s]
 94%|█████████▎| 292/312 [00:41<00:14,  1.40it/s]
 94%|█████████▍| 293/312 [00:42<00:13,  1.39it/s]
 94%|█████████▍| 294/312 [00:43<00:12,  1.39it/s]
 95%|█████████▍| 295/312 [00:43<00:12,  1.40it/s]
 95%|█████████▍| 296/312 [00:44<00:11,  1.41it/s]
 95%|█████████▌| 297/312 [00:45<00:10,  1.41it/s]
 96%|█████████▌| 298/312 [00:45<00:09,  1.42it/s]
 96%|█████████▌| 299/312 [00:46<00:09,  1.43it/s]
 96%|█████████▌| 300/312 [00:47<00:08,  1.43it/s]
 96%|█████████▋| 301/312 [00:48<00:07,  1.42it/s]
 97%|█████████▋| 302/312 [00:48<00:06,  1.43it/s]
 97%|█████████▋| 303/312 [00:49<00:06,  1.43it/s]
 97%|█████████▋| 304/312 [00:50<00:05,  1.44it/s]
 98%|█████████▊| 305/312 [00:50<00:04,  1.44it/s]
 98%|█████████▊| 306/312 [00:51<00:04,  1.44it/s]
 98%|█████████▊| 307/312 [00:52<00:03,  1.44it/s]
 99%|█████████▊| 308/312 [00:52<00:02,  1.44it/s]
 99%|█████████▉| 309/312 [00:53<00:02,  1.45it/s]


[36m(_objective pid=15727)[0m {'eval_loss': 0.6708900928497314, 'eval_acc': 0.6209386281588448, 'eval_runtime': 2.0241, 'eval_samples_per_second': 136.849, 'eval_steps_per_second': 4.446, 'epoch': 4.0}
Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:54:13. Total running time: 36min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            266.269      0.658199     0.65



Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:54:43. Total running time: 36min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            266.269      0.658199     0.65704            2.047                   135.317 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

2024-06-09 14:54:47,534	INFO pbt.py:878 -- 

[PopulationBasedTraining] [Exploit] Cloning trial 165d9_00002 (score = 0.671480) into trial 165d9_00000 (score = 0.620939)

2024-06-09 14:54:47,536	INFO pbt.py:905 -- 

[PopulationBasedTraining] [Explore] Perturbed the hyperparameter config of trial165d9_00000:
per_device_train_batch_size : 32 --- (resample) --> 32
weight_decay : 0.029992474745400864 --- (* 0.8) --> 0.023993979796320692
learning_rate : 2.836995567863469e-05 --- (* 0.8) --> 2.2695964542907755e-05

[36m(_objective pid=15727)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000003)



Trial _objective_165d9_00000 finished iteration 4 at 2024-06-09 14:54:47. Total running time: 36min 42s
+-----------------------------------------------------------+
| Trial _objective_165d9_00000 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000003 |
| time_this_iter_s                                116.22379 |
| time_total_s                                    382.49238 |
| training_iteration                                      4 |
| epoch                                                  4. |
| eval_acc                                          0.62094 |
| eval_loss                                         0.67089 |
| eval_runtime                                       2.0241 |
| eval_samples_per_second                           136.849 |
| eval_steps_per_second                               4.446 |
| objective                                         0.62094 |
+--------------------------

[36m(_objective pid=16287)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000002)
[36m(_objective pid=16287)[0m 2024-06-09 14:54:53.952722: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=16287)[0m 2024-06-09 14:54:53.952777: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=16287)[0m 2024-06-09 14:54:53.954547: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=16287)[0m Some wei


Trial status: 1 PENDING | 1 RUNNING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:55:13. Total running time: 37min 8s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                       5        3            265.087      0.664509     0.638989           2.0506                  135.08  |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 65%|██████▌   | 254/390 [00:14<00:10, 13.32it/s] 
 65%|██████▌   | 255/390 [00:14<00:10, 12.53it/s]
 67%|██████▋   | 263/390 [00:20<00:16,  7.65it/s]
 69%|██████▊   | 268/390 [00:24<00:20,  5.87it/s]
 69%|██████▉   | 271/390 [00:26<00:23,  4.96it/s]
 70%|███████   | 273/390 [00:27<00:26,  4.39it/s]
 71%|███████   | 275/390 [00:29<00:29,  3.83it/s]
 71%|███████   | 276/390 [00:29<00:32,  3.54it/s]
 71%|███████   | 277/390 [00:30<00:35,  3.21it/s]
 71%|███████▏  | 278/390 [00:31<00:38,  2.91it/s]
 72%|███████▏  | 279/390 [00:32<00:42,  2.60it/s]
 72%|███████▏  | 280/390 [00:33<00:54,  2.03it/s]
 72%|███████▏  | 281/390 [00:34<00:57,  1.90it/s]
 72%|███████▏  | 282/390 [00:34<01:01,  1.76it/s]
 73%|███████▎  | 283/390 [00:35<01:04,  1.66it/s]
 73%|███████▎  | 284/390 [00:36<01:07,  1.58it/s]
 73%|███████▎  | 285/390 [00:37<01:08,  1.53it/s]
 73%|███████▎  | 286/390 [00:37<01:10,  1.48it/s]


Trial status: 1 PENDING | 1 RUNNING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:55:43. Total running time: 37min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                       5        3            265.087      0.664509     0.638989           2.0506                  135.08  |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 74%|███████▎  | 287/390 [00:38<01:10,  1.46it/s]
 74%|███████▍  | 288/390 [00:39<01:11,  1.43it/s]
 74%|███████▍  | 289/390 [00:39<01:11,  1.42it/s]
 74%|███████▍  | 290/390 [00:40<01:10,  1.41it/s]
 75%|███████▍  | 291/390 [00:41<01:10,  1.40it/s]
 75%|███████▍  | 292/390 [00:42<01:09,  1.40it/s]
 75%|███████▌  | 293/390 [00:42<01:09,  1.40it/s]
 75%|███████▌  | 294/390 [00:43<01:08,  1.40it/s]
 76%|███████▌  | 295/390 [00:44<01:07,  1.40it/s]
 76%|███████▌  | 296/390 [00:45<01:07,  1.40it/s]
 76%|███████▌  | 297/390 [00:45<01:05,  1.41it/s]
 76%|███████▋  | 298/390 [00:46<01:05,  1.41it/s]
 77%|███████▋  | 299/390 [00:47<01:04,  1.41it/s]
 77%|███████▋  | 300/390 [00:47<01:03,  1.42it/s]
 77%|███████▋  | 301/390 [00:48<01:03,  1.41it/s]
 77%|███████▋  | 302/390 [00:49<01:02,  1.42it/s]
 78%|███████▊  | 303/390 [00:49<01:01,  1.42it/s]
 78%|███████▊  | 304/390 [00:50<01:00,  1.42it/s]
 78%|███████▊  | 305/390 [00:51<00:59,  1.42it/s]
 78%|███████▊  | 306/390 [00:52<00:59,  1.42it/s]


[36m(_objective pid=16287)[0m {'eval_loss': 0.7334033250808716, 'eval_acc': 0.6353790613718412, 'eval_runtime': 2.0276, 'eval_samples_per_second': 136.615, 'eval_steps_per_second': 4.439, 'epoch': 4.0}




Trial status: 1 PENDING | 1 RUNNING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:56:13. Total running time: 38min 8s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                       5        3            265.087      0.664509     0.638989           2.0506                  135.08  |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29 



Trial status: 1 PENDING | 1 RUNNING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:56:43. Total running time: 38min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                       5        3            265.087      0.664509     0.638989           2.0506                  135.08  |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

2024-06-09 14:56:45,260	INFO pbt.py:878 -- 

[PopulationBasedTraining] [Exploit] Cloning trial 165d9_00002 (score = 0.671480) into trial 165d9_00001 (score = 0.635379)

2024-06-09 14:56:45,263	INFO pbt.py:905 -- 

[PopulationBasedTraining] [Explore] Perturbed the hyperparameter config of trial165d9_00001:
per_device_train_batch_size : 32 --- (shift left) --> 16
weight_decay : 0.029992474745400864 --- (* 1.2) --> 0.03599096969448103
learning_rate : 2.836995567863469e-05 --- (* 1.2) --> 3.4043946814361626e-05

[36m(_objective pid=16287)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000003)



Trial _objective_165d9_00001 finished iteration 4 at 2024-06-09 14:56:45. Total running time: 38min 40s
+-----------------------------------------------------------+
| Trial _objective_165d9_00001 result                       |
+-----------------------------------------------------------+
| checkpoint_dir_name                     checkpoint_000003 |
| time_this_iter_s                                112.55985 |
| time_total_s                                     377.6467 |
| training_iteration                                      4 |
| epoch                                                  4. |
| eval_acc                                          0.63538 |
| eval_loss                                          0.7334 |
| eval_runtime                                       2.0276 |
| eval_samples_per_second                           136.615 |
| eval_steps_per_second                               4.439 |
| objective                                         0.63538 |
+--------------------------

[36m(_objective pid=16835)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000002)
[36m(_objective pid=16835)[0m 2024-06-09 14:56:52.667000: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=16835)[0m 2024-06-09 14:56:52.667061: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=16835)[0m 2024-06-09 14:56:52.668850: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=16835)[0m Some wei


Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:57:13. Total running time: 39min 8s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 80%|████████  | 250/312 [00:11<00:03, 16.01it/s] 
 80%|████████  | 251/312 [00:12<00:04, 14.85it/s]
 83%|████████▎ | 258/312 [00:17<00:06,  8.34it/s]
 84%|████████▍ | 262/312 [00:21<00:08,  5.92it/s]
 85%|████████▍ | 265/312 [00:24<00:09,  4.88it/s]
 86%|████████▌ | 267/312 [00:25<00:10,  4.30it/s]
 86%|████████▌ | 268/312 [00:26<00:11,  4.00it/s]
 86%|████████▌ | 269/312 [00:27<00:11,  3.66it/s]
 87%|████████▋ | 270/312 [00:27<00:12,  3.30it/s]
 87%|████████▋ | 271/312 [00:29<00:15,  2.61it/s]
 87%|████████▋ | 272/312 [00:30<00:18,  2.15it/s]
 88%|████████▊ | 273/312 [00:31<00:20,  1.90it/s]
 88%|████████▊ | 274/312 [00:31<00:21,  1.80it/s]
 88%|████████▊ | 275/312 [00:32<00:21,  1.69it/s]
 88%|████████▊ | 276/312 [00:33<00:24,  1.46it/s]
 89%|████████▉ | 277/312 [00:34<00:26,  1.35it/s]
 89%|████████▉ | 278/312 [00:35<00:28,  1.19it/s]
 89%|████████▉ | 279/312 [00:36<00:27,  1.22it/s]
 90%|████████▉ | 280/312 [00:37<00:30,  1.05it/s]
 90%|█████████ | 281/312 [00:38<00:28,  1.09it/s]

Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:57:43. Total running time: 39min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 91%|█████████ | 284/312 [00:40<00:22,  1.26it/s]
 91%|█████████▏| 285/312 [00:41<00:20,  1.30it/s]
 92%|█████████▏| 286/312 [00:42<00:19,  1.32it/s]
 92%|█████████▏| 287/312 [00:43<00:18,  1.35it/s]
 92%|█████████▏| 288/312 [00:43<00:17,  1.36it/s]
 93%|█████████▎| 289/312 [00:44<00:16,  1.37it/s]
 93%|█████████▎| 290/312 [00:45<00:15,  1.38it/s]
 93%|█████████▎| 291/312 [00:45<00:15,  1.39it/s]
 94%|█████████▎| 292/312 [00:46<00:14,  1.40it/s]
 94%|█████████▍| 293/312 [00:47<00:13,  1.40it/s]
 94%|█████████▍| 294/312 [00:48<00:12,  1.40it/s]
 95%|█████████▍| 295/312 [00:48<00:12,  1.41it/s]
 95%|█████████▍| 296/312 [00:49<00:11,  1.41it/s]
 95%|█████████▌| 297/312 [00:50<00:10,  1.41it/s]
 96%|█████████▌| 298/312 [00:50<00:09,  1.41it/s]
 96%|█████████▌| 299/312 [00:51<00:09,  1.41it/s]
 96%|█████████▌| 300/312 [00:52<00:08,  1.42it/s]
 96%|█████████▋| 301/312 [00:52<00:07,  1.42it/s]
 97%|█████████▋| 302/312 [00:53<00:07,  1.42it/s]
 97%|█████████▋| 303/312 [00:54<00:06,  1.43it/s]


[36m(_objective pid=16835)[0m {'eval_loss': 0.7454956769943237, 'eval_acc': 0.6787003610108303, 'eval_runtime': 2.028, 'eval_samples_per_second': 136.585, 'eval_steps_per_second': 4.438, 'epoch': 4.0}




Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:58:13. Total running time: 40min 8s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29 



Trial status: 1 RUNNING | 1 PENDING | 3 PAUSED | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:58:43. Total running time: 40min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29




Trial _objective_165d9_00000 finished iteration 4 at 2024-06-09 14:58:50. Total running time: 40min 45s
Trial _objective_165d9_00000 saved a checkpoint for iteration 4 at: (local)/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000003


[36m(_objective pid=16835)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000003)
[36m(_objective pid=16835)[0m ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('',) kwargs: {}
100%|██████████| 312/312 [01:47<00:00,  2.89it/s]ValueError when calling 'write' on stream (<_io.TextIOWrapper name='/tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/working_dirs/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/stderr' mode='a+' encoding='UTF-8'>). args: ('\r100%|██████████| 312/312 [01:47<00:00,  2.89it/s]',) kwargs: {}
[36m(_objective pid=16835)[0m Va


Trial _objective_165d9_00001 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00001 config             |
+-------------------------------------------------+
| learning_rate                             3e-05 |
| max_steps                                    -1 |
| num_train_epochs                              4 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  16 |
| weight_decay                            0.03599 |
+-------------------------------------------------+


[36m(_objective pid=17410)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000002)
[36m(_objective pid=17410)[0m 2024-06-09 14:58:57.207633: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=17410)[0m 2024-06-09 14:58:57.207680: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=17410)[0m 2024-06-09 14:58:57.209028: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=17410)[0m Some wei


Trial status: 3 PAUSED | 1 RUNNING | 1 PENDING | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:59:13. Total running time: 41min 8s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 81%|████████▏ | 254/312 [00:14<00:04, 13.43it/s] 
 82%|████████▏ | 255/312 [00:14<00:04, 12.64it/s]
 84%|████████▍ | 263/312 [00:20<00:06,  7.75it/s]
 86%|████████▌ | 268/312 [00:23<00:07,  5.96it/s]
 87%|████████▋ | 271/312 [00:25<00:08,  5.06it/s]
 88%|████████▊ | 273/312 [00:27<00:08,  4.48it/s]
 88%|████████▊ | 275/312 [00:28<00:09,  3.91it/s]
 88%|████████▊ | 276/312 [00:29<00:09,  3.62it/s]
 89%|████████▉ | 277/312 [00:30<00:10,  3.30it/s]
 89%|████████▉ | 278/312 [00:30<00:11,  2.98it/s]
 89%|████████▉ | 279/312 [00:31<00:12,  2.65it/s]
 90%|████████▉ | 280/312 [00:32<00:13,  2.38it/s]
 90%|█████████ | 281/312 [00:33<00:14,  2.14it/s]
 90%|█████████ | 282/312 [00:33<00:15,  1.94it/s]
 91%|█████████ | 283/312 [00:34<00:16,  1.79it/s]
 91%|█████████ | 284/312 [00:35<00:16,  1.67it/s]


Trial status: 3 PAUSED | 1 RUNNING | 1 PENDING | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 14:59:43. Total running time: 41min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 91%|█████████▏| 285/312 [00:36<00:17,  1.59it/s]
 92%|█████████▏| 286/312 [00:36<00:17,  1.52it/s]
 92%|█████████▏| 287/312 [00:37<00:16,  1.48it/s]
 92%|█████████▏| 288/312 [00:38<00:16,  1.44it/s]
 93%|█████████▎| 289/312 [00:38<00:16,  1.42it/s]
 93%|█████████▎| 290/312 [00:39<00:15,  1.40it/s]
 93%|█████████▎| 291/312 [00:40<00:15,  1.39it/s]
 94%|█████████▎| 292/312 [00:41<00:14,  1.38it/s]
 94%|█████████▍| 293/312 [00:41<00:13,  1.37it/s]
 94%|█████████▍| 294/312 [00:42<00:13,  1.37it/s]
 95%|█████████▍| 295/312 [00:43<00:12,  1.37it/s]
 95%|█████████▍| 296/312 [00:44<00:11,  1.38it/s]
 95%|█████████▌| 297/312 [00:44<00:10,  1.39it/s]
 96%|█████████▌| 298/312 [00:45<00:10,  1.39it/s]
 96%|█████████▌| 299/312 [00:46<00:09,  1.39it/s]
 96%|█████████▌| 300/312 [00:46<00:08,  1.40it/s]
 96%|█████████▋| 301/312 [00:47<00:07,  1.40it/s]
 97%|█████████▋| 302/312 [00:48<00:07,  1.40it/s]
 97%|█████████▋| 303/312 [00:49<00:06,  1.40it/s]
 97%|█████████▋| 304/312 [00:49<00:05,  1.41it/s]


[36m(_objective pid=17410)[0m {'eval_loss': 0.7454956769943237, 'eval_acc': 0.6787003610108303, 'eval_runtime': 2.0681, 'eval_samples_per_second': 133.938, 'eval_steps_per_second': 4.352, 'epoch': 4.0}


[36m(_objective pid=17410)[0m 
[36m(_objective pid=17410)[0m                                                  
[36m(_objective pid=17410)[0m                                              [A100%|██████████| 312/312 [00:57<00:00,  1.51it/s]
[36m(_objective pid=17410)[0m 100%|██████████| 9/9 [00:01<00:00,  4.79it/s][A
[36m(_objective pid=17410)[0m                                              [A


Trial status: 3 PAUSED | 1 RUNNING | 1 PENDING | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 15:00:13. Total running time: 42min 8s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00001   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29 

[36m(_objective pid=17410)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000003)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.



Trial _objective_165d9_00001 finished iteration 4 at 2024-06-09 15:00:43. Total running time: 42min 37s
Trial _objective_165d9_00001 saved a checkpoint for iteration 4 at: (local)/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000003

Trial status: 4 PAUSED | 1 PENDING | 2 TERMINATED | 1 ERROR
Current time: 2024-06-09 15:00:44. Total running time: 42min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_000

[36m(_objective pid=17929)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000002)
[36m(_objective pid=17929)[0m 2024-06-09 15:00:49.801458: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=17929)[0m 2024-06-09 15:00:49.801505: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=17929)[0m 2024-06-09 15:00:49.802870: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=17929)[0m Some wei


Trial status: 3 PAUSED | 1 RUNNING | 2 TERMINATED | 1 PENDING | 1 ERROR
Current time: 2024-06-09 15:01:14. Total running time: 43min 8s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 82%|████████▏ | 256/312 [00:15<00:05, 11.05it/s]
 83%|████████▎ | 260/312 [00:18<00:06,  8.66it/s]
 84%|████████▍ | 263/312 [00:20<00:06,  7.12it/s]
 85%|████████▍ | 265/312 [00:22<00:07,  6.16it/s]
 85%|████████▌ | 266/312 [00:22<00:08,  5.64it/s]
 86%|████████▌ | 267/312 [00:23<00:08,  5.07it/s]
 86%|████████▌ | 268/312 [00:24<00:09,  4.48it/s]
 86%|████████▌ | 269/312 [00:24<00:10,  3.91it/s]
 87%|████████▋ | 270/312 [00:25<00:12,  3.37it/s]
 87%|████████▋ | 271/312 [00:26<00:14,  2.90it/s]
 87%|████████▋ | 272/312 [00:27<00:15,  2.53it/s]
 88%|████████▊ | 273/312 [00:27<00:17,  2.23it/s]
 88%|████████▊ | 274/312 [00:28<00:18,  2.00it/s]
 88%|████████▊ | 275/312 [00:29<00:20,  1.82it/s]
 88%|████████▊ | 276/312 [00:29<00:21,  1.69it/s]
 89%|████████▉ | 277/312 [00:30<00:21,  1.60it/s]
 89%|████████▉ | 278/312 [00:31<00:22,  1.53it/s]
 89%|████████▉ | 279/312 [00:32<00:22,  1.48it/s]
 90%|████████▉ | 280/312 [00:32<00:22,  1.45it/s]
 90%|█████████ | 281/312 [00:33<00:21,  1.42it/s]


Trial status: 3 PAUSED | 1 RUNNING | 2 TERMINATED | 1 PENDING | 1 ERROR
Current time: 2024-06-09 15:01:44. Total running time: 43min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29

 95%|█████████▍| 295/312 [00:43<00:12,  1.38it/s]
 95%|█████████▍| 296/312 [00:44<00:11,  1.39it/s]
 95%|█████████▌| 297/312 [00:45<00:10,  1.39it/s]
 96%|█████████▌| 298/312 [00:45<00:10,  1.39it/s]
 96%|█████████▌| 299/312 [00:46<00:09,  1.40it/s]
 96%|█████████▌| 300/312 [00:47<00:08,  1.40it/s]
 96%|█████████▋| 301/312 [00:48<00:07,  1.40it/s]
 97%|█████████▋| 302/312 [00:48<00:07,  1.40it/s]
 97%|█████████▋| 303/312 [00:49<00:06,  1.40it/s]
 97%|█████████▋| 304/312 [00:50<00:05,  1.41it/s]
 98%|█████████▊| 305/312 [00:50<00:04,  1.42it/s]
 98%|█████████▊| 306/312 [00:51<00:04,  1.42it/s]
 98%|█████████▊| 307/312 [00:52<00:03,  1.42it/s]
 99%|█████████▊| 308/312 [00:52<00:02,  1.42it/s]
 99%|█████████▉| 309/312 [00:53<00:02,  1.43it/s]
 99%|█████████▉| 310/312 [00:54<00:01,  1.43it/s]
100%|█████████▉| 311/312 [00:55<00:00,  1.43it/s]
100%|██████████| 312/312 [00:55<00:00,  1.52it/s]
[36m(_objective pid=17929)[0m 
  0%|          | 0/9 [00:00<?, ?it/s][A
[36m(_objective pid=17929

[36m(_objective pid=17929)[0m {'eval_loss': 0.7454956769943237, 'eval_acc': 0.6787003610108303, 'eval_runtime': 2.054, 'eval_samples_per_second': 134.857, 'eval_steps_per_second': 4.382, 'epoch': 4.0}


[36m(_objective pid=17929)[0m 
[36m(_objective pid=17929)[0m                                                  
[36m(_objective pid=17929)[0m                                              [A100%|██████████| 312/312 [00:57<00:00,  1.52it/s]
[36m(_objective pid=17929)[0m 100%|██████████| 9/9 [00:01<00:00,  4.86it/s][A
[36m(_objective pid=17929)[0m                                              [A


Trial status: 3 PAUSED | 1 RUNNING | 2 TERMINATED | 1 PENDING | 1 ERROR
Current time: 2024-06-09 15:02:14. Total running time: 44min 9s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                       4        3            271.59       0.69701      0.67148            2.0547                  134.815 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29 

[36m(_objective pid=17929)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000003)



Trial _objective_165d9_00004 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00004 config             |
+-------------------------------------------------+
| learning_rate                             4e-05 |
| max_steps                                    -1 |
| num_train_epochs                              3 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.01692 |
+-------------------------------------------------+


[36m(_objective pid=18424)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00004_4_num_train_epochs=3_2024-06-09_14-18-05/checkpoint_000002)
[36m(_objective pid=18424)[0m 2024-06-09 15:02:36.494876: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=18424)[0m 2024-06-09 15:02:36.494928: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=18424)[0m 2024-06-09 15:02:36.496225: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=18424)[0m Some wei


Trial status: 3 PAUSED | 2 TERMINATED | 1 RUNNING | 1 PENDING | 1 ERROR
Current time: 2024-06-09 15:02:44. Total running time: 44min 39s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00004   RUNNING                       3        3            300.095      0.653172     0.696751           2.0428                  135.596 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.2

[36m(_objective pid=18424)[0m   0%|          | 0/234 [00:00<?, ?it/s]                                         0%|          | 0/234 [00:00<?, ?it/s]  0%|          | 0/234 [00:00<?, ?it/s]
2024-06-09 15:02:52,532	ERROR tune_controller.py:1331 -- Trial task failed for trial _objective_165d9_00005
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2613, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_priva


Trial _objective_165d9_00005 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00005 config             |
+-------------------------------------------------+
| learning_rate                             2e-05 |
| max_steps                                    -1 |
| num_train_epochs                              4 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  16 |
| weight_decay                            0.03599 |
+-------------------------------------------------+

Trial _objective_165d9_00005 errored after 3 iterations at 2024-06-09 15:02:52. Total running time: 44min 47s
Error file: /tmp/ray/session_2024-06-09_14-16-45_081420_699/artifacts/2024-06-09_14-18-05/tune_transformer_pbt/driver_artifacts/_objective_165d9_00005_5_num_train_epochs=3_2024-06-09_14-18-05/error.txt

Trial _objective_165d9_00000 started with configuration:
+-------------------------------------------------+

[36m(_objective pid=18582)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00000_0_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000003)
[36m(_objective pid=18582)[0m 2024-06-09 15:02:59.670284: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=18582)[0m 2024-06-09 15:02:59.670366: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=18582)[0m 2024-06-09 15:02:59.671685: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=18582)[0m Some wei

[36m(_objective pid=18582)[0m {'train_runtime': 0.0761, 'train_samples_per_second': 130866.7, 'train_steps_per_second': 4099.439, 'train_loss': 0.0, 'epoch': 4.0}

Trial _objective_165d9_00000 completed after 4 iterations at 2024-06-09 15:03:11. Total running time: 45min 6s


[36m(_objective pid=18582)[0m   0%|          | 0/312 [00:00<?, ?it/s]                                         0%|          | 0/312 [00:00<?, ?it/s]  0%|          | 0/312 [00:00<?, ?it/s]
[36m(_objective pid=18582)[0m Exception ignored in atexit callback: <function shutdown at 0x79b005ba8a60>
[36m(_objective pid=18582)[0m Traceback (most recent call last):
[36m(_objective pid=18582)[0m   File "/usr/lib/python3.10/logging/__init__.py", line 2183, in shutdown
[36m(_objective pid=18582)[0m     h.close()
[36m(_objective pid=18582)[0m   File "/usr/local/lib/python3.10/dist-packages/absl/logging/__init__.py", line 944, in close
[36m(_objective pid=18582)[0m     self.stream.close()
[36m(_objective pid=18582)[0m AttributeError: 'Tee' object has no attribute 'close'



Trial status: 4 TERMINATED | 1 PENDING | 1 PAUSED | 2 ERROR
Current time: 2024-06-09 15:03:14. Total running time: 45min 9s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   TERMINATED                    4        4            391.03       0.745496     0.6787             2.028                   136.585 |
| _objective_165d9_00003   TERMINATED                    2        2            178.896      0.639288     0.624549           2.0475                  135.29  |
| _objec

[36m(_objective pid=18709)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00001_1_num_train_epochs=5_2024-06-09_14-18-05/checkpoint_000003)
[36m(_objective pid=18709)[0m 2024-06-09 15:03:22.952907: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=18709)[0m 2024-06-09 15:03:22.952977: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=18709)[0m 2024-06-09 15:03:22.954912: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=18709)[0m Some wei

[36m(_objective pid=18709)[0m {'train_runtime': 0.0054, 'train_samples_per_second': 1861228.24, 'train_steps_per_second': 58303.535, 'train_loss': 0.0, 'epoch': 4.0}

Trial _objective_165d9_00001 completed after 4 iterations at 2024-06-09 15:03:33. Total running time: 45min 28s


[36m(_objective pid=18709)[0m 	per_device_train_batch_size: 16 (from args) != 32 (from trainer_state.json)
[36m(_objective pid=18709)[0m   0%|          | 0/312 [00:00<?, ?it/s]                                         0%|          | 0/312 [00:00<?, ?it/s]
[36m(_objective pid=18709)[0m   0%|          | 0/312 [00:00<?, ?it/s]



Trial _objective_165d9_00002 started with configuration:
+-------------------------------------------------+
| Trial _objective_165d9_00002 config             |
+-------------------------------------------------+
| learning_rate                             3e-05 |
| max_steps                                    -1 |
| num_train_epochs                              4 |
| per_device_eval_batch_size                   32 |
| per_device_train_batch_size                  32 |
| weight_decay                            0.02999 |
+-------------------------------------------------+


[36m(_objective pid=18837)[0m Restored on 172.28.0.12 from checkpoint: Checkpoint(filesystem=local, path=/root/ray_results/tune_transformer_pbt/_objective_165d9_00002_2_num_train_epochs=4_2024-06-09_14-18-05/checkpoint_000003)
[36m(_objective pid=18837)[0m 2024-06-09 15:03:38.525680: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(_objective pid=18837)[0m 2024-06-09 15:03:38.525734: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(_objective pid=18837)[0m 2024-06-09 15:03:38.527004: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(_objective pid=18837)[0m Some wei


Trial status: 5 TERMINATED | 1 RUNNING | 2 ERROR
Current time: 2024-06-09 15:03:44. Total running time: 45min 39s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00002   RUNNING                       4        4            373.426      0.745496     0.6787             2.054                   134.857 |
| _objective_165d9_00000   TERMINATED                    4        4            391.03       0.745496     0.6787             2.028                   136.585 |
| _objective_165d9

2024-06-09 15:03:49,383	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/tune_transformer_pbt' in 0.0097s.



Trial _objective_165d9_00002 completed after 4 iterations at 2024-06-09 15:03:49. Total running time: 45min 44s

Trial status: 6 TERMINATED | 2 ERROR
Current time: 2024-06-09 15:03:49. Total running time: 45min 44s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:T4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         num_train_epochs     iter     total time (s)     eval_loss     eval_acc     eval_runtime     ...amples_per_second |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
| _objective_165d9_00000   TERMINATED                    4        4            391.03       0.745496     0.6787             2.028                   136.585 |
| _objective_165d9_00001   TERMINATED                    4        4        

TuneError: ('Trials did not complete', [_objective_165d9_00005, _objective_165d9_00007])

[36m(_objective pid=18837)[0m {'train_runtime': 0.0043, 'train_samples_per_second': 2308791.193, 'train_steps_per_second': 72323.58, 'train_loss': 0.0, 'epoch': 4.0}


[36m(_objective pid=18837)[0m   0%|          | 0/312 [00:00<?, ?it/s]                                         0%|          | 0/312 [00:00<?, ?it/s]  0%|          | 0/312 [00:00<?, ?it/s]


# pbt_transformers.utils functions

In [None]:
"""Utilities to load and cache data."""

import os
from typing import Callable, Dict
import numpy as np
from transformers import EvalPrediction
from transformers import glue_compute_metrics, glue_output_modes


def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
    """Build the metrics callback for a GLUE task.

    Adapted from transformers/examples/text-classification/run_glue.py.

    Args:
        task_name: Key looked up in ``glue_output_modes`` and forwarded to
            ``glue_compute_metrics``.

    Returns:
        A function that takes an ``EvalPrediction`` and returns whatever
        ``glue_compute_metrics`` produces for this task.

    Raises:
        KeyError: If ``task_name`` is not a key of ``glue_output_modes``.
    """
    # Resolved once here so the closure does not repeat the lookup per evaluation.
    output_mode = glue_output_modes[task_name]

    def compute_metrics_fn(p: EvalPrediction):
        """Turn raw model outputs into predictions and score them.

        Raises:
            ValueError: If the task's output mode is neither
                ``"classification"`` nor ``"regression"``.
        """
        if output_mode == "classification":
            # Index of the largest score along axis 1
            # (assumes predictions are laid out as (examples, classes) - TODO confirm).
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            # Drop size-1 dimensions from the raw predictions.
            preds = np.squeeze(p.predictions)
        else:
            # Without this branch ``preds`` would be unbound and the call below
            # would fail with an opaque UnboundLocalError.
            raise ValueError(
                "Unsupported output mode {!r} for task {!r}".format(
                    output_mode, task_name
                )
            )
        return glue_compute_metrics(task_name, preds, p.label_ids)

    return compute_metrics_fn

In [None]:
def download_data(task_name, data_dir="./data"):
    """Download and unpack the GLUE archive for ``task_name`` into ``data_dir``.

    The archive is stored as ``<data_dir>/<task_name>.zip``. If that file is
    already present, nothing is downloaded or extracted.

    Args:
        task_name: Name of the task to fetch. Only ``"rte"`` is supported.
        data_dir: Directory that receives the archive and its extracted
            contents. It is created if it does not exist.

    Raises:
        ValueError: If ``task_name`` is not a supported task.
    """
    # ``urllib.request`` must be imported explicitly: a bare ``import urllib``
    # does not load the submodule, so ``urllib.request`` may not resolve.
    import urllib.request
    import zipfile

    print("Downloading dataset.")

    if task_name == "rte":
        url = "https://dl.fbaipublicfiles.com/glue/data/RTE.zip"
    else:
        raise ValueError("Unknown task: {}".format(task_name))

    data_file = os.path.join(data_dir, "{}.zip".format(task_name))
    if not os.path.exists(data_file):
        # The target directory may not exist yet on a fresh machine.
        os.makedirs(data_dir, exist_ok=True)

        # Download to a temporary name and promote it only after a successful
        # extraction. Otherwise an interrupted download or a corrupt archive
        # would leave a file that later calls mistake for a valid cache.
        partial_file = data_file + ".part"
        try:
            urllib.request.urlretrieve(url, partial_file)
            with zipfile.ZipFile(partial_file) as zip_ref:
                zip_ref.extractall(data_dir)
            os.replace(partial_file, data_file)
        finally:
            # After a successful ``os.replace`` the partial file is gone, so
            # this only cleans up after a failure.
            if os.path.exists(partial_file):
                os.remove(partial_file)
        print("Downloaded data for task {} to {}".format(task_name, data_dir))
    else:
        print(
            "Data already exists. Using downloaded data for task {} from {}".format(
                task_name, data_dir
            )
        )