# Fine-tune a 🤗 Transformers model

Source: https://docs.ray.io/en/latest/ray-air/examples/huggingface_text_classification.html

## Setup Ray

In [1]:
from pprint import pprint
import ray

# Tear down any Ray instance left over from an earlier run of this cell so
# that the `ray.init()` below always starts fresh.
if ray.is_initialized():
    ray.shutdown()

# Start Ray with default settings (the recorded run started a local instance).
ray.init()

# Show the resources Ray reports as available (CPU, GPU, memory, ...).
pprint(ray.available_resources())

2023-09-06 10:26:11,310	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


{'CPU': 20.0,
 'GPU': 1.0,
 'accelerator_type:G': 1.0,
 'memory': 5915251508.0,
 'node:192.168.33.188': 1.0,
 'node:__internal_head__': 1.0,
 'object_store_memory': 2957625753.0}


By default, we will run the training with one GPU worker.


In [2]:
# Whether training runs on GPU. Read by `ScalingConfig(use_gpu=...)` and by the
# `no_cuda` flag of `TrainingArguments` further down.
use_gpu = True  # set this to False to run on CPUs
# Number of training workers, passed to `ScalingConfig(num_workers=...)`.
num_workers = 1  # set this to number of GPUs/CPUs you want to use

## Fine-tuning a model on a text classification task

[Original source](https://github.com/huggingface/notebooks/blob/6ca682955173cc9d36ffa431ddda505a048cbe80/examples/text_classification.ipynb)

In [3]:
# GLUE task identifiers; these are the same names used as keys of
# `task_to_keys` further down.
GLUE_TASKS = [
    "cola",
    "mnli",
    "mnli-mm",
    "mrpc",
    "qnli",
    "qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]

In [4]:
# GLUE task to fine-tune on; it selects the dataset configuration, the text
# columns (`task_to_keys`), the number of labels and the validation split.
task = "cola"
# Checkpoint name used to load both the tokenizer and the model.
model_checkpoint = "distilbert-base-uncased"
# Per-device batch size, used for both training and evaluation.
batch_size = 16

### Loading the dataset

In [5]:
from datasets import load_dataset

# "mnli-mm" is loaded through the "mnli" configuration; the mismatched
# validation split is picked later via `validation_key`.
actual_task = "mnli" if task == "mnli-mm" else task
# NOTE: this variable shares its name with the `datasets` package. The later
# `from datasets import load_metric` still resolves to the package, but
# attribute access such as `datasets.load_dataset` would hit this object.
datasets = load_dataset("glue", actual_task)

In [6]:
from datasets import load_metric

def load_metric_fn():
    """Load and return the GLUE metric for ``actual_task``.

    Called from ``trainer_init_per_worker`` so the metric object is created
    where the Trainer is built.
    """
    glue_metric = load_metric("glue", actual_task)
    return glue_metric

In [7]:
# Display the (rows, columns) shape of every split.
datasets.shape

{'train': (8551, 3), 'validation': (1043, 3), 'test': (1063, 3)}

In [8]:
# Look at the first training example to see which columns a row contains.
datasets['train'][0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

### Preprocessing the data with Ray AIR 

In [9]:
from transformers import AutoTokenizer
# Tokenizer matching `model_checkpoint`, used by `preprocess_function` below.
# `use_fast=True` requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [10]:
# Maps each GLUE task to the names of its (first, second) text columns.
# The second entry is None for tasks that have a single text column;
# `preprocess_function` then tokenizes only the first column.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [11]:
import ray.data 

# Convert the Hugging Face splits into Ray Datasets. In the recorded run the
# result is a dict keyed by split name ('train', 'validation', 'test').
ray_datasets = ray.data.from_huggingface(datasets)
# Display the converted datasets (block count, row count and schema per split).
ray_datasets



{'train': MaterializedDataset(
    num_blocks=1,
    num_rows=8551,
    schema={sentence: string, label: int64, idx: int32}
 ),
 'validation': MaterializedDataset(
    num_blocks=1,
    num_rows=1043,
    schema={sentence: string, label: int64, idx: int32}
 ),
 'test': MaterializedDataset(
    num_blocks=1,
    num_rows=1063,
    schema={sentence: string, label: int64, idx: int32}
 )}

In [12]:
import pandas as pd
from ray.data.preprocessors import BatchMapper

def preprocess_function(examples: pd.DataFrame):
    """Tokenize one pandas batch for the configured GLUE ``task``.

    A batch with exactly one column is treated as inference input and is
    returned unchanged. Otherwise the text column(s) named by
    ``task_to_keys[task]`` are tokenized with truncation, and the tokenizer
    outputs are returned together with the original columns.

    Args:
        examples: batch of rows as a pandas DataFrame.

    Returns:
        The input DataFrame itself (single-column case) or a new DataFrame
        holding the original columns plus the tokenizer outputs.
    """
    # Single-column batches are inference inputs: nothing to tokenize.
    if len(examples.columns) == 1:
        return examples

    columns = examples.to_dict("list")
    first_key, second_key = task_to_keys[task]

    # Tasks with one text column have no second key; pass only what exists.
    texts = [columns[first_key]]
    if second_key is not None:
        texts.append(columns[second_key])
    encoded = tokenizer(*texts, truncation=True)

    # Original columns first; tokenizer outputs win if a name collides.
    return pd.DataFrame.from_dict({**columns, **encoded})

# Wrap the tokenization step as a Ray preprocessor that receives pandas batches.
# It is handed to `TransformersTrainer(preprocessor=...)` below.
batch_encoder = BatchMapper(preprocess_function, batch_format="pandas")

### Fine-tuning the model with Ray AIR 

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

# Size of the classification head: 3 for the MNLI variants, 1 for stsb,
# 2 for every other task.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

# Name of the task's headline metric.
if task == "stsb":
    metric_name = "pearson"
elif task == "cola":
    metric_name = "matthews_correlation"
else:
    metric_name = "accuracy"

# Last path component of the checkpoint identifier.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Name of the split used for evaluation; MNLI has matched/mismatched variants.
if task == "mnli-mm":
    validation_key = "validation_mismatched"
elif task == "mnli":
    validation_key = "validation_matched"
else:
    validation_key = "validation"

# Run name: used as the TrainingArguments output directory and as the MLflow
# experiment name.
name = f"{model_name}-finetuned-{task}"

def trainer_init_per_worker(train_dataset, eval_dataset=None, **config):
    """Build the 🤗 Transformers ``Trainer`` used by ``TransformersTrainer``.

    Args:
        train_dataset: dataset handed to ``Trainer`` as ``train_dataset``.
        eval_dataset: optional dataset handed to ``Trainer`` as ``eval_dataset``.
        **config: optional overrides. Recognised keys are ``learning_rate``
            (default 2e-5), ``epochs`` (default 2) and ``weight_decay``
            (default 0.01).

    Returns:
        A ``Trainer`` configured with a freshly loaded model and tokenizer.
    """
    print(f"Is CUDA available: {torch.cuda.is_available()}")

    glue_metric = load_metric_fn()
    worker_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    # Evaluate, checkpoint and log once per epoch.
    training_args = TrainingArguments(
        output_dir=name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=config.get("learning_rate", 2e-5),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=config.get("epochs", 2),
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        disable_tqdm=True,  # declutter the output a little
        no_cuda=not use_gpu,  # you need to explicitly set no_cuda if you want CPUs
    )

    def compute_metrics(eval_pred):
        """Turn raw model outputs into predictions and score them with the GLUE metric."""
        raw_outputs, labels = eval_pred
        # stsb has a single output column, which is used directly; every other
        # task takes the index of the highest-scoring class.
        predictions = raw_outputs[:, 0] if task == "stsb" else np.argmax(raw_outputs, axis=1)
        return glue_metric.compute(predictions=predictions, references=labels)

    hf_trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=worker_tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Starting training")
    return hf_trainer

2023-09-06 10:26:17.754960: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-06 10:26:17.781166: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
from ray.train.huggingface import TransformersTrainer
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
from ray.air.integrations.mlflow import MLflowLoggerCallback

# Ray trainer that builds the Hugging Face Trainer through
# `trainer_init_per_worker`.
trainer = TransformersTrainer(
    trainer_init_per_worker=trainer_init_per_worker,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets={
        "train": ray_datasets["train"],
        # Split chosen by `validation_key` (differs only for the MNLI variants).
        "evaluation": ray_datasets[validation_key],
    },
    run_config=RunConfig(
        # Log the run to an MLflow experiment named after the run.
        callbacks=[MLflowLoggerCallback(experiment_name=name)],
        # Keep a single checkpoint, scored by lowest `eval_loss`.
        checkpoint_config=CheckpointConfig(
            num_to_keep=1,
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
        ),
    ),
    # NOTE(review): the recorded logs warn that the `preprocessor` argument is
    # deprecated in favour of calling `preprocessor.transform(ds)` ahead of time.
    preprocessor=batch_encoder,
)



In [15]:
# Run the fine-tuning job; `result` carries the final metrics and a checkpoint.
result = trainer.fit()

0,1
Current time:,2023-09-06 10:28:01
Running for:,00:01:42.64
Memory:,23.6/30.9 GiB

Trial name,status,loc,iter,total time (s),loss,learning_rate,epoch
TransformersTrainer_24dc3_00000,TERMINATED,192.168.33.188:39990,2,97.6608,0.3884,0,2


Traceback (most recent call last):
  File "/home/mpp/.conda/envs/ray/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 304, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/home/mpp/.conda/envs/ray/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 397, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/home/mpp/.conda/envs/ray/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1306, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/home/mpp/.conda/envs/ray/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1299, in _read_helper
    result = read_yaml(root, file_name)
  File "/home/mpp/.conda/envs/ray/lib/python3.9/site-packages/mlflow/utils/file_utils.py", line 282, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist.")
mlflow.exceptions.Missi

(pid=40033) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=40033) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=40033)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=40033)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=40033)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(RayTrainWorker pid=40033)[0m 2023-09-06 10:26:23.536698: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(RayTrainWorker pid=40033)[0m 2023-09-06 10:26:23.5

(pid=40033) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=40033) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=40033)[0m Is CUDA available: True


[2m[36m(RayTrainWorker pid=40033)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
[2m[36m(RayTrainWorker pid=40033)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2m[36m(RayTrainWorker pid=40033)[0m Starting training


[2m[36m(RayTrainWorker pid=40033)[0m You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[2m[36m(RayTrainWorker pid=40033)[0m {'loss': 0.5464, 'learning_rate': 1e-05, 'epoch': 1.0}
[2m[36m(RayTrainWorker pid=40033)[0m {'eval_loss': 0.5178409218788147, 'eval_matthews_correlation': 0.40338161890982716, 'eval_runtime': 0.871, 'eval_samples_per_second': 1197.518, 'eval_steps_per_second': 75.778, 'epoch': 1.0}
[2m[36m(RayTrainWorker pid=40033)[0m {'loss': 0.3884, 'learning_rate': 0.0, 'epoch': 2.0}
[2m[36m(RayTrainWorker pid=40033)[0m {'eval_loss': 0.5500921607017517, 'eval_matthews_correlation': 0.45115517656589194, 'eval_runtime': 0.7425, 'eval_samples_per_second': 1404.739, 'eval_steps_per_second': 88.891, 'epoch': 2.0}
[2m[36m(RayTrainWorker pid=40033)[0m {'train_runtime': 89.372, 'train_samples_per_second': 191.357, 'train_steps_per_second': 11.972, 'train_loss': 0.46742234274605726, 'epoch': 2.0}


2023-09-06 10:28:01,552	INFO tune.py:1148 -- Total run time: 102.67 seconds (102.64 seconds for the tuning loop).


In [16]:
# Display the metrics, output path and checkpoint of the training run.
result

Result(
  metrics={'loss': 0.3884, 'learning_rate': 0.0, 'epoch': 2.0, 'step': 1070, 'eval_loss': 0.5500921607017517, 'eval_matthews_correlation': 0.45115517656589194, 'eval_runtime': 0.7425, 'eval_samples_per_second': 1404.739, 'eval_steps_per_second': 88.891, 'train_runtime': 89.372, 'train_samples_per_second': 191.357, 'train_steps_per_second': 11.972, 'train_loss': 0.46742234274605726, 'should_checkpoint': True, 'done': True, 'trial_id': '24dc3_00000', 'experiment_tag': '0'},
  path='/home/mpp/ray_results/TransformersTrainer_2023-09-06_10-26-18/TransformersTrainer_24dc3_00000_0_2023-09-06_10-26-18',
  checkpoint=TransformersCheckpoint(local_path=/home/mpp/ray_results/TransformersTrainer_2023-09-06_10-26-18/TransformersTrainer_24dc3_00000_0_2023-09-06_10-26-18/checkpoint_000001)
)

### Tune hyperparameters with Ray AIR 

In [17]:
from ray import tune
from ray.tune import Tuner
from ray.tune.schedulers.async_hyperband import ASHAScheduler # to avoid the long wait, we use ASHA instead of Hyperband

# Epoch budget per trial; also the upper bound (`max_t`) given to ASHA.
tune_epochs = 4
tuner = Tuner(
    trainer,
    param_space={
        # These entries reach `trainer_init_per_worker` as `**config`
        # (the recorded logs show the learning rates and 4 epochs taking effect).
        "trainer_init_config": {
            # One trial per learning rate.
            "learning_rate": tune.grid_search([2e-5, 2e-4, 2e-3, 2e-2]),
            "epochs": tune_epochs,
        }
    },
    # NOTE(review): in the recorded run the trial selected by lowest `eval_loss`
    # reports eval_matthews_correlation == 0.0, while the lr=2e-5 trial reaches
    # ~0.52. Consider optimising f"eval_{metric_name}" with mode="max" instead;
    # `metric_name` is defined above but currently unused.
    tune_config=tune.TuneConfig(
        metric="eval_loss",
        mode="min",
        num_samples=1,
        scheduler=ASHAScheduler(
            max_t=tune_epochs,
        )
    ),
    # Per the recorded log message, this RunConfig is the one that is used when
    # both the Tuner and the trainer are given one.
    run_config=RunConfig(
        checkpoint_config=CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="eval_loss", checkpoint_score_order="min")
    ),
)

2023-09-06 10:31:53,052	INFO tuner_internal.py:490 -- A `RunConfig` was passed to both the `Tuner` and the `TransformersTrainer`. The run config passed to the `Tuner` is the one that will be used.


In [18]:
# Run all trials of the learning-rate grid search and collect their results.
tune_results = tuner.fit()

0,1
Current time:,2023-09-06 10:37:17
Running for:,00:05:05.69
Memory:,21.5/30.9 GiB

Trial name,status,loc,trainer_init_config/ learning_rate,iter,total time (s),loss,learning_rate,epoch
TransformersTrainer_f73ae_00000,TERMINATED,192.168.33.188:40811,2e-05,4,161.179,0.1952,0.0,4
TransformersTrainer_f73ae_00001,TERMINATED,192.168.33.188:41182,0.0002,1,42.259,0.6276,0.00015,1
TransformersTrainer_f73ae_00002,TERMINATED,192.168.33.188:41368,0.002,1,43.7642,0.645,0.0015,1
TransformersTrainer_f73ae_00003,TERMINATED,192.168.33.188:41811,0.02,1,41.6672,1.0629,0.015,1


[2m[36m(pid=40811)[0m 2023-09-06 10:32:13.563877: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=40811)[0m 2023-09-06 10:32:13.589844: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[2m[36m(pid=40811)[0m To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(TransformersTrainer pid=40811)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(TransformersTrainer pid=40811)[0m Starting di

(pid=40852) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=40852) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=40852)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=40852)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=40852)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=40852) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=40852) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=40852)[0m Is CUDA available: True


[2m[36m(RayTrainWorker pid=40852)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=40852)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=40852)[0m 2023-09-06 10:32:16.567503: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(RayTrainWorker pid=40852)[0m 2023-09-06 10:32:16.592535: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[2m[

[2m[36m(RayTrainWorker pid=40852)[0m Starting training


[2m[36m(RayTrainWorker pid=40852)[0m You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[2m[36m(RayTrainWorker pid=40852)[0m {'loss': 0.543, 'learning_rate': 1.5000000000000002e-05, 'epoch': 1.0}
[2m[36m(RayTrainWorker pid=40852)[0m {'eval_loss': 0.5259996056556702, 'eval_matthews_correlation': 0.36986609141954047, 'eval_runtime': 0.8687, 'eval_samples_per_second': 1200.62, 'eval_steps_per_second': 75.974, 'epoch': 1.0}
[2m[36m(RayTrainWorker pid=40852)[0m {'loss': 0.3714, 'learning_rate': 1e-05, 'epoch': 2.0}
[2m[36m(RayTrainWorker pid=40852)[0m {'eval_loss': 0.5449074506759644, 'eval_matthews_correlation': 0.49016253095993895, 'eval_runtime': 0.7507, 'eval_samples_per_second': 1389.394, 'eval_steps_per_second': 87.919, 'epoch': 2.0}
[2m[36m(RayTrainWorker pid=40852)[0m {'loss': 0.254, 'learning_rate': 5e-06, 'epoch': 3.0}
[2m[36m(RayTrainWorker pid=40852)[0m {'eval_loss': 0.6437172293663025, 'eval_matthews_correlation': 0.5448598482839426, 'eval_runtime': 0.8837, 'eval_samples_per_second': 1180.284, 'eval_steps_per_second': 74.687, 'epoch': 3.0}
[2m[

[2m[36m(pid=41182)[0m 2023-09-06 10:34:58.944375: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=41182)[0m 2023-09-06 10:34:58.979472: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[2m[36m(pid=41182)[0m To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(TransformersTrainer pid=41182)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(TransformersTrainer pid=41182)[0m Starting di

(pid=41235) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=41235) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=41235)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=41235)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=41235)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=41235) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=41235) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=41235)[0m Is CUDA available: True


[2m[36m(RayTrainWorker pid=41235)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=41235)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2m[36m(RayTrainWorker pid=41235)[0m Starting training


[2m[36m(RayTrainWorker pid=41235)[0m You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[2m[36m(RayTrainWorker pid=41235)[0m {'loss': 0.6276, 'learning_rate': 0.00015000000000000001, 'epoch': 1.0}
[2m[36m(RayTrainWorker pid=41235)[0m {'eval_loss': 0.6183143854141235, 'eval_matthews_correlation': 0.0, 'eval_runtime': 0.7995, 'eval_samples_per_second': 1304.487, 'eval_steps_per_second': 82.547, 'epoch': 1.0}


[2m[36m(pid=41368)[0m 2023-09-06 10:35:45.990693: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=41368)[0m 2023-09-06 10:35:46.025602: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[2m[36m(pid=41368)[0m To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(TransformersTrainer pid=41368)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(TransformersTrainer pid=41368)[0m Starting di

(pid=41408) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=41408) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=41408)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=41408)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=41408)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=41408) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=41408) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=41408)[0m Is CUDA available: True


[2m[36m(RayTrainWorker pid=41408)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
[2m[36m(RayTrainWorker pid=41408)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2m[36m(RayTrainWorker pid=41408)[0m Starting training


[2m[36m(RayTrainWorker pid=41408)[0m You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[2m[36m(RayTrainWorker pid=41408)[0m {'loss': 0.645, 'learning_rate': 0.0015, 'epoch': 1.0}
[2m[36m(RayTrainWorker pid=41408)[0m {'eval_loss': 0.6195238828659058, 'eval_matthews_correlation': 0.0, 'eval_runtime': 0.7714, 'eval_samples_per_second': 1352.131, 'eval_steps_per_second': 85.562, 'epoch': 1.0}


[2m[36m(pid=41811)[0m 2023-09-06 10:36:34.134151: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=41811)[0m 2023-09-06 10:36:34.174607: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[2m[36m(pid=41811)[0m To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(TransformersTrainer pid=41811)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(TransformersTrainer pid=41811)[0m Starting di

(pid=41865) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=41865) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=41865)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=41865)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=41865)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=41865) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=41865) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]



[2m[36m(RayTrainWorker pid=41865)[0m Is CUDA available: True


[2m[36m(RayTrainWorker pid=41865)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
[2m[36m(RayTrainWorker pid=41865)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2m[36m(RayTrainWorker pid=41865)[0m Starting training


[2m[36m(RayTrainWorker pid=41865)[0m You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[2m[36m(RayTrainWorker pid=41865)[0m {'loss': 1.0629, 'learning_rate': 0.015, 'epoch': 1.0}
[2m[36m(RayTrainWorker pid=41865)[0m {'eval_loss': 0.6182685494422913, 'eval_matthews_correlation': 0.0, 'eval_runtime': 0.7531, 'eval_samples_per_second': 1384.939, 'eval_steps_per_second': 87.638, 'epoch': 1.0}


2023-09-06 10:37:17,524	INFO tune.py:1148 -- Total run time: 305.70 seconds (305.69 seconds for the tuning loop).


In [19]:
# One row per trial, ordered from lowest to highest `eval_loss`.
tune_results.get_dataframe().sort_values("eval_loss")

Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_matthews_correlation,eval_runtime,eval_samples_per_second,eval_steps_per_second,timestamp,...,date,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/trainer_init_config/epochs,config/trainer_init_config/learning_rate,logdir
3,1.0629,0.015,1.0,535,0.618269,0.0,0.7531,1384.939,87.638,1693971437,...,2023-09-06_10-37-17,41.667228,41811,fedora,192.168.33.188,41.667228,1,4,0.02,/home/mpp/ray_results/TransformersTrainer_2023...
1,0.6276,0.00015,1.0,535,0.618314,0.0,0.7995,1304.487,82.547,1693971342,...,2023-09-06_10-35-42,42.259003,41182,fedora,192.168.33.188,42.259003,1,4,0.0002,/home/mpp/ray_results/TransformersTrainer_2023...
2,0.645,0.0015,1.0,535,0.619524,0.0,0.7714,1352.131,85.562,1693971391,...,2023-09-06_10-36-31,43.764205,41368,fedora,192.168.33.188,43.764205,1,4,0.002,/home/mpp/ray_results/TransformersTrainer_2023...
0,0.1952,0.0,4.0,2140,0.741759,0.518192,0.7605,1371.498,86.787,1693971295,...,2023-09-06_10-34-56,161.178861,40811,fedora,192.168.33.188,161.178861,4,4,2e-05,/home/mpp/ray_results/TransformersTrainer_2023...


In [20]:
# No metric/mode is passed here; presumably the `metric`/`mode` set in
# `TuneConfig` (eval_loss, min) are used -- TODO confirm.
best_result = tune_results.get_best_result()

In [21]:
# Display the metrics, output path and checkpoint of the selected trial.
best_result

Result(
  metrics={'loss': 1.0629, 'learning_rate': 0.015, 'epoch': 1.0, 'step': 535, 'eval_loss': 0.6182685494422913, 'eval_matthews_correlation': 0.0, 'eval_runtime': 0.7531, 'eval_samples_per_second': 1384.939, 'eval_steps_per_second': 87.638, 'should_checkpoint': True, 'done': True, 'trial_id': 'f73ae_00003', 'experiment_tag': '3_learning_rate=0.0200'},
  path='/home/mpp/ray_results/TransformersTrainer_2023-09-06_10-31-53/TransformersTrainer_f73ae_00003_3_learning_rate=0.0200_2023-09-06_10-32-11',
  checkpoint=TransformersCheckpoint(local_path=/home/mpp/ray_results/TransformersTrainer_2023-09-06_10-31-53/TransformersTrainer_f73ae_00003_3_learning_rate=0.0200_2023-09-06_10-32-11/checkpoint_000000)
)

Now, load the fine-tuned model locally from the saved training checkpoint:

In [22]:
from ray.train.huggingface import TransformersCheckpoint

# NOTE: this uses the checkpoint of the first training run (`result`), not the
# tuned `best_result`.
checkpoint = TransformersCheckpoint.from_checkpoint(result.checkpoint)
# NOTE(review): despite the name `hf_trainer`, this holds whatever `get_model`
# returns (presumably the fine-tuned model, not a Trainer) -- confirm.
hf_trainer = checkpoint.get_model(model=AutoModelForSequenceClassification)

In [23]:
# ------ Shutdown Ray ------
# Stop the Ray instance started by `ray.init()` at the top of the notebook so
# its worker processes and resources are released. Evaluating the bare `ray`
# module only displays its repr and leaves Ray running.
ray.shutdown()

<module 'ray' from '/home/mpp/.conda/envs/ray/lib/python3.9/site-packages/ray/__init__.py'>