# Finetune a BERT Text Classifier with LightningTrainer

This is an advanced example that demonstrates how to use LightningTrainer together with a Ray Dataset.

If you just want to quickly migrate your existing PyTorch Lightning scripts to Ray AIR, you can refer to this starter example: Train a PyTorch Lightning Image Classifier.

Source: https://docs.ray.io/en/latest/train/examples/lightning/lightning_cola_advanced.html

In [1]:
import ray
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, load_metric
import numpy as np

## Pre-process CoLA Dataset

In [2]:
# Load the CoLA task of the GLUE benchmark from the Hugging Face hub.
dataset = load_dataset("glue", "cola")
# NOTE(review): this module-level `metric` is not referenced by any other cell
# visible in this notebook; SentimentModel loads its own copy in __init__.
metric = load_metric("glue", "cola")

# Convert the Hugging Face dataset to Ray Data; the result is indexed by split
# name later on (the "train" and "validation" splits are passed to the trainer).
ray_datasets = ray.data.from_huggingface(dataset)

  metric = load_metric("glue", "cola")
2023-09-07 08:56:16,707	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 192.168.33.188:6379...
2023-09-07 08:56:16,711	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


In [3]:
from ray.data.preprocessors import BatchMapper

# Tokenizer for the same "bert-base-cased" checkpoint that SentimentModel loads.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_sentence(batch):
    """Tokenize the ``sentence`` column of a batch and drop the raw text.

    Args:
        batch: Mapping of column name to column values. It must contain a
            ``sentence`` column that exposes ``.tolist()`` and a ``label``
            column.

    Returns:
        The same ``batch`` object, updated in place: ``input_ids`` and
        ``attention_mask`` are added, ``label`` is converted with
        ``np.array``, and ``sentence`` is removed.
    """
    # Ask the tokenizer for NumPy arrays directly instead of building PyTorch
    # tensors and converting them back with ``.numpy()``.
    encoded_sent = tokenizer(
        batch["sentence"].tolist(),
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="np",
    )
    batch["input_ids"] = encoded_sent["input_ids"]
    batch["attention_mask"] = encoded_sent["attention_mask"]
    batch["label"] = np.array(batch["label"])
    # The raw text is no longer needed once it has been tokenized.
    batch.pop("sentence")
    return batch


# Wrap tokenize_sentence in a BatchMapper that is configured for "numpy"-format batches.
preprocessor = BatchMapper(tokenize_sentence, batch_format="numpy")

## Define a PyTorch Lightning Model

In [4]:
class SentimentModel(pl.LightningModule):
    """LightningModule wrapping a two-class ``bert-base-cased`` classifier.

    Training minimizes cross-entropy over the classifier logits. Validation
    collects per-batch predictions and labels and, at the end of each
    validation epoch, scores them with the GLUE CoLA metric.

    Args:
        lr: Learning rate handed to AdamW.
        eps: Epsilon handed to AdamW.
    """

    def __init__(self, lr=2e-5, eps=1e-8):
        super().__init__()
        # Optimizer settings, read back in configure_optimizers().
        self.lr, self.eps = lr, eps
        self.num_classes = 2
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "bert-base-cased", num_labels=self.num_classes
        )
        self.metric = load_metric("glue", "cola")
        # Buffers filled by validation_step() and emptied once per
        # validation epoch.
        self.predictions, self.references = [], []

    def forward(self, batch):
        """Return the classifier logits for a batch.

        ``batch`` must provide ``input_ids`` and ``attention_mask`` entries.
        """
        return self.model(
            batch["input_ids"], attention_mask=batch["attention_mask"]
        ).logits

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the cross-entropy loss."""
        targets = batch["label"]
        scores = self.forward(batch).view(-1, self.num_classes)
        loss = F.cross_entropy(scores, targets)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Store the arg-max class predictions and the labels of one batch."""
        targets = batch["label"]
        predicted = self.forward(batch).argmax(dim=1)
        self.predictions.append(predicted)
        self.references.append(targets)

    def on_validation_epoch_end(self):
        """Score the buffered predictions, log the result and reset buffers."""
        all_predictions = torch.cat(self.predictions).view(-1)
        all_references = torch.cat(self.references).view(-1)

        # The metric returns a dictionary such as
        # {"matthews_correlation": 0.53}, which is logged entry by entry.
        scores = self.metric.compute(
            predictions=all_predictions, references=all_references
        )
        self.log_dict(scores, sync_dist=True)

        for buffer in (self.predictions, self.references):
            buffer.clear()

    def configure_optimizers(self):
        """Build an AdamW optimizer over all parameters of this module."""
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=self.lr, eps=self.eps
        )
        return optimizer

## Configure your LightningTrainer

In [5]:
from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig

# Define the configs for LightningTrainer
lightning_config = (
    LightningConfigBuilder()
    # Constructor arguments for SentimentModel; lr=1e-5 overrides the class
    # default of 2e-5, eps matches the class default.
    .module(cls=SentimentModel, lr=1e-5, eps=1e-8)
    # Train for 5 epochs on GPU.
    .trainer(max_epochs=5, accelerator="gpu")
    # NOTE(review): presumably disables end-of-training-epoch checkpointing so
    # that checkpoints are taken after validation, when "matthews_correlation"
    # is available - confirm against the LightningConfigBuilder docs.
    .checkpointing(save_on_train_epoch_end=False)
    .build()
)

In [6]:
# Save AIR checkpoints according to the performance on validation set:
# keep 2 checkpoints, scored by "matthews_correlation" with order "max".
run_config = RunConfig(
    name="ptl-sent-classification",
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="matthews_correlation",
        checkpoint_score_order="max",
    ),
)

# Run training on a single worker that requests 1 GPU and 15 CPUs.
# Increase num_workers to spread the training workload across more GPUs;
# you can change this config based on your compute resources.
scaling_config = ScalingConfig(
    num_workers=1, use_gpu=True, resources_per_worker={"CPU": 15, "GPU": 1}
)

## Fine-tune the model with LightningTrainer

In [7]:
# Assemble the trainer from the configs defined above. The "train" and
# "validation" splits are passed in as the "train" and "val" datasets and are
# iterated in batches of 32.
# NOTE(review): the run log below reports that the `preprocessor` argument is
# deprecated and recommends calling `preprocessor.transform(ds)` ahead of time.
trainer = LightningTrainer(
    lightning_config=lightning_config,
    run_config=run_config,
    scaling_config=scaling_config,
    datasets={"train": ray_datasets["train"], "val": ray_datasets["validation"]},
    datasets_iter_config={"batch_size": 32},
    preprocessor=preprocessor,
)
# Launch training and keep the returned result for inspection.
result = trainer.fit()

0,1
Current time:,2023-09-07 09:09:37
Running for:,00:12:58.27
Memory:,19.3/30.9 GiB

Trial name,status,loc,iter,total time (s),train_loss,matthews_correlation,epoch
LightningTrainer_c9055_00000,RUNNING,192.168.33.188:18987,3,641.976,0.167995,0.548925,2


[2m[36m(LightningTrainer pid=18987)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(LightningTrainer pid=18987)[0m Starting distributed worker processes: ['19030 (192.168.33.188)']
[2m[36m(RayTrainWorker pid=19030)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=19030)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=19030)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=19030)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=19030)[0m TPU available: False, using: 0 TPU cores
[2m[36m(

(pid=19030) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:0…

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  4.79it/s]


[2m[36m(RayTrainWorker pid=19030)[0m   rank_zero_warn(
[2m[36m(RayTrainWorker pid=19030)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=19030)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=19030)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=19030) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:0…

Epoch 0: : 1it [00:23, 23.06s/it, v_num=0]
Epoch 0: : 2it [00:23, 11.63s/it, v_num=0]
Epoch 0: : 3it [00:23,  7.93s/it, v_num=0]
Epoch 0: : 4it [00:24,  6.19s/it, v_num=0]
Epoch 0: : 5it [00:25,  5.04s/it, v_num=0]
Epoch 0: : 6it [00:26,  4.36s/it, v_num=0]
Epoch 0: : 7it [00:26,  3.79s/it, v_num=0]
Epoch 0: : 8it [00:27,  3.44s/it, v_num=0]
Epoch 0: : 9it [00:28,  3.15s/it, v_num=0]
Epoch 0: : 10it [00:28,  2.87s/it, v_num=0]
Epoch 0: : 11it [00:29,  2.70s/it, v_num=0]
Epoch 0: : 12it [00:30,  2.51s/it, v_num=0]
Epoch 0: : 13it [00:31,  2.39s/it, v_num=0]
Epoch 0: : 14it [00:31,  2.24s/it, v_num=0]
Epoch 0: : 15it [00:31,  2.11s/it, v_num=0]
Epoch 0: : 16it [00:31,  2.00s/it, v_num=0]
Epoch 0: : 17it [00:32,  1.90s/it, v_num=0]
Epoch 0: : 18it [00:32,  1.81s/it, v_num=0]
Epoch 0: : 19it [00:32,  1.73s/it, v_num=0]
Epoch 0: : 20it [00:33,  1.66s/it, v_num=0]
Epoch 0: : 21it [00:33,  1.59s/it, v_num=0]
Epoch 0: : 22it [00:33,  1.54s/it, v_num=0]
Epoch 0: : 23it [00:34,  1.48s/it, v_num=



Epoch 0: : 47it [00:40,  1.15it/s, v_num=0]
Epoch 0: : 48it [00:41,  1.16it/s, v_num=0]
Epoch 0: : 49it [00:41,  1.18it/s, v_num=0]
Epoch 0: : 50it [00:41,  1.19it/s, v_num=0]
Epoch 0: : 51it [00:42,  1.21it/s, v_num=0]
Epoch 0: : 52it [00:42,  1.23it/s, v_num=0]
Epoch 0: : 53it [00:42,  1.24it/s, v_num=0]
Epoch 0: : 54it [00:43,  1.26it/s, v_num=0]
Epoch 0: : 55it [00:43,  1.27it/s, v_num=0]
Epoch 0: : 56it [00:43,  1.29it/s, v_num=0]
Epoch 0: : 57it [00:43,  1.30it/s, v_num=0]
Epoch 0: : 58it [00:44,  1.31it/s, v_num=0]
Epoch 0: : 59it [00:44,  1.33it/s, v_num=0]
Epoch 0: : 60it [00:44,  1.34it/s, v_num=0]
Epoch 0: : 61it [00:44,  1.36it/s, v_num=0]
Epoch 0: : 62it [00:45,  1.37it/s, v_num=0]
Epoch 0: : 63it [00:45,  1.38it/s, v_num=0]
Epoch 0: : 64it [00:45,  1.40it/s, v_num=0]
Epoch 0: : 65it [00:46,  1.41it/s, v_num=0]
Epoch 0: : 66it [00:46,  1.42it/s, v_num=0]
Epoch 0: : 67it [00:46,  1.43it/s, v_num=0]
Epoch 0: : 68it [00:47,  1.45it/s, v_num=0]
Epoch 0: : 69it [00:47,  1.46it/



Epoch 0: : 185it [01:20,  2.29it/s, v_num=0]
Epoch 0: : 186it [01:21,  2.29it/s, v_num=0]
Epoch 0: : 187it [01:21,  2.30it/s, v_num=0]
Epoch 0: : 188it [01:21,  2.30it/s, v_num=0]
Epoch 0: : 189it [01:22,  2.30it/s, v_num=0]
Epoch 0: : 190it [01:22,  2.31it/s, v_num=0]
Epoch 0: : 191it [01:22,  2.31it/s, v_num=0]
Epoch 0: : 192it [01:22,  2.32it/s, v_num=0]
Epoch 0: : 193it [01:23,  2.32it/s, v_num=0]
Epoch 0: : 194it [01:23,  2.32it/s, v_num=0]
Epoch 0: : 195it [01:23,  2.33it/s, v_num=0]
Epoch 0: : 196it [01:24,  2.33it/s, v_num=0]
Epoch 0: : 197it [01:24,  2.34it/s, v_num=0]
Epoch 0: : 198it [01:24,  2.34it/s, v_num=0]
Epoch 0: : 199it [01:24,  2.34it/s, v_num=0]
Epoch 0: : 200it [01:25,  2.35it/s, v_num=0]
Epoch 0: : 201it [01:25,  2.35it/s, v_num=0]
Epoch 0: : 202it [01:25,  2.35it/s, v_num=0]
Epoch 0: : 203it [01:26,  2.36it/s, v_num=0]
Epoch 0: : 204it [01:26,  2.36it/s, v_num=0]
Epoch 0: : 205it [01:26,  2.37it/s, v_num=0]
Epoch 0: : 206it [01:26,  2.37it/s, v_num=0]
Epoch 0: :




Epoch 0: : 289it [01:51,  2.60it/s, v_num=0]
Epoch 0: : 290it [01:51,  2.60it/s, v_num=0]
Epoch 0: : 291it [01:51,  2.61it/s, v_num=0]
Epoch 0: : 292it [01:51,  2.61it/s, v_num=0]
Epoch 0: : 293it [01:52,  2.61it/s, v_num=0]
Epoch 0: : 294it [01:52,  2.61it/s, v_num=0]
Epoch 0: : 295it [01:52,  2.62it/s, v_num=0]
Epoch 0: : 296it [01:53,  2.62it/s, v_num=0]
Epoch 0: : 297it [01:53,  2.62it/s, v_num=0]
Epoch 0: : 298it [01:53,  2.62it/s, v_num=0]
Epoch 0: : 299it [01:53,  2.62it/s, v_num=0]
Epoch 0: : 300it [01:54,  2.63it/s, v_num=0]
Epoch 0: : 301it [01:54,  2.63it/s, v_num=0]
Epoch 0: : 302it [01:54,  2.63it/s, v_num=0]
Epoch 0: : 303it [01:55,  2.63it/s, v_num=0]
Epoch 0: : 304it [01:55,  2.63it/s, v_num=0]
Epoch 0: : 305it [01:55,  2.64it/s, v_num=0]
Epoch 0: : 306it [01:55,  2.64it/s, v_num=0]
Epoch 0: : 307it [01:56,  2.64it/s, v_num=0]
Epoch 0: : 308it [01:56,  2.64it/s, v_num=0]
Epoch 0: : 309it [01:56,  2.64it/s, v_num=0]
Epoch 0: : 310it [01:57,  2.65it/s, v_num=0]
Epoch 0: 



Epoch 0: : 444it [02:36,  2.84it/s, v_num=0]
Epoch 0: : 445it [02:36,  2.84it/s, v_num=0]
Epoch 0: : 446it [02:37,  2.84it/s, v_num=0]
Epoch 0: : 447it [02:37,  2.84it/s, v_num=0]
Epoch 0: : 448it [02:37,  2.84it/s, v_num=0]
Epoch 0: : 449it [02:37,  2.84it/s, v_num=0]
Epoch 0: : 450it [02:38,  2.84it/s, v_num=0]
Epoch 0: : 451it [02:38,  2.84it/s, v_num=0]
Epoch 0: : 452it [02:38,  2.85it/s, v_num=0]
Epoch 0: : 453it [02:39,  2.85it/s, v_num=0]
Epoch 0: : 454it [02:39,  2.85it/s, v_num=0]
Epoch 0: : 455it [02:39,  2.85it/s, v_num=0]
Epoch 0: : 456it [02:40,  2.85it/s, v_num=0]
Epoch 0: : 457it [02:40,  2.85it/s, v_num=0]
Epoch 0: : 458it [02:40,  2.85it/s, v_num=0]
Epoch 0: : 459it [02:40,  2.85it/s, v_num=0]
Epoch 0: : 460it [02:41,  2.85it/s, v_num=0]
Epoch 0: : 461it [02:41,  2.85it/s, v_num=0]
Epoch 0: : 462it [02:41,  2.86it/s, v_num=0]
Epoch 0: : 463it [02:42,  2.86it/s, v_num=0]
Epoch 0: : 464it [02:42,  2.86it/s, v_num=0]
Epoch 0: : 465it [02:42,  2.86it/s, v_num=0]
Epoch 0: :



Epoch 0: : 502it [02:53,  2.89it/s, v_num=0]
Epoch 0: : 503it [02:53,  2.89it/s, v_num=0]
Epoch 0: : 504it [02:54,  2.89it/s, v_num=0]
Epoch 0: : 505it [02:54,  2.89it/s, v_num=0]
Epoch 0: : 506it [02:54,  2.89it/s, v_num=0]
Epoch 0: : 507it [02:55,  2.89it/s, v_num=0]
Epoch 0: : 508it [02:55,  2.90it/s, v_num=0]
Epoch 0: : 509it [02:55,  2.90it/s, v_num=0]
Epoch 0: : 510it [02:56,  2.90it/s, v_num=0]
Epoch 0: : 511it [02:56,  2.90it/s, v_num=0]
Epoch 0: : 512it [02:56,  2.90it/s, v_num=0]
Epoch 0: : 513it [02:56,  2.90it/s, v_num=0]
Epoch 0: : 514it [02:57,  2.90it/s, v_num=0]
Epoch 0: : 515it [02:57,  2.90it/s, v_num=0]
Epoch 0: : 516it [02:57,  2.90it/s, v_num=0]
Epoch 0: : 517it [02:58,  2.90it/s, v_num=0]
Epoch 0: : 518it [02:58,  2.90it/s, v_num=0]
Epoch 0: : 519it [02:58,  2.91it/s, v_num=0]
Epoch 0: : 520it [02:58,  2.91it/s, v_num=0]
Epoch 0: : 521it [02:59,  2.91it/s, v_num=0]
Epoch 0: : 522it [02:59,  2.91it/s, v_num=0]
Epoch 0: : 523it [02:59,  2.91it/s, v_num=0]
Epoch 0: :

[2m[36m(RayTrainWorker pid=19030)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=19030)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=19030)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=19030) Running 0:   0%|          | 0/1 [00:08<?, ?it/s]

IOStream.flush timed out


(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [01:4…

[2m[36m(RayTrainWorker pid=19030)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=19030)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=19030)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`



[2m[36m(RayTrainWorker pid=19030)[0m 
Validation: 0it [00:00, ?it/s][A0)[0m 
Validation: 0it [00:00, ?it/s][A0)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 128.65it/s][A
Validation DataLoader 0: : 2it [00:00, 133.24it/s][A
Validation DataLoader 0: : 3it [00:00, 141.27it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 4it [00:00, 15.79it/s] [A
Validation DataLoader 0: : 5it [00:00, 19.14it/s][A
Validation DataLoader 0: : 6it [00:00, 22.44it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 7it [00:00, 10.96it/s][A
Validation DataLoader 0: : 8it [00:00, 12.38it/s][A
Validation DataLoader 0: : 9it [00:00, 13.79it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 10it [00:01,  7.90it/s][A
Validation DataLoader 0: : 11it [00:01,  8.64it/s][A
Validation DataLoader 0: : 12it [00:01,  9.38it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader



(pid=19030) Running 0:   0%|          | 0/1 [00:04<?, ?it/s]



(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:0…

Epoch 1: : 1it [00:10, 10.05s/it, v_num=0]
Epoch 1: : 2it [00:10,  5.17s/it, v_num=0]
Epoch 1: : 3it [00:10,  3.54s/it, v_num=0]
Epoch 1: : 4it [00:10,  2.73s/it, v_num=0]
Epoch 1: : 5it [00:11,  2.24s/it, v_num=0]
Epoch 1: : 6it [00:11,  1.92s/it, v_num=0]
Epoch 1: : 7it [00:11,  1.69s/it, v_num=0]
Epoch 1: : 8it [00:12,  1.51s/it, v_num=0]
Epoch 1: : 9it [00:12,  1.38s/it, v_num=0]
Epoch 1: : 10it [00:12,  1.27s/it, v_num=0]
Epoch 1: : 11it [00:12,  1.18s/it, v_num=0]
Epoch 1: : 12it [00:13,  1.10s/it, v_num=0]
Epoch 1: : 13it [00:13,  1.04s/it, v_num=0]
Epoch 1: : 14it [00:13,  1.01it/s, v_num=0]
Epoch 1: : 15it [00:14,  1.06it/s, v_num=0]
Epoch 1: : 16it [00:14,  1.11it/s, v_num=0]
Epoch 1: : 17it [00:14,  1.16it/s, v_num=0]
Epoch 1: : 18it [00:14,  1.20it/s, v_num=0]
Epoch 1: : 19it [00:15,  1.24it/s, v_num=0]
Epoch 1: : 20it [00:15,  1.28it/s, v_num=0]
Epoch 1: : 21it [00:15,  1.32it/s, v_num=0]
Epoch 1: : 22it [00:16,  1.36it/s, v_num=0]
Epoch 1: : 23it [00:16,  1.40it/s, v_num=

[2m[36m(RayTrainWorker pid=19030)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=19030)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=19030)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Epoch 1: : 524it [02:43,  3.20it/s, v_num=0]
Epoch 1: : 525it [02:44,  3.20it/s, v_num=0]
Epoch 1: : 526it [02:44,  3.20it/s, v_num=0]
Epoch 1: : 527it [02:44,  3.20it/s, v_num=0]
Epoch 1: : 528it [02:44,  3.20it/s, v_num=0]
Epoch 1: : 529it [02:45,  3.20it/s, v_num=0]
Epoch 1: : 530it [02:45,  3.20it/s, v_num=0]
Epoch 1: : 531it [02:45,  3.20it/s, v_num=0]
Epoch 1: : 532it [02:46,  3.20it/s, v_num=0]
Epoch 1: : 533it [02:46,  3.20it/s, v_num=0]
Epoch 1: : 534it [02:46,  3.20it/s, v_num=0]




(pid=19030) Running 0:   0%|          | 0/1 [00:07<?, ?it/s]

(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:0…



[2m[36m(RayTrainWorker pid=19030)[0m 
Validation: 0it [00:00, ?it/s][A0)[0m 
Validation: 0it [00:00, ?it/s][A0)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 135.82it/s][A
Validation DataLoader 0: : 2it [00:00, 139.20it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 3it [00:00, 145.25it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 4it [00:00, 15.45it/s] [A
Validation DataLoader 0: : 5it [00:00, 18.76it/s][A
Validation DataLoader 0: : 6it [00:00, 21.99it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 7it [00:00, 11.10it/s][A
Validation DataLoader 0: : 8it [00:00, 12.52it/s][A
Validation DataLoader 0: : 9it [00:00, 13.94it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 10it [00:01,  7.96it/s][A
Validation DataLoader 0: : 11it [00:01,  8.71it/s][A
Validation DataLoader 0: : 12it [00:01,  9.45it/s][A
[2m[36m(RayTrainWor

[2m[36m(RayTrainWorker pid=19030)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=19030)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=19030)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=19030) Running 0:   0%|          | 0/1 [00:03<?, ?it/s]



(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:0…

Epoch 2: : 1it [00:11, 11.31s/it, v_num=0]
Epoch 2: : 2it [00:12,  6.14s/it, v_num=0]
Epoch 2: : 3it [00:12,  4.21s/it, v_num=0]
Epoch 2: : 4it [00:13,  3.42s/it, v_num=0]
Epoch 2: : 5it [00:14,  2.91s/it, v_num=0]
Epoch 2: : 6it [00:14,  2.49s/it, v_num=0]
Epoch 2: : 7it [00:15,  2.28s/it, v_num=0]
Epoch 2: : 8it [00:16,  2.04s/it, v_num=0]
Epoch 2: : 9it [00:17,  1.92s/it, v_num=0]
Epoch 2: : 10it [00:17,  1.76s/it, v_num=0]
Epoch 2: : 11it [00:17,  1.63s/it, v_num=0]
Epoch 2: : 12it [00:18,  1.57s/it, v_num=0]
Epoch 2: : 13it [00:19,  1.47s/it, v_num=0]
Epoch 2: : 14it [00:19,  1.39s/it, v_num=0]
Epoch 2: : 15it [00:19,  1.32s/it, v_num=0]
Epoch 2: : 16it [00:20,  1.25s/it, v_num=0]
Epoch 2: : 17it [00:20,  1.20s/it, v_num=0]
Epoch 2: : 18it [00:20,  1.15s/it, v_num=0]
Epoch 2: : 19it [00:20,  1.10s/it, v_num=0]
Epoch 2: : 20it [00:21,  1.06s/it, v_num=0]
Epoch 2: : 21it [00:21,  1.03s/it, v_num=0]
Epoch 2: : 22it [00:21,  1.01it/s, v_num=0]
Epoch 2: : 23it [00:22,  1.04it/s, v_num=

[2m[36m(RayTrainWorker pid=19030)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=19030)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=19030)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=19030) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:0…

[2m[36m(RayTrainWorker pid=19030)[0m 
Validation: 0it [00:00, ?it/s][A0)[0m 
Validation: 0it [00:00, ?it/s][A0)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 76.52it/s][A
Validation DataLoader 0: : 2it [00:00, 93.35it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 3it [00:00, 105.77it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 4it [00:00, 16.63it/s] [A
Validation DataLoader 0: : 5it [00:00, 20.14it/s][A
Validation DataLoader 0: : 6it [00:00, 23.56it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 7it [00:00, 13.40it/s][A
Validation DataLoader 0: : 8it [00:00, 15.10it/s][A
Validation DataLoader 0: : 9it [00:00, 16.79it/s][A
[2m[36m(RayTrainWorker pid=19030)[0m 
Validation DataLoader 0: : 10it [00:01,  8.69it/s][A
Validation DataLoader 0: : 11it [00:01,  9.50it/s][A
Validation DataLoader 0: : 12it [00:01, 10.31it/s][A
[2m[36m(RayTrainWorke

[2m[36m(RayTrainWorker pid=19030)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=19030)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=19030)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=19030) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=19030) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:0…

Epoch 3: : 1it [00:19, 19.61s/it, v_num=0]
Epoch 3: : 2it [00:19,  9.95s/it, v_num=0]
Epoch 3: : 3it [00:20,  6.73s/it, v_num=0]
Epoch 3: : 4it [00:20,  5.12s/it, v_num=0]
Epoch 3: : 5it [00:20,  4.16s/it, v_num=0]
Epoch 3: : 6it [00:21,  3.52s/it, v_num=0]
Epoch 3: : 7it [00:21,  3.05s/it, v_num=0]
Epoch 3: : 8it [00:21,  2.71s/it, v_num=0]
Epoch 3: : 9it [00:21,  2.44s/it, v_num=0]
Epoch 3: : 10it [00:22,  2.22s/it, v_num=0]
Epoch 3: : 11it [00:22,  2.05s/it, v_num=0]
Epoch 3: : 12it [00:22,  1.90s/it, v_num=0]
Epoch 3: : 13it [00:23,  1.78s/it, v_num=0]
Epoch 3: : 14it [00:23,  1.67s/it, v_num=0]
Epoch 3: : 15it [00:23,  1.58s/it, v_num=0]
Epoch 3: : 16it [00:23,  1.50s/it, v_num=0]
Epoch 3: : 17it [00:24,  1.43s/it, v_num=0]
Epoch 3: : 18it [00:24,  1.37s/it, v_num=0]
Epoch 3: : 19it [00:24,  1.31s/it, v_num=0]
Epoch 3: : 20it [00:25,  1.26s/it, v_num=0]
Epoch 3: : 21it [00:25,  1.21s/it, v_num=0]
Epoch 3: : 22it [00:25,  1.17s/it, v_num=0]
Epoch 3: : 23it [00:26,  1.13s/it, v_num=



Epoch 3: : 400it [02:14,  2.98it/s, v_num=0]
Epoch 3: : 401it [02:14,  2.98it/s, v_num=0]
Epoch 3: : 402it [02:14,  2.98it/s, v_num=0]
Epoch 3: : 403it [02:15,  2.98it/s, v_num=0]
Epoch 3: : 404it [02:15,  2.99it/s, v_num=0]
Epoch 3: : 405it [02:15,  2.99it/s, v_num=0]
Epoch 3: : 406it [02:15,  2.99it/s, v_num=0]
Epoch 3: : 407it [02:16,  2.99it/s, v_num=0]
Epoch 3: : 408it [02:16,  2.99it/s, v_num=0]
Epoch 3: : 409it [02:16,  2.99it/s, v_num=0]
Epoch 3: : 410it [02:17,  2.99it/s, v_num=0]
Epoch 3: : 411it [02:17,  2.99it/s, v_num=0]
Epoch 3: : 412it [02:17,  2.99it/s, v_num=0]
Epoch 3: : 413it [02:17,  3.00it/s, v_num=0]
Epoch 3: : 414it [02:18,  3.00it/s, v_num=0]
Epoch 3: : 415it [02:18,  3.00it/s, v_num=0]
Epoch 3: : 416it [02:18,  3.00it/s, v_num=0]
Epoch 3: : 417it [02:19,  3.00it/s, v_num=0]
Epoch 3: : 418it [02:19,  3.00it/s, v_num=0]
Epoch 3: : 419it [02:19,  3.00it/s, v_num=0]
Epoch 3: : 420it [02:19,  3.00it/s, v_num=0]
Epoch 3: : 421it [02:20,  3.00it/s, v_num=0]
Epoch 3: :

2023-09-07 09:09:47,979	INFO tune.py:1148 -- Total run time: 788.30 seconds (778.27 seconds for the tuning loop).
Resume training with: Trainer.restore(path="/home/mpp/ray_results/ptl-sent-classification", ...)


Epoch 3: : 435it [02:24,  3.02it/s, v_num=0]


[2m[36m(LightningTrainer pid=18987)[0m Traceback (most recent call last):
[2m[36m(LightningTrainer pid=18987)[0m   File "python/ray/_raylet.pyx", line 1364, in ray._raylet.execute_task.function_executor
[2m[36m(LightningTrainer pid=18987)[0m   File "/home/mpp/.conda/envs/ray-torch/lib/python3.9/site-packages/ray/_private/function_manager.py", line 726, in actor_method_executor
[2m[36m(LightningTrainer pid=18987)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(LightningTrainer pid=18987)[0m   File "/home/mpp/.conda/envs/ray-torch/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 464, in _resume_span
[2m[36m(LightningTrainer pid=18987)[0m     return method(self, *_args, **_kwargs)
[2m[36m(LightningTrainer pid=18987)[0m   File "/home/mpp/.conda/envs/ray-torch/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 372, in train
[2m[36m(LightningTrainer pid=18987)[0m     result = self.step()
[2m[36m(LightningTrainer pid=18

In [8]:
# Display the result of the run (metrics, path and checkpoint).
result

Result(
  metrics={'_report_on': 'validation_end', 'train_loss': 0.08543746918439865, 'matthews_correlation': 0.5930452712523209, 'epoch': 4, 'step': 2675, 'should_checkpoint': True, 'done': True, 'trial_id': '9082c_00000', 'experiment_tag': '0'},
  path='/home/dino/ray_results/ptl-sent-classification/LightningTrainer_9082c_00000_0_2023-09-06_23-51-03',
  checkpoint=LightningCheckpoint(local_path=/home/dino/ray_results/ptl-sent-classification/LightningTrainer_9082c_00000_0_2023-09-06_23-51-03/checkpoint_000004)
)