# Finetune a BERT Text Classifier with LightningTrainer

This is an advanced example for LightningTrainer, which demonstrates how to use LightningTrainer with Dataset.

If you just want to quickly convert your existing PyTorch Lightning scripts into Ray AIR, you can refer to this starter example: Train a Pytorch Lightning Image Classifier.

Source: https://docs.ray.io/en/latest/train/examples/lightning/lightning_cola_advanced.html

In [1]:
import ray
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, load_metric
import numpy as np

## Pre-process CoLA Dataset

In [2]:
dataset = load_dataset("glue", "cola")
metric = load_metric("glue", "cola")

ray_datasets = ray.data.from_huggingface(dataset)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

  metric = load_metric("glue", "cola")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

2023-09-06 23:45:55,467	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


In [3]:
from ray.data.preprocessors import BatchMapper

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_sentence(batch):
    encoded_sent = tokenizer(
        batch["sentence"].tolist(),
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    batch["input_ids"] = encoded_sent["input_ids"].numpy()
    batch["attention_mask"] = encoded_sent["attention_mask"].numpy()
    batch["label"] = np.array(batch["label"])
    batch.pop("sentence")
    return batch


preprocessor = BatchMapper(tokenize_sentence, batch_format="numpy")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

## Define a PyTorch Lightning Model

In [4]:
class SentimentModel(pl.LightningModule):
    def __init__(self, lr=2e-5, eps=1e-8):
        super().__init__()
        self.lr = lr
        self.eps = eps # epsilon
        self.num_classes = 2
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "bert-base-cased", num_labels=self.num_classes
        )
        self.metric = load_metric("glue", "cola")
        self.predictions = []
        self.references = []

    def forward(self, batch):
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        outputs = self.model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        return logits

    def training_step(self, batch, batch_idx):
        labels = batch["label"]
        logits = self.forward(batch)
        loss = F.cross_entropy(logits.view(-1, self.num_classes), labels)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        labels = batch["label"]
        logits = self.forward(batch)
        preds = torch.argmax(logits, dim=1)
        self.predictions.append(preds)
        self.references.append(labels)

    def on_validation_epoch_end(self):
        predictions = torch.concat(self.predictions).view(-1)
        references = torch.concat(self.references).view(-1)
        matthews_correlation = self.metric.compute(
            predictions=predictions, references=references
        )

        # self.metric.compute() returns a dictionary:
        # e.g. {"matthews_correlation": 0.53}
        self.log_dict(matthews_correlation, sync_dist=True)
        self.predictions.clear()
        self.references.clear()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=self.eps)

## Configure your LightningTrainer

In [5]:
from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig

# Define the configs for LightningTrainer
lightning_config = (
    LightningConfigBuilder()
    .module(cls=SentimentModel, lr=1e-5, eps=1e-8)
    .trainer(max_epochs=5, accelerator="gpu")
    .checkpointing(save_on_train_epoch_end=False)
    .build()
)

In [6]:
# Save AIR checkpoints according to the performance on validation set
run_config = RunConfig(
    name="ptl-sent-classification",
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="matthews_correlation",
        checkpoint_score_order="max",
    ),
)

# Scale the DDP training workload across 4 GPUs
# You can change this config based on your compute resources.
scaling_config = ScalingConfig(
    num_workers=1, use_gpu=True, resources_per_worker={"CPU": 10, "GPU": 1}
)

## Fine-tune the model with LightningTrainer

In [7]:
trainer = LightningTrainer(
    lightning_config=lightning_config,
    run_config=run_config,
    scaling_config=scaling_config,
    datasets={"train": ray_datasets["train"], "val": ray_datasets["validation"]},
    datasets_iter_config={"batch_size": 16},
    preprocessor=preprocessor,
)
result = trainer.fit()

0,1
Current time:,2023-09-07 00:00:27
Running for:,00:09:24.56
Memory:,18.1/31.2 GiB

Trial name,status,loc,iter,total time (s),train_loss,matthews_correlation,epoch
LightningTrainer_9082c_00000,TERMINATED,192.168.1.147:768021,5,558.713,0.0854375,0.593045,4


[2m[36m(LightningTrainer pid=768021)[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.
[2m[36m(LightningTrainer pid=768021)[0m Starting distributed worker processes: ['768112 (192.168.1.147)']
[2m[36m(RayTrainWorker pid=768112)[0m Setting up process group for: env:// [rank=0, world_size=1]
Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]
Downloading model.safetensors:   2%|▏         | 10.5M/436M [00:01<00:48, 8.81MB/s]
Downloading model.safetensors:   5%|▍         | 21.0M/436M [00:02<00:46, 8.97MB/s]
Downloading model.safetensors:   7%|▋         | 31.5M/436M [00:03<00:47, 8.47MB/s]
Downloading model.safetensors:  10%|▉         | 41.9M/436M [00:04<00:45, 8.57MB/s]
Downloading model.safetensors:  12%|█▏        | 52.4M/436M [00:06<00:44, 8.63MB/s]
Downloading model.safetensors:  14%|█▍        | 6

(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.68it/s]
Epoch 0: : 0it [00:00, ?it/s]                                              


[2m[36m(RayTrainWorker pid=768112)[0m   rank_zero_warn(
[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 0: : 1it [00:00,  1.61it/s, v_num=0]
Epoch 0: : 2it [00:00,  2.67it/s, v_num=0]
Epoch 0: : 3it [00:00,  3.27it/s, v_num=0]
Epoch 0: : 4it [00:01,  3.68it/s, v_num=0]
Epoch 0: : 5it [00:01,  3.98it/s, v_num=0]
Epoch 0: : 6it [00:01,  4.21it/s, v_num=0]
Epoch 0: : 7it [00:01,  4.39it/s, v_num=0]
Epoch 0: : 8it [00:01,  4.54it/s, v_num=0]
Epoch 0: : 9it [00:01,  4.66it/s, v_num=0]
Epoch 0: : 10it [00:02,  4.76it/s, v_num=0]
Epoch 0: : 11it [00:02,  4.84it/s, v_num=0]
Epoch 0: : 12it [00:02,  4.92it/s, v_num=0]
Epoch 0: : 13it [00:02,  4.98it/s, v_num=0]
Epoch 0: : 14it [00:02,  5.03it/s, v_num=0]
Epoch 0: : 15it [00:02,  5.09it/s, v_num=0]
Epoch 0: : 16it [00:03,  5.13it/s, v_num=0]
Epoch 0: : 17it [00:03,  5.18it/s, v_num=0]
Epoch 0: : 18it [00:03,  5.21it/s, v_num=0]
Epoch 0: : 19it [00:03,  5.24it/s, v_num=0]
Epoch 0: : 20it [00:03,  5.27it/s, v_num=0]
Epoch 0: : 21it [00:03,  5.30it/s, v_num=0]
Epoch 0: : 22it [00:04,  5.32it/s, v_num=0]
Epoch 0: : 23it [00:04,  5.35it/s, v_num=

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

[2m[36m(RayTrainWorker pid=768112)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 89.44it/s][A
Validation DataLoader 0: : 2it [00:00, 94.02it/s][A
Validation DataLoader 0: : 3it [00:00, 100.99it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 4it [00:00, 27.10it/s] [A
Validation DataLoader 0: : 5it [00:00, 32.02it/s][A
Validation DataLoader 0: : 6it [00:00, 36.65it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 7it [00:00, 24.38it/s][A
Validation DataLoader 0: : 8it [00:00, 26.97it/s][A
Validation DataLoader 0: : 9it [00:00, 29.53it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 10it [00:00, 23.48it/s][A
Validation DataLoader 0: : 11it [00:00, 25.30it/s][A
Validation DataLoader 0: : 12it [00:00, 27.11it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataL

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

Epoch 1: : 1it [00:00,  1.50it/s, v_num=0]
Epoch 1: : 2it [00:00,  2.40it/s, v_num=0]
Epoch 1: : 3it [00:00,  3.00it/s, v_num=0]
Epoch 1: : 4it [00:01,  3.44it/s, v_num=0]
Epoch 1: : 5it [00:01,  3.75it/s, v_num=0]
Epoch 1: : 6it [00:01,  4.00it/s, v_num=0]
Epoch 1: : 7it [00:01,  4.19it/s, v_num=0]
Epoch 1: : 8it [00:01,  4.35it/s, v_num=0]
Epoch 1: : 9it [00:02,  4.49it/s, v_num=0]
Epoch 1: : 10it [00:02,  4.60it/s, v_num=0]
Epoch 1: : 11it [00:02,  4.70it/s, v_num=0]
Epoch 1: : 12it [00:02,  4.79it/s, v_num=0]
Epoch 1: : 13it [00:02,  4.85it/s, v_num=0]
Epoch 1: : 14it [00:02,  4.92it/s, v_num=0]
Epoch 1: : 15it [00:03,  4.98it/s, v_num=0]
Epoch 1: : 16it [00:03,  5.03it/s, v_num=0]
Epoch 1: : 17it [00:03,  5.07it/s, v_num=0]
Epoch 1: : 18it [00:03,  5.12it/s, v_num=0]
Epoch 1: : 19it [00:03,  5.16it/s, v_num=0]
Epoch 1: : 20it [00:03,  5.19it/s, v_num=0]
Epoch 1: : 21it [00:04,  5.22it/s, v_num=0]
Epoch 1: : 22it [00:04,  5.25it/s, v_num=0]
Epoch 1: : 23it [00:04,  5.28it/s, v_num=

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

[2m[36m(RayTrainWorker pid=768112)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 89.78it/s][A
Validation DataLoader 0: : 2it [00:00, 94.04it/s][A
Validation DataLoader 0: : 3it [00:00, 99.89it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 4it [00:00, 26.48it/s][A
Validation DataLoader 0: : 5it [00:00, 31.18it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 6it [00:00, 35.54it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 7it [00:00, 24.27it/s][A
Validation DataLoader 0: : 8it [00:00, 26.88it/s][A
Validation DataLoader 0: : 9it [00:00, 29.39it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 10it [00:00, 23.48it/s][A
Validation DataLoader 0: : 11it [00:00, 25.31it/s][A
Validation DataLoader 0: : 12it [00:00, 27.12it/s][A
[2m[36m(RayTrain

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

Epoch 2: : 1it [00:00,  1.76it/s, v_num=0]
Epoch 2: : 2it [00:00,  2.72it/s, v_num=0]
Epoch 2: : 3it [00:00,  3.33it/s, v_num=0]
Epoch 2: : 4it [00:01,  3.75it/s, v_num=0]
Epoch 2: : 5it [00:01,  4.05it/s, v_num=0]
Epoch 2: : 6it [00:01,  4.29it/s, v_num=0]
Epoch 2: : 7it [00:01,  4.47it/s, v_num=0]
Epoch 2: : 8it [00:01,  4.62it/s, v_num=0]
Epoch 2: : 9it [00:01,  4.74it/s, v_num=0]
Epoch 2: : 10it [00:02,  4.84it/s, v_num=0]
Epoch 2: : 11it [00:02,  4.93it/s, v_num=0]
Epoch 2: : 12it [00:02,  5.01it/s, v_num=0]
Epoch 2: : 13it [00:02,  5.07it/s, v_num=0]
Epoch 2: : 14it [00:02,  5.13it/s, v_num=0]
Epoch 2: : 15it [00:02,  5.18it/s, v_num=0]
Epoch 2: : 16it [00:03,  5.22it/s, v_num=0]
Epoch 2: : 17it [00:03,  5.26it/s, v_num=0]
Epoch 2: : 18it [00:03,  5.30it/s, v_num=0]
Epoch 2: : 19it [00:03,  5.33it/s, v_num=0]
Epoch 2: : 20it [00:03,  5.36it/s, v_num=0]
Epoch 2: : 21it [00:03,  5.38it/s, v_num=0]
Epoch 2: : 22it [00:04,  5.41it/s, v_num=0]
Epoch 2: : 23it [00:04,  5.43it/s, v_num=

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) - RandomizeBlockOrder 1:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=768112)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 102.47it/s][A
Validation DataLoader 0: : 2it [00:00, 106.34it/s][A
Validation DataLoader 0: : 3it [00:00, 111.63it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 4it [00:00, 26.87it/s] [A
Validation DataLoader 0: : 5it [00:00, 31.63it/s][A
Validation DataLoader 0: : 6it [00:00, 36.11it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 7it [00:00, 24.31it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 8it [00:00, 26.90it/s][A
Validation DataLoader 0: : 9it [00:00, 29.41it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 10it [00:00, 23.36it/s][A
Validation DataLoader 0: : 11it [00:00, 25.14it/s][A
Validation DataLoader 0: : 12it [00:00, 26.92it/s][A
[2m[36m(RayT

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

Epoch 3: : 1it [00:00,  1.53it/s, v_num=0]
Epoch 3: : 2it [00:00,  2.43it/s, v_num=0]
Epoch 3: : 3it [00:00,  3.04it/s, v_num=0]
Epoch 3: : 4it [00:01,  3.46it/s, v_num=0]
Epoch 3: : 5it [00:01,  3.78it/s, v_num=0]
Epoch 3: : 6it [00:01,  4.03it/s, v_num=0]
Epoch 3: : 7it [00:01,  4.22it/s, v_num=0]
Epoch 3: : 8it [00:01,  4.39it/s, v_num=0]
Epoch 3: : 9it [00:01,  4.52it/s, v_num=0]
Epoch 3: : 10it [00:02,  4.63it/s, v_num=0]
Epoch 3: : 11it [00:02,  4.73it/s, v_num=0]
Epoch 3: : 12it [00:02,  4.81it/s, v_num=0]
Epoch 3: : 13it [00:02,  4.89it/s, v_num=0]
Epoch 3: : 14it [00:02,  4.95it/s, v_num=0]
Epoch 3: : 15it [00:02,  5.01it/s, v_num=0]
Epoch 3: : 16it [00:03,  5.06it/s, v_num=0]
Epoch 3: : 17it [00:03,  5.10it/s, v_num=0]
Epoch 3: : 18it [00:03,  5.15it/s, v_num=0]
Epoch 3: : 19it [00:03,  5.19it/s, v_num=0]
Epoch 3: : 20it [00:03,  5.22it/s, v_num=0]
Epoch 3: : 21it [00:04,  5.25it/s, v_num=0]
Epoch 3: : 22it [00:04,  5.28it/s, v_num=0]
Epoch 3: : 23it [00:04,  5.31it/s, v_num=

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

[2m[36m(RayTrainWorker pid=768112)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 89.28it/s][A
Validation DataLoader 0: : 2it [00:00, 95.04it/s][A
Validation DataLoader 0: : 3it [00:00, 103.83it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 4it [00:00, 26.94it/s] [A
Validation DataLoader 0: : 5it [00:00, 31.86it/s][A
Validation DataLoader 0: : 6it [00:00, 36.46it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 7it [00:00, 24.43it/s][A
Validation DataLoader 0: : 8it [00:00, 27.04it/s][A
Validation DataLoader 0: : 9it [00:00, 29.64it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 10it [00:00, 23.19it/s][A
Validation DataLoader 0: : 11it [00:00, 24.20it/s][A
Validation DataLoader 0: : 12it [00:00, 25.36it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataL

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

Epoch 4: : 1it [00:00,  1.48it/s, v_num=0]
Epoch 4: : 2it [00:00,  2.36it/s, v_num=0]
Epoch 4: : 3it [00:01,  2.95it/s, v_num=0]
Epoch 4: : 4it [00:01,  3.38it/s, v_num=0]
Epoch 4: : 5it [00:01,  3.68it/s, v_num=0]
Epoch 4: : 6it [00:01,  3.93it/s, v_num=0]
Epoch 4: : 7it [00:01,  4.13it/s, v_num=0]
Epoch 4: : 8it [00:01,  4.30it/s, v_num=0]
Epoch 4: : 9it [00:02,  4.42it/s, v_num=0]
Epoch 4: : 10it [00:02,  4.53it/s, v_num=0]
Epoch 4: : 11it [00:02,  4.62it/s, v_num=0]
Epoch 4: : 12it [00:02,  4.71it/s, v_num=0]
Epoch 4: : 13it [00:02,  4.79it/s, v_num=0]
Epoch 4: : 14it [00:02,  4.85it/s, v_num=0]
Epoch 4: : 15it [00:03,  4.90it/s, v_num=0]
Epoch 4: : 16it [00:03,  4.96it/s, v_num=0]
Epoch 4: : 17it [00:03,  5.01it/s, v_num=0]
Epoch 4: : 18it [00:03,  5.05it/s, v_num=0]
Epoch 4: : 20it [00:04,  4.92it/s, v_num=0]
Epoch 4: : 21it [00:04,  4.96it/s, v_num=0]
Epoch 4: : 22it [00:04,  5.00it/s, v_num=0]
Epoch 4: : 23it [00:04,  5.04it/s, v_num=0]
Epoch 4: : 24it [00:04,  5.06it/s, v_num=

[2m[36m(RayTrainWorker pid=768112)[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_numpy)] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=768112)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=768112)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=768112) Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(pid=768112) - RandomizeBlockOrder: 0 active, 0 queued, 0.0 MiB objects, 0 output 1:   0%|          | 0/1 [00:…

[2m[36m(RayTrainWorker pid=768112)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation: 0it [00:00, ?it/s][A12)[0m 
Validation DataLoader 0: : 0it [00:00, ?it/s][A
Validation DataLoader 0: : 1it [00:00, 99.27it/s][A
Validation DataLoader 0: : 2it [00:00, 99.76it/s][A
Validation DataLoader 0: : 3it [00:00, 101.74it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 4it [00:00, 26.38it/s] [A
Validation DataLoader 0: : 5it [00:00, 31.09it/s][A
Validation DataLoader 0: : 6it [00:00, 35.43it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 7it [00:00, 24.04it/s][A
Validation DataLoader 0: : 8it [00:00, 26.61it/s][A
Validation DataLoader 0: : 9it [00:00, 29.13it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataLoader 0: : 10it [00:00, 23.21it/s][A
Validation DataLoader 0: : 11it [00:00, 25.02it/s][A
Validation DataLoader 0: : 12it [00:00, 26.82it/s][A
[2m[36m(RayTrainWorker pid=768112)[0m 
Validation DataL

[2m[36m(RayTrainWorker pid=768112)[0m `Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: : 535it [01:41,  5.29it/s, v_num=0]


2023-09-07 00:00:27,703	INFO tune.py:1148 -- Total run time: 564.58 seconds (564.56 seconds for the tuning loop).


In [8]:
result

Result(
  metrics={'_report_on': 'validation_end', 'train_loss': 0.08543746918439865, 'matthews_correlation': 0.5930452712523209, 'epoch': 4, 'step': 2675, 'should_checkpoint': True, 'done': True, 'trial_id': '9082c_00000', 'experiment_tag': '0'},
  path='/home/dino/ray_results/ptl-sent-classification/LightningTrainer_9082c_00000_0_2023-09-06_23-51-03',
  checkpoint=LightningCheckpoint(local_path=/home/dino/ray_results/ptl-sent-classification/LightningTrainer_9082c_00000_0_2023-09-06_23-51-03/checkpoint_000004)
)