In [1]:
import torch
import numpy as np
import evaluate

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune import JupyterNotebookReporter

In [2]:
# Checking whether cuda is on

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Defining small ruBert tokenizer

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [4]:
# Loading dataset and fixing false 'train_test_split'

dataset = load_dataset("csv", data_files="data/text-target.csv")
dataset = dataset["train"]
dataset = dataset.rename_column("target", "labels")

Using custom data configuration default-8049d2a1ed22d6e9
Found cached dataset csv (C:/Users/al_po/.cache/huggingface/datasets/csv/default-8049d2a1ed22d6e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Defining preprocessing for text to get embedding lookup table and attention

def preprocess_function(examples):
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, return_tensors="pt"
    )

In [6]:
dataset = dataset.map(preprocess_function, batched=True)


Loading cached processed dataset at C:/Users/al_po/.cache/huggingface/datasets/csv/default-8049d2a1ed22d6e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-b27e36eba33aa001.arrow


In [7]:
dataset = dataset.train_test_split()


In [8]:
# Leaving only the necessary for training columns

train_dataset = dataset["train"].shuffle(seed=16)
test_dataset = dataset["test"]


In [9]:
# We will max the f1-macro 

metric = evaluate.load("f1")


In [10]:
# Function for trainer evaluation

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

In [11]:
# for hp search we need to use model_init=... instead of model=...


def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        "cointegrated/rubert-tiny2", num_labels=3
    )

In [12]:
# Configuring Trainer
# Check 'rubert_hp_search' for hyperparameter search example

training_args = TrainingArguments(
    output_dir="rubert_hp",
    evaluation_strategy="epoch",
    disable_tqdm=True,
    log_level="error",
)

trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    model_init=model_init,
    compute_metrics=compute_metrics,
)

In [13]:
# config for ray.tune

tune_config = {
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": tune.choice([2, 3, 4, 5]),
}

In [14]:
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_f1",
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
    },
)


In [15]:
reporter = JupyterNotebookReporter(
    parameter_columns={
        "weight_decay": "w_decay",
        "learning_rate": "lr",
        "num_train_epochs": "num_epochs",
    },
    metric_columns=["eval_f1", "eval_loss", "epoch", "training_iteration"],
)


0,1
Current time:,2023-04-16 02:19:09
Running for:,00:00:00.05
Memory:,11.2/31.9 GiB

Trial name,status,loc,w_decay,lr,num_epochs
_objective_ec7e7_00000,RUNNING,127.0.0.1:19812,0.238963,1.73374e-05,4


In [16]:
best_run = trainer.hyperparameter_search(
    hp_space=lambda _: tune_config,
    backend="ray",
    n_trials=1,
    scheduler=scheduler,
    progress_reporter=reporter,
    keep_checkpoints_num=1,
    verbose=1,
)

2023-04-16 02:19:06,303	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html

[2m[36m(_objective pid=19812)[0m Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
[2m[36m(_objective pid=19812)[0m - This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertFor

[2m[36m(_objective pid=19812)[0m {'loss': 0.7901, 'learning_rate': 1.6448476931262298e-05, 'epoch': 0.21}
[2m[36m(_objective pid=19812)[0m {'loss': 0.5275, 'learning_rate': 1.555956226787804e-05, 'epoch': 0.41}
[2m[36m(_objective pid=19812)[0m {'loss': 0.528, 'learning_rate': 1.4670647604493784e-05, 'epoch': 0.62}
[2m[36m(_objective pid=19812)[0m {'loss': 0.5432, 'learning_rate': 1.3781732941109525e-05, 'epoch': 0.82}
[2m[36m(_objective pid=19812)[0m {'eval_loss': 0.5704401135444641, 'eval_f1': 0.8468366765174079, 'eval_runtime': 76.8736, 'eval_samples_per_second': 42.277, 'eval_steps_per_second': 10.576, 'epoch': 1.0}
[2m[36m(_objective pid=19812)[0m {'loss': 0.5339, 'learning_rate': 1.2892818277725268e-05, 'epoch': 1.03}
[2m[36m(_objective pid=19812)[0m {'loss': 0.5206, 'learning_rate': 1.200390361434101e-05, 'epoch': 1.23}
[2m[36m(_objective pid=19812)[0m {'loss': 0.5523, 'learning_rate': 1.1114988950956753e-05, 'epoch': 1.44}
[2m[36m(_objective pid=19812)

2023-04-16 03:10:43,058	INFO tune.py:798 -- Total run time: 3094.05 seconds (3094.02 seconds for the tuning loop).


In [17]:
best_run


[2m[36m(_objective pid=19812)[0m {'eval_loss': 0.6095167398452759, 'eval_f1': 0.8554960240823112, 'eval_runtime': 79.67, 'eval_samples_per_second': 40.793, 'eval_steps_per_second': 10.205, 'epoch': 4.0}
[2m[36m(_objective pid=19812)[0m {'train_runtime': 3089.9594, 'train_samples_per_second': 12.622, 'train_steps_per_second': 3.156, 'train_loss': 0.48905094106046754, 'epoch': 4.0}


BestRun(run_id='ec7e7_00000', objective=0.8554960240823112, hyperparameters={'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'num_train_epochs': 4, 'weight_decay': 0.23896289605806983, 'learning_rate': 1.7337391594646555e-05})