In [None]:
!pip install datasets transformers evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [None]:
import os
from datasets import load_dataset
from evaluate import load as load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

In [None]:
MODEL_NAME      = "roberta-large"
OUTPUT_DIR      = "./roberta-superglue"
NUM_EPOCHS      = 3
LEARNING_RATE   = 2e-5
TRAIN_BATCH     = 16
EVAL_BATCH      = 16
EVAL_STRATEGY   = "epoch"
LOGGING_STEPS   = 100
MAX_LENGTH      = 128

In [None]:
def preprocess_function(examples, text_keys):
    return tokenizer(
        examples[text_keys[0]],
        examples[text_keys[1]],
        truncation=True,
        max_length=MAX_LENGTH
    )

# Metrics for binary and multiclass
def compute_metrics_binary(p):
    preds = p.predictions.argmax(-1)
    acc  = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    f1   = metric_f1.compute(predictions=preds, references=p.label_ids, average="binary")["f1"]
    return {"accuracy": acc, "f1": f1}

def compute_metrics_multiclass(p):
    preds = p.predictions.argmax(-1)
    acc   = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
    return {"accuracy": acc}

In [None]:
tokenizer     = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric_acc    = load_metric("accuracy")
metric_f1     = load_metric("f1")

In [None]:
# Load BoolQ dataset
raw_boolq = load_dataset("super_glue", "boolq")

# Tokenize
remove_cols_boolq = [c for c in raw_boolq["train"].column_names if c != "label"]
tokenized_boolq = raw_boolq.map(
    lambda ex: preprocess_function(ex, ("question", "passage")),
    batched=True,
    remove_columns=remove_cols_boolq
)

# Load model for BoolQ
model_boolq = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

# Training arguments for BoolQ
args_boolq = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "boolq"),
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Trainer for BoolQ
trainer_boolq = Trainer(
    model=model_boolq,
    args=args_boolq,
    train_dataset=tokenized_boolq["train"],
    eval_dataset=tokenized_boolq["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_binary,
)

# Train & Save BoolQ model
trainer_boolq.train()
trainer_boolq.save_model(os.path.join(OUTPUT_DIR, "boolq"))

# Evaluate BoolQ
metrics_boolq = trainer_boolq.evaluate(tokenized_boolq["validation"])
print("==> BoolQ metrics:", metrics_boolq)


README.md:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

super_glue.py:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/4.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_boolq = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamagrabhartixic20[0m ([33msamagrabhartixic20-iiit-hyderabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6691,0.663722,0.621713,0.766736
2,0.6702,0.663917,0.621713,0.766736
3,0.6632,0.663231,0.621713,0.766736


==> BoolQ metrics: {'eval_loss': 0.66372150182724, 'eval_accuracy': 0.6217125382262997, 'eval_f1': 0.7667358099189138, 'eval_runtime': 75.784, 'eval_samples_per_second': 43.149, 'eval_steps_per_second': 2.705, 'epoch': 3.0}


In [None]:
# Load CB dataset
raw_cb = load_dataset("super_glue", "cb")

# Tokenize
remove_cols_cb = [c for c in raw_cb["train"].column_names if c != "label"]
tokenized_cb = raw_cb.map(
    lambda ex: preprocess_function(ex, ("premise", "hypothesis")),
    batched=True,
    remove_columns=remove_cols_cb
)

# Load model for CB
model_cb = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3
)

# Training arguments for CB
args_cb = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "cb"),
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Trainer for CB
trainer_cb = Trainer(
    model=model_cb,
    args=args_cb,
    train_dataset=tokenized_cb["train"],
    eval_dataset=tokenized_cb["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_multiclass,
)

# Train & Save CB model
trainer_cb.train()
trainer_cb.save_model(os.path.join(OUTPUT_DIR, "cb"))

# Evaluate CB
metrics_cb = trainer_cb.evaluate(tokenized_cb["validation"])
print("==> CB metrics:", metrics_cb)

Downloading data:   0%|          | 0.00/75.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/56 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_cb = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.83331,0.678571
2,No log,0.839615,0.678571
3,No log,0.817638,0.660714


==> CB metrics: {'eval_loss': 0.8333104848861694, 'eval_accuracy': 0.6785714285714286, 'eval_runtime': 1.2428, 'eval_samples_per_second': 45.061, 'eval_steps_per_second': 3.219, 'epoch': 3.0}


In [None]:
raw_rte = load_dataset("super_glue", "rte")
remove_cols_rte = [c for c in raw_rte["train"].column_names if c != "label"]
tokenized_rte = raw_rte.map(
    lambda ex: preprocess_function(ex, ("premise", "hypothesis")),
    batched=True,
    remove_columns=remove_cols_rte
)
model_rte = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
args_rte = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "rte"),
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
trainer_rte = Trainer(
    model=model_rte,
    args=args_rte,
    train_dataset=tokenized_rte["train"],
    eval_dataset=tokenized_rte["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_binary,
)
trainer_rte.train()
trainer_rte.save_model(os.path.join(OUTPUT_DIR, "rte"))
metrics_rte = trainer_rte.evaluate(tokenized_rte["validation"])
print("==> RTE metrics:", metrics_rte)

README.md:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

super_glue.py:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_rte = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamagrabhartixic20[0m ([33msamagrabhartixic20-iiit-hyderabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7101,0.705956,0.472924,0.642157
2,0.6977,0.693618,0.472924,0.642157
3,0.7019,0.695134,0.472924,0.642157


==> RTE metrics: {'eval_loss': 0.7059557437896729, 'eval_accuracy': 0.4729241877256318, 'eval_f1': 0.6421568627450981, 'eval_runtime': 5.835, 'eval_samples_per_second': 47.472, 'eval_steps_per_second': 3.085, 'epoch': 3.0}


In [None]:
#! error
raw_wic = load_dataset("super_glue", "wic")
remove_cols_wic = [c for c in raw_wic["train"].column_names if c != "label"]
tokenized_wic = raw_wic.map(
    lambda ex: preprocess_function(ex, ("sentence1", "sentence2")),
    batched=True,
    remove_columns=remove_cols_wic
)
model_wic = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
args_wic = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "wic"),
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
trainer_wic = Trainer(
    model=model_wic,
    args=args_wic,
    train_dataset=tokenized_wic["train"],
    eval_dataset=tokenized_wic["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_binary,
)
trainer_wic.train()
trainer_wic.save_model(os.path.join(OUTPUT_DIR, "wic"))
metrics_wic = trainer_wic.evaluate(tokenized_wic["validation"])
print("==> WiC metrics:", metrics_wic)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_wic = Trainer(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
