In [None]:
!pip install datasets transformers evaluate

import os

from datasets import load_dataset
from evaluate import load as load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)



Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31

In [None]:
# --- Run configuration: everything a reader might want to tune -------------
MODEL_NAME = "roberta-large"
OUTPUT_DIR = "./roberta-superglue"
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
TRAIN_BATCH = 16
EVAL_BATCH = 16
EVAL_STRATEGY = "epoch"
LOGGING_STEPS = 100
MAX_LENGTH = 128
SEED = 42

# Seed the RNGs before any model is instantiated: the task heads are newly
# initialised (not loaded from the checkpoint), so without a seed their
# starting weights differ from run to run.
set_seed(SEED)

# --- Tokenizer and default data collator -----------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- Metrics shared by the compute_metrics_* helpers ------------------------
metric_acc = load_metric("accuracy")
metric_f1 = load_metric("f1")

def compute_metrics_binary(p):
    """Score a two-class evaluation with accuracy and binary F1.

    Args:
        p: Evaluation output exposing ``predictions`` (scores whose last axis
            is arg-maxed to get the predicted class) and ``label_ids``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids

    accuracy = metric_acc.compute(predictions=predicted, references=gold)
    f1 = metric_f1.compute(predictions=predicted, references=gold, average="binary")

    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

def compute_metrics_multiclass(p):
    """Score an evaluation with plain accuracy.

    Args:
        p: Evaluation output exposing ``predictions`` (scores whose last axis
            is arg-maxed to get the predicted class) and ``label_ids``.

    Returns:
        dict with the single key ``"accuracy"``.
    """
    predicted = p.predictions.argmax(-1)
    result = metric_acc.compute(predictions=predicted, references=p.label_ids)
    return {"accuracy": result["accuracy"]}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [None]:
# Fetch every split of the SuperGLUE COPA configuration.
raw_copa = load_dataset(path="super_glue", name="copa")

# %%
def preprocess_copa(examples):
    """Tokenize a batch of COPA rows into two-choice model inputs.

    Each row yields two sequence pairs that share the premise; the second
    segment is the question string followed by a space and one of the two
    choices. The flat tokenizer output is then regrouped so that every row
    owns a list of two encodings.

    Args:
        examples: Batched mapping with the columns ``premise``, ``question``,
            ``choice1``, ``choice2`` and ``label``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` (one two-item list per
        row) and ``labels`` (the row labels, unchanged).
    """
    rows = list(
        zip(
            examples["premise"],
            examples["question"],
            examples["choice1"],
            examples["choice2"],
            examples["label"],
        )
    )

    contexts, continuations = [], []
    for premise, question, choice1, choice2, _ in rows:
        contexts += [premise, premise]
        continuations += [question + " " + choice1, question + " " + choice2]
    gold = [row[-1] for row in rows]

    # Every sequence is padded to MAX_LENGTH, so all encodings share one length.
    encoded = tokenizer(
        contexts,
        continuations,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    def in_pairs(flat):
        # Consecutive entries belong to the same COPA row.
        return [flat[start:start + 2] for start in range(0, len(flat), 2)]

    return {
        "input_ids": in_pairs(encoded["input_ids"]),
        "attention_mask": in_pairs(encoded["attention_mask"]),
        "labels": gold,
    }

# %%
# Tokenize every split; the raw text columns are dropped so that only the
# model inputs produced by preprocess_copa remain.
tokenized_copa = raw_copa.map(
    preprocess_copa,
    batched=True,
    remove_columns=raw_copa["train"].column_names,
)

# %%
model_copa = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME)

# Build the collator here instead of relying on the notebook-global
# `data_collator`: later cells rebind that name, so re-running this cell
# out of order would otherwise pick up a different collator.
collator_copa = DataCollatorWithPadding(tokenizer=tokenizer)

args_copa = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "copa"),
    # Evaluation and checkpointing share one schedule, which
    # load_best_model_at_end needs in order to find a checkpoint per evaluation.
    eval_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    # COPA has 400 training examples: at TRAIN_BATCH=16 for NUM_EPOCHS=3 that is
    # fewer optimizer steps than LOGGING_STEPS, so step-based logging never
    # reported a training loss ("No log"). Log once per epoch instead.
    logging_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer_copa = Trainer(
    model=model_copa,
    args=args_copa,
    train_dataset=tokenized_copa["train"],
    eval_dataset=tokenized_copa["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=collator_copa,
    compute_metrics=compute_metrics_binary,
)
trainer_copa.train()
trainer_copa.save_model(os.path.join(OUTPUT_DIR, "copa"))

metrics_copa = trainer_copa.evaluate(tokenized_copa["validation"])
print("COPA metrics:", metrics_copa)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_copa = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.668028,0.72,0.688889
2,No log,0.417673,0.86,0.851064
3,No log,0.335121,0.87,0.860215


COPA metrics: {'eval_loss': 0.3351214528083801, 'eval_accuracy': 0.87, 'eval_f1': 0.8602150537634409, 'eval_runtime': 4.0247, 'eval_samples_per_second': 24.847, 'eval_steps_per_second': 1.739, 'epoch': 3.0}


In [None]:
# Fetch every split of the SuperGLUE MultiRC configuration.
raw_multirc = load_dataset(path="super_glue", name="multirc")
def preprocess_multirc(examples):
    """Tokenize a batch of MultiRC rows as (paragraph, question + answer) pairs.

    Args:
        examples: Batched mapping with the columns ``paragraph``,
            ``question``, ``answer`` and ``label``.

    Returns:
        The tokenizer output for the batch (truncated and padded to
        ``MAX_LENGTH``) with an added ``labels`` entry holding the row labels.
    """
    rows = list(
        zip(
            examples["paragraph"],
            examples["question"],
            examples["answer"],
            examples["label"],
        )
    )

    contexts = [paragraph for paragraph, _, _, _ in rows]
    # The candidate answer is appended to its question, separated by a space.
    candidates = [question + " " + answer for _, question, answer, _ in rows]

    encoded = tokenizer(
        contexts,
        candidates,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    encoded["labels"] = [label for _, _, _, label in rows]
    return encoded

# Tokenize every split; the raw text columns are dropped so that only the
# model inputs produced by preprocess_multirc remain.
tokenized_multirc = raw_multirc.map(
    preprocess_multirc,
    batched=True,
    remove_columns=raw_multirc["train"].column_names,
)

model_multirc = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

args_multirc = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "multirc"),
    # Follow the notebook-wide EVAL_STRATEGY (as the COPA run does) instead of
    # evaluating and writing a full checkpoint every LOGGING_STEPS steps.
    # Keeping both strategies identical is what load_best_model_at_end needs.
    eval_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    # The previous run, which had no warm-up, sat on a single predicted class
    # (constant accuracy, F1 = 0.0). Ramp the learning rate up over the first
    # 6% of steps. NOTE(review): 0.06 follows the usual RoBERTa fine-tuning
    # recipe; re-tune if the collapse persists.
    warmup_ratio=0.06,
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Cell-local collator: avoids rebinding the notebook-global `data_collator`
# (and the duplicate mid-notebook import that used to accompany it).
collator_multirc = DataCollatorWithPadding(tokenizer=tokenizer)

trainer_multirc = Trainer(
    model=model_multirc,
    args=args_multirc,
    train_dataset=tokenized_multirc["train"],
    eval_dataset=tokenized_multirc["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=collator_multirc,
    compute_metrics=compute_metrics_binary,
)

# Train the model
trainer_multirc.train()

# Save the model
trainer_multirc.save_model(os.path.join(OUTPUT_DIR, "multirc"))

# Evaluate the model on the validation set
metrics_multirc = trainer_multirc.evaluate(tokenized_multirc["validation"])
print("MultiRC metrics:", metrics_multirc)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_multirc = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.7131,0.683004,0.571988,0.0
200,0.6963,0.683669,0.571988,0.0
300,0.7002,0.683928,0.571988,0.0
400,0.6953,0.682971,0.571988,0.0
500,0.6955,0.687695,0.571988,0.0
600,0.6937,0.69655,0.428012,0.599451
700,0.6899,0.682774,0.571988,0.0
800,0.693,0.683501,0.571988,0.0
900,0.6824,0.685729,0.571988,0.0
1000,0.6878,0.682792,0.571988,0.0


KeyboardInterrupt: 

In [None]:
# Fetch every split of the SuperGLUE ReCoRD configuration.
raw_record = load_dataset(path="super_glue", name="record")

# Print the training-split schema so the column names used by
# preprocess_record can be checked against the data.
print("Column names:", raw_record["train"].column_names)

def preprocess_record(examples):
    """Tokenize a batch of ReCoRD rows into multiple-choice model inputs.

    Every candidate entity of a row becomes one choice: the passage is paired
    with the query in which the placeholder token has been replaced by that
    entity. The label is the index of the first entity that appears in the
    row's answers. Rows without answers, or whose answers match no entity,
    are dropped entirely, so the returned columns may hold fewer rows than
    the input batch.

    Args:
        examples: Batched mapping with the columns ``passage``, ``query``,
            ``entities`` and ``answers``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` (per kept row, one
        encoding per candidate entity -- the number of choices therefore
        varies between rows) and ``labels`` (index of the gold entity).
    """
    flat_passages, flat_queries = [], []
    choice_counts, labels = [], []

    for passage, query, entities, answers in zip(
        examples["passage"], examples["query"], examples["entities"], examples["answers"]
    ):
        # Decide the label BEFORE emitting any text so that a skipped row adds
        # nothing to any of the parallel lists (keeps features and labels aligned).
        gold = next((i for i, entity in enumerate(entities) if entity in answers), None)
        if gold is None:
            # Covers both "no answers given" and "no entity matches an answer".
            continue

        flat_passages.extend([passage] * len(entities))
        # ReCoRD marks the blank as "@placeholder" (no trailing "@").
        flat_queries.extend(query.replace("@placeholder", entity) for entity in entities)
        choice_counts.append(len(entities))
        labels.append(gold)

    if not labels:
        # Nothing survived (e.g. a batch without answers): skip the tokenizer
        # call on empty input and return an empty batch.
        return {"input_ids": [], "attention_mask": [], "labels": []}

    # Tokenize one flat list of (passage, filled-in query) pairs, then cut the
    # flat output back into per-row groups below.
    tokenized = tokenizer(
        flat_passages,
        flat_queries,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    def regroup(flat):
        # Slice the flat encodings into consecutive runs, one run per kept row.
        grouped, start = [], 0
        for count in choice_counts:
            grouped.append(flat[start:start + count])
            start += count
        return grouped

    return {
        "input_ids": regroup(tokenized["input_ids"]),
        "attention_mask": regroup(tokenized["attention_mask"]),
        "labels": labels,  # "labels" (plural) is the key the Trainer reads
    }

# Kept next to its only use; NOTE(review): this collator is only exported by
# recent transformers releases -- confirm the installed version provides it.
from transformers import DataCollatorForMultipleChoice

# Tokenize every split; the raw columns are dropped so that only the model
# inputs produced by preprocess_record remain.
tokenized_record = raw_record.map(
    preprocess_record,
    batched=True,
    remove_columns=raw_record["train"].column_names,
)

model_record = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME)

args_record = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "record"),
    # Set both strategies explicitly. Previously only eval_steps/save_steps
    # were given, which left eval_strategy at its default and checkpointed
    # every LOGGING_STEPS steps. Matching strategies also allow
    # load_best_model_at_end below.
    eval_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    learning_rate=LEARNING_RATE,
    # NOTE(review): each ReCoRD example is expanded to one choice per candidate
    # entity, so the number of choices can differ between examples. One example
    # per device batch avoids mixing choice counts inside a batch; gradient
    # accumulation keeps TRAIN_BATCH examples per optimizer update (per device).
    # Confirm against the collator's handling of ragged choice counts.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=TRAIN_BATCH,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Cell-local collator for multiple-choice features; the notebook-global
# `data_collator` is left untouched so earlier cells can be re-run safely.
collator_record = DataCollatorForMultipleChoice(tokenizer=tokenizer)

trainer_record = Trainer(
    model=model_record,
    args=args_record,
    train_dataset=tokenized_record["train"],
    eval_dataset=tokenized_record["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=collator_record,
    compute_metrics=compute_metrics_multiclass,
)
trainer_record.train()
trainer_record.save_model(os.path.join(OUTPUT_DIR, "record"))

metrics_record = trainer_record.evaluate(tokenized_record["validation"])
print("ReCoRD metrics:", metrics_record)