In [None]:

# Install required libraries (comment this line out if they are already installed)
!pip install transformers datasets evaluate

import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    DataCollatorWithPadding,
    EvalPrediction,
    RobertaForQuestionAnswering,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator,
)


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [None]:

# Checkpoint identifier used for both the tokenizer and every model loaded from it.
MODEL_NAME = "roberta-large"

# Load the tokenizer once so all preprocessing functions share the same instance.
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [None]:

# A generic tokenization function for sentence pair tasks. Adjust keys as needed.
def tokenize_function(examples, text_fields):
    """Tokenize one or two text columns of ``examples`` with the shared ``tokenizer``.

    Args:
        examples: Mapping from column name to the text(s) to tokenize.
        text_fields: Sequence of column names. The first is always tokenized;
            the second, when present, is passed as the paired text. Any
            further entries are ignored.

    Returns:
        The value returned by ``tokenizer`` for those inputs, with truncation
        enabled.
    """
    primary_text = examples[text_fields[0]]

    # Single-field tasks pass ``None`` as the pair.
    paired_text = None
    if len(text_fields) > 1:
        paired_text = examples[text_fields[1]]

    return tokenizer(primary_text, paired_text, truncation=True)

# Example metric: accuracy for classification tasks
def compute_accuracy(p: "EvalPrediction"):
    """Compute classification accuracy from an evaluation prediction object.

    Args:
        p: Object exposing ``predictions`` (class scores with the class
            dimension last, or a tuple whose first element holds those
            scores) and ``label_ids`` (the gold class indices).

    Returns:
        ``{"accuracy": value}`` where ``value`` is a plain Python float equal
        to the fraction of rows whose arg-max class matches the gold label.
    """
    scores = p.predictions
    # Models that return extra outputs hand back a tuple; the scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]

    predicted = np.argmax(scores, axis=-1)
    gold = np.asarray(p.label_ids)
    # Cast so the metric is a builtin float rather than a NumPy scalar.
    return {"accuracy": float(np.mean(predicted == gold))}


In [None]:

# Load the WiC (Word-in-Context) dataset from SuperGLUE.
wic_dataset = load_dataset("super_glue", "wic")


def preprocess_wic(examples):
    """Tokenize a batch of WiC rows as sentence pairs that name the target word.

    WiC asks whether the word in the "word" column is used with the same
    meaning in "sentence1" and "sentence2", so the word is prefixed to both
    segments; without it the model cannot tell which word is being judged.

    Args:
        examples: Batch with "word", "sentence1", "sentence2" and "label"
            columns.

    Returns:
        The tokenizer output with a "labels" entry copied from "label".
        Sequences are truncated to 128 tokens but left unpadded; padding is
        applied per batch by the data collator given to the trainer.
    """
    first_segments = [
        f"{word}: {sentence}"
        for word, sentence in zip(examples["word"], examples["sentence1"])
    ]
    second_segments = [
        f"{word}: {sentence}"
        for word, sentence in zip(examples["word"], examples["sentence2"])
    ]
    inputs = tokenizer(
        first_segments,
        second_segments,
        truncation=True,
        max_length=128,  # Adjust as needed; longer pairs are truncated.
    )
    inputs["labels"] = examples["label"]
    return inputs


wic_encoded = wic_dataset.map(preprocess_wic, batched=True)

# RoBERTa-large with a freshly initialised 2-way classification head.
wic_model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

wic_training_args = TrainingArguments(
    output_dir="./wic_roberta_large",
    # NOTE(review): newer transformers releases spell this `eval_strategy` - confirm
    # against the installed version before upgrading.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Large models fine-tuned on small datasets can stall at chance accuracy when
    # the full learning rate is applied from the first step; ramp it up instead.
    warmup_ratio=0.06,
    save_total_limit=2,
    logging_steps=50,
)

# Pads each batch to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

wic_trainer = Trainer(
    model=wic_model,
    args=wic_training_args,
    train_dataset=wic_encoded["train"],
    eval_dataset=wic_encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_accuracy,
)

# Fine-tune; comment this line out to skip training.
wic_trainer.train()


Map:   0%|          | 0/5428 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  wic_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7022,0.693933,0.5
2,0.6981,0.693704,0.5
3,0.6958,0.693281,0.5


TrainOutput(global_step=1020, training_loss=0.6992830987070121, metrics={'train_runtime': 1254.1227, 'train_samples_per_second': 12.984, 'train_steps_per_second': 0.813, 'total_flos': 3793892580108288.0, 'train_loss': 0.6992830987070121, 'epoch': 3.0})

In [None]:

# Load the WSC dataset from SuperGLUE.
wsc_dataset = load_dataset("super_glue", "wsc.fixed")


def preprocess_wsc(examples):
    """Tokenize a batch of WSC rows as (passage, coreference claim) pairs.

    The label says whether the two marked spans corefer, so the spans must be
    part of the input; the passage alone does not identify the question.
    NOTE(review): assumes "span1_text" is the candidate noun phrase and
    "span2_text" the pronoun, as in the SuperGLUE `wsc.fixed` schema - confirm.

    Args:
        examples: Batch with "text", "span1_text", "span2_text" and "label"
            columns.

    Returns:
        The tokenizer output with a "labels" entry copied from "label".
        Sequences are truncated but left unpadded; padding is applied per
        batch by the data collator given to the trainer.
    """
    claims = [
        f'"{pronoun}" refers to "{candidate}"'
        for candidate, pronoun in zip(examples["span1_text"], examples["span2_text"])
    ]
    inputs = tokenizer(examples["text"], claims, truncation=True)
    inputs["labels"] = examples["label"]
    return inputs


wsc_encoded = wsc_dataset.map(preprocess_wsc, batched=True)

# RoBERTa-large with a freshly initialised 2-way classification head.
wsc_model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

wsc_training_args = TrainingArguments(
    output_dir="./wsc_roberta_large",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # smaller batch sizes might be needed
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # Ramp the learning rate up over the first steps; applying it in full from
    # step one can destabilise fine-tuning of a large model on a small dataset.
    warmup_ratio=0.06,
    save_total_limit=2,
    logging_steps=50,
)

# Pads each batch to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

wsc_trainer = Trainer(
    model=wsc_model,
    args=wsc_training_args,
    train_dataset=wsc_encoded["train"],
    eval_dataset=wsc_encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_accuracy,
)

# Fine-tune; comment this line out to skip training.
wsc_trainer.train()


Map:   0%|          | 0/554 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  wsc_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7189,0.661882,0.634615
2,0.7098,0.716847,0.365385
3,0.7022,0.690987,0.634615


TrainOutput(global_step=417, training_loss=0.7054966942583628, metrics={'train_runtime': 156.9633, 'train_samples_per_second': 10.588, 'train_steps_per_second': 2.657, 'total_flos': 146564408667768.0, 'train_loss': 0.7054966942583628, 'epoch': 3.0})

In [None]:
# Load the COPA dataset from SuperGLUE.
copa_dataset = load_dataset("super_glue", "copa")


def preprocess_copa_batch(examples):
    """Expand each COPA row into two sentence pairs, one per candidate choice.

    For "cause" questions the choice comes first and the premise second; for
    any other question type the premise comes first. Each pair gets label 1
    when it uses the correct choice and 0 otherwise, with the "choice1" pair
    always emitted before the "choice2" pair.

    Args:
        examples: Batch with "premise", "choice1", "choice2", "question" and
            "label" columns.

    Returns:
        Tokenizer output holding twice as many rows as the input batch, plus
        a "labels" list of the same length. Rows are left unpadded; padding
        is applied per batch by the data collator given to the trainer.
    """
    first_sentences = []
    second_sentences = []
    labels = []

    for premise, choice1, choice2, question, label in zip(
        examples["premise"],
        examples["choice1"],
        examples["choice2"],
        examples["question"],
        examples["label"],
    ):
        if question == "cause":
            first_sentences += [choice1, choice2]
            second_sentences += [premise, premise]
        else:
            first_sentences += [premise, premise]
            second_sentences += [choice1, choice2]

        # Original label is 0 or 1 - mark the pair built from the correct choice.
        labels += [int(label == 0), int(label == 1)]

    tokenized = tokenizer(first_sentences, second_sentences, truncation=True)
    tokenized["labels"] = labels
    return tokenized


def compute_copa_accuracy(p):
    """Score COPA per premise rather than per expanded row.

    Rows 2k and 2k+1 belong to the same premise (see `preprocess_copa_batch`),
    so the predicted choice is whichever of the two has the larger margin for
    class 1. This relies on evaluation visiting rows in dataset order.

    Args:
        p: Object exposing ``predictions`` (row-wise 2-class scores, or a
            tuple whose first element holds them) and ``label_ids``.

    Returns:
        ``{"accuracy": value}`` with the fraction of premises answered
        correctly as a plain float.
    """
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    margins = (scores[:, 1] - scores[:, 0]).reshape(-1, 2)
    gold_choice = np.asarray(p.label_ids).reshape(-1, 2).argmax(axis=1)
    return {"accuracy": float(np.mean(margins.argmax(axis=1) == gold_choice))}


# Every input row becomes two output rows, so the original columns (which keep
# the old row count) must be dropped or `map` cannot assemble the new table.
copa_encoded = copa_dataset.map(
    preprocess_copa_batch,
    batched=True,
    remove_columns=copa_dataset["train"].column_names,
)

# RoBERTa-large with a freshly initialised 2-way classification head.
copa_model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

copa_training_args = TrainingArguments(
    output_dir="./copa_output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Ramp the learning rate up over the first steps to stabilise fine-tuning.
    warmup_ratio=0.06,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
)

# Pads each batch to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

copa_trainer = Trainer(
    model=copa_model,
    args=copa_training_args,
    train_dataset=copa_encoded["train"],
    eval_dataset=copa_encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_copa_accuracy,
)

# Fine-tune, then report metrics on the validation split.
copa_trainer.train()
copa_trainer.evaluate()

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

ArrowInvalid: Column 6 named input_ids expected length 400 but got length 800

In [None]:

# Load the MultiRC dataset from SuperGLUE.
multirc_dataset = load_dataset("super_glue", "multirc")


def preprocess_multirc(examples):
    """Tokenize a batch of MultiRC rows as (paragraph, question + answer) pairs.

    Each row is treated as one binary decision: is this candidate answer
    correct for this question about this paragraph?
    NOTE(review): assumes the SuperGLUE `multirc` schema, where every row
    carries a single candidate in "answer" and its 0/1 verdict in "label" -
    confirm against the loaded dataset's features.

    Args:
        examples: Batch with "paragraph", "question", "answer" and "label"
            columns.

    Returns:
        The tokenizer output with a "labels" entry copied from "label".
        Only the paragraph is truncated, so the question and candidate answer
        are kept whole. Rows are left unpadded; padding is applied per batch
        by the data collator given to the trainer.
    """
    question_and_answer = [
        f"{question} {answer}"
        for question, answer in zip(examples["question"], examples["answer"])
    ]
    inputs = tokenizer(
        examples["paragraph"],
        question_and_answer,
        truncation="only_first",
        max_length=512,
    )
    inputs["labels"] = examples["label"]
    return inputs


multirc_encoded = multirc_dataset.map(preprocess_multirc, batched=True)

# Load the classification model with 2 labels.
multirc_model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

multirc_training_args = TrainingArguments(
    output_dir="./multirc_roberta_large",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=50,
)

multirc_trainer = Trainer(
    model=multirc_model,
    args=multirc_training_args,
    train_dataset=multirc_encoded["train"],
    eval_dataset=multirc_encoded["validation"],
    tokenizer=tokenizer,
    # Rows have different lengths, so they must be padded when batched;
    # `default_data_collator` would fail to stack them.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_accuracy,
)

# To train the model, uncomment the following line:
# multirc_trainer.train()


In [None]:

# Load the ReCoRD dataset from SuperGLUE.
record_dataset = load_dataset("super_glue", "record")

# Mapping an answer's character span to token indices needs per-token character
# offsets, which are requested from the fast tokenizer via `return_offsets_mapping`.
record_tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)


def _find_answer_chars(passage, answers):
    """Return (start, end) character offsets of the first answer found in the passage.

    Answers are tried in order; ``None`` is returned when the list is empty or
    none of them occurs verbatim in the passage.
    """
    for answer in answers:
        if not answer:
            continue
        start = passage.find(answer)
        if start != -1:
            return start, start + len(answer)
    return None


def preprocess_record(examples):
    """Tokenize a batch of ReCoRD rows for extractive question answering.

    The passage is the first segment and the query the second. The gold span
    is located by searching the passage for one of the reference answers and
    converting its character range into token indices.
    NOTE(review): assumes the SuperGLUE `record` schema with string columns
    "passage" and "query" and a list-of-strings column "answers" - confirm.

    Args:
        examples: Batch with "passage", "query" and "answers" columns.

    Returns:
        The tokenizer output plus "start_positions" and "end_positions"
        lists (inclusive token indices). Rows with no locatable answer -
        no reference answers, no verbatim match, or the match lost to
        truncation - get 0 for both. Rows are left unpadded; padding is
        applied per batch by the data collator given to the trainer.
    """
    inputs = record_tokenizer(
        examples["passage"],
        examples["query"],
        truncation="only_first",  # Shorten the passage, never the query.
        max_length=512,
        return_offsets_mapping=True,
    )
    # Offsets are only needed here; they are not model inputs.
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for row, (passage, answers) in enumerate(zip(examples["passage"], examples["answers"])):
        start_token = end_token = 0
        char_span = _find_answer_chars(passage, answers)
        if char_span is not None:
            char_start, char_end = char_span
            sequence_ids = inputs.sequence_ids(row)
            # Passage tokens (segment 0) whose character range overlaps the answer.
            overlapping = [
                token
                for token, (tok_start, tok_end) in enumerate(offset_mapping[row])
                if sequence_ids[token] == 0 and tok_start < char_end and tok_end > char_start
            ]
            if overlapping:
                start_token, end_token = overlapping[0], overlapping[-1]
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


record_encoded = record_dataset.map(
    preprocess_record,
    batched=True,
    remove_columns=record_dataset["train"].column_names,
)

# Load a RoBERTa model for question answering.
record_model = RobertaForQuestionAnswering.from_pretrained(MODEL_NAME)

record_training_args = TrainingArguments(
    output_dir="./record_roberta_large",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=50,
)

# For QA tasks, you may want to use a custom compute_metrics function (e.g., F1 and Exact Match).
# For demonstration, we leave compute_metrics as None.
record_trainer = Trainer(
    model=record_model,
    args=record_training_args,
    train_dataset=record_encoded["train"],
    eval_dataset=record_encoded["validation"],
    tokenizer=record_tokenizer,
    # Rows have different lengths, so they must be padded when batched.
    data_collator=DataCollatorWithPadding(tokenizer=record_tokenizer),
    compute_metrics=None,   # Replace with your metric function if needed.
)

# To train the model, uncomment the following line:
# record_trainer.train()
