In [None]:
!pip install -qU datasets fsspec transformers evaluate

In [None]:
import wandb
import os 
# Do not paste the W&B API key into the notebook: export WANDB_API_KEY before
# starting the kernel. When the variable is unset or empty, None is passed and
# wandb.login is left to its own credential lookup (which may prompt).
wandb.login(key=os.environ.get("WANDB_API_KEY") or None)
# Project that runs are logged under.
os.environ["WANDB_PROJECT"] = "NLP"
# Ask the W&B integration to upload the model when training ends.
os.environ["WANDB_LOG_MODEL"] = "end"

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, pipeline
from evaluate import evaluator
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Number of fine-tuning epochs (passed to TrainingArguments.num_train_epochs).
EPOCHS = 5
# Checkpoint to fine-tune -- edit this to switch models. Candidates:
# "google/electra-base-discriminator", "FacebookAI/roberta-base", "SpanBERT/spanbert-base-cased"
MODEL_NAME = "FacebookAI/roberta-base"

In [None]:
# SQuAD "train" split: later divided into the train/validation subsets used for fine-tuning.
trainval_squad = load_dataset("squad", split="train")
# SQuAD "validation" split: kept aside as the held-out test set.
test_squad = load_dataset("squad", split="validation")

In [None]:
# Hold out 20% of the training split for validation. The seed is fixed so the
# same examples end up in "train"/"test" on every run; without it, a re-run
# reshuffles the split and validation scores are not comparable across runs.
trainval_squad = trainval_squad.train_test_split(test_size=0.2, seed=42)

In [None]:
# Tokenizer for MODEL_NAME. The fast implementation is requested; the
# preprocessing functions below call it with return_offsets_mapping=True.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

In [None]:
def get_token_length(examples):
    """Tokenize a batch of question/context pairs without padding.

    Questions are stripped of surrounding whitespace, paired with their
    contexts and truncated on the context side to at most 512 tokens. The
    tokenizer is asked to report per-example lengths (``return_length=True``).

    Args:
        examples: A batch with "question" and "context" columns.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    stripped_questions = list(map(str.strip, examples["question"]))
    tokenizer_options = {
        "max_length": 512,
        "truncation": "only_second",  # never cut the question, only the context
        "return_offsets_mapping": True,
        "padding": False,  # keep true lengths for the histogram
        "return_length": True,
    }
    return tokenizer(stripped_questions, examples["context"], **tokenizer_options)

# Tokenize every split without padding, dropping the raw columns so only the
# tokenizer outputs remain, then pull out the per-example lengths of "train".
token_length = trainval_squad.map(
    get_token_length,
    batched=True,
    remove_columns=trainval_squad['train'].column_names,
    num_proc=4,
)
lengths = token_length['train']['length']

# Histogram of sequence lengths, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(lengths, bins=30, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Token Lengths (SQuAD)")
ax.set_xlabel("Number of Tokens")
ax.set_ylabel("Number of Samples")
ax.grid(True)
plt.show()

In [None]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span token labels.

    Each question is stripped and paired with its context; the pair is
    truncated on the context side to 512 tokens and padded to that length.
    The first answer of every example is then mapped from character offsets
    to token indices.

    Args:
        examples: A batch with "question", "context" and "answers" columns,
            where every entry of "answers" holds parallel "text" and
            "answer_start" sequences.

    Returns:
        The tokenizer output (with "offset_mapping" removed) extended with
        "start_positions" and "end_positions". Examples that have no answer,
        or whose answer is not fully inside the (possibly truncated) context,
        are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: nothing to locate, use the (0, 0) label.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounds check covers encodings where the context runs to the very end.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be covered: an answer cut off by truncation would
        # otherwise get an end label one past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Apply preprocess_function to both splits in batches (4 worker processes) and
# drop the raw SQuAD columns, leaving only the tokenizer outputs and span labels.
tokenized_squad = trainval_squad.map(preprocess_function, batched=True, remove_columns=trainval_squad["train"].column_names, num_proc=4)

In [None]:
# Load the MODEL_NAME checkpoint through the question-answering auto class.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
# Collator that pads batches using the tokenizer. Note that preprocess_function
# already pads every example with padding="max_length".
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, which load_best_model_at_end requires in order to pick a checkpoint.
training_args = TrainingArguments(
    dataloader_num_workers=4,
    output_dir="best_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=500,
    learning_rate=2e-5,
    # Cosine decay that bottoms out at 10% of the peak learning rate.
    lr_scheduler_type='cosine_with_min_lr',
    lr_scheduler_kwargs={'min_lr_rate': 0.1},
    warmup_ratio=0.1,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # notebook still runs (in full precision) on machines without a GPU
    # instead of failing while building the arguments.
    fp16=torch.cuda.is_available(),
    max_grad_norm=1.0,
    optim="adamw_torch",
    # Effective batch size per device: 16 * 2 = 32.
    gradient_accumulation_steps=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="wandb",
)

# Train on the "train" part of the split and evaluate on its "test" part.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side in ./best_model so
# the inference cells further down can reload both from that directory.
trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")

In [None]:
from datasets import load_dataset
from evaluate import evaluator
from transformers import AutoModelForSequenceClassification, pipeline

# Evaluator for the "question-answering" task from the `evaluate` library.
task_evaluator = evaluator("question-answering")

# Score the in-memory model on the raw (untokenized) validation examples with
# the "squad" metric; the evaluator is handed the tokenizer alongside the model.
eval_results = task_evaluator.compute(
    model_or_pipeline=model,
    data=trainval_squad["test"], # CHANGE TO test_squad if doing testing
    tokenizer=tokenizer,
    metric="squad",
    strategy="simple",
)

print(eval_results)

In [None]:
# Hand-written question/context pair used by the inference cells below as a
# quick sanity check of the saved model.
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [None]:
from transformers import AutoTokenizer

# Reload the tokenizer from the saved directory (rebinds the global `tokenizer`)
# and encode the demo pair as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("best_model")
inputs = tokenizer(question, context, return_tensors="pt")

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# Reload the saved model from disk (rebinds the global `model`) and switch it
# to evaluation mode.
model = AutoModelForQuestionAnswering.from_pretrained("best_model")
model.eval()

# Forward pass without gradient tracking; `outputs` is read by the next cell
# for its start/end logits.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Pick the highest-scoring start and end positions. The two argmax calls are
# independent, so nothing here enforces end >= start.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [None]:
# Slice the predicted span (end index inclusive) out of the first input
# sequence and decode it back to text; the decoded string is the cell output.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

In [None]:
# Close the Weights & Biases run started during training.
wandb.finish()