<a href="https://colab.research.google.com/github/YunshuoTian/nlp/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np

In [None]:
# Load the "akoksal/LongForm" dataset from Hugging Face.
dataset = load_dataset("akoksal/LongForm")

In [None]:
# Display one training example (index 101) to see what a record looks like.
dataset['train'][101]

In [None]:
# Load the question-answering model and its tokenizer.
# Both are created from the same checkpoint name (`model_name`).
model_name = 'bert-base-uncased'
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Show the tokenizer's vocabulary size and its maximum input length,
# one value per line.
print(tokenizer.vocab_size, tokenizer.model_max_length, sep="\n")

In [None]:
# start preprocessing
def preprocess_data(examples):
    """Tokenize a batch and label each example with an answer token span.

    Every ``examples['input']`` text is tokenized (truncated and padded to
    512 tokens). ``examples['output']`` is then searched for as a literal
    substring of the input text. When it is found and both of its boundaries
    fall inside tokens that survived truncation, the indices of those tokens
    become the start/end labels. In every other case (answer empty, not
    present in the input, or cut off by truncation) both labels point at
    the ``[CLS]`` token.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: A batch mapping with parallel lists under the keys
            ``'input'`` and ``'output'`` (the form passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with ``offset_mapping`` removed and the lists
        ``start_positions`` and ``end_positions`` added (one entry per
        example).
    """
    # Tokenize the inputs; the offset mapping gives each token's character
    # span in the original text, which is needed to align the answer.
    tokenized_examples = tokenizer(
        examples['input'],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_examples['offset_mapping']):
        input_ids = tokenized_examples['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Find the answer in the input text
        input_text = examples['input'][i]
        answer_text = examples['output'][i]

        # An empty answer would "match" at offset 0 and produce a bogus span,
        # so it is handled like an answer that does not occur in the input.
        start_char = input_text.find(answer_text) if answer_text else -1
        if start_char == -1:  # No usable answer in the input
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        end_char = start_char + len(answer_text)

        # Find the tokens holding the first and last answer characters.
        # None means "not located", e.g. because truncation removed it.
        token_start_index = None
        token_end_index = None

        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                token_start_index = idx
            if start < end_char <= end:
                token_end_index = idx
                break

        if token_start_index is None or token_end_index is None:
            # The answer is not fully covered by the kept tokens. Label the
            # example as having no answer rather than emitting a span whose
            # end precedes its start.
            token_start_index = cls_index
            token_end_index = cls_index

        start_positions.append(token_start_index)
        end_positions.append(token_end_index)

    tokenized_examples['start_positions'] = start_positions
    tokenized_examples['end_positions'] = end_positions

    # Remove the offset mapping since we don't need it anymore
    tokenized_examples.pop("offset_mapping")

    return tokenized_examples

In [None]:
# Train on the first 1000 training examples; evaluate on the full
# validation split.
train_dataset = dataset["train"].select(range(1000))
eval_dataset = dataset["validation"]

# Tokenize both splits in batches and drop the raw columns, so only the
# fields returned by preprocess_data remain.
tokenized_trainset = train_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_evalset = eval_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=eval_dataset.column_names,
)

In [None]:
# Define the training configuration: evaluate once per epoch, 3 epochs,
# batch size 8 per device, results written to ./results.
import torch

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # present so the notebook still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

In [None]:
from transformers import default_data_collator

# preprocess_data already pads every example to max_length, so batches need
# no extra padding at collation time.
# NOTE(review): presumably default_data_collator only stacks the features
# into tensors — confirm against the transformers documentation.
data_collator = default_data_collator

In [None]:
# Bundle the model, training configuration, tokenized splits, tokenizer and
# collator into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_evalset,
)

In [None]:
# Fine-tune the model on the tokenized training set.
trainer.train()

In [None]:
# Evaluate the model on the tokenized validation set.
trainer.evaluate()