# Use Trainer API to fine-tune GPT-J model

## OpenBookQA Dataset

In [3]:
from datasets import load_dataset

openbookqa_dataset = load_dataset("openbookqa")

### Pre-processing

In [4]:
def preprocess_examples(example):
    # merge question and choices
    choices_text = example['choices']['text']
    question_stems = example['question_stem']
    inputs = [question_stems + " " + choice for choice in choices_text]
    # answerKey to text label
    label = ord(example['answerKey']) - ord('A')
    return {'input_texts': inputs, 'label': label}


preprocessed_dataset = openbookqa_dataset.map(preprocess_examples)

print(preprocessed_dataset)
print(preprocessed_dataset['train'][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey', 'input_texts', 'label'],
        num_rows: 4957
    })
    validation: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey', 'input_texts', 'label'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey', 'input_texts', 'label'],
        num_rows: 500
    })
})
{'id': '7-980', 'question_stem': 'The sun is responsible for', 'choices': {'text': ['puppies learning new tricks', 'children growing up and getting old', 'flowers wilting in a vase', 'plants sprouting, blooming and wilting'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'D', 'input_texts': ['The sun is responsible for puppies learning new tricks', 'The sun is responsible for children growing up and getting old', 'The sun is responsible for flowers wilting in a vase', 'The sun is responsible for plants sprouting, blooming and wilting'], 'label': 3}


### Tokenization

In [27]:
import torch
from transformers import AutoTokenizer, GPTJForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gptj")
tokenizer = AutoTokenizer.from_pretrained("ydshieh/tiny-random-gptj-for-sequence-classification")
model = GPTJForSequenceClassification.from_pretrained("ydshieh/tiny-random-gptj-for-sequence-classification")

# Check if the tokenizer has a pad token, if not, set it to the EOS token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Ensure the model adjusts to the new tokenizer
    model.resize_token_embeddings(len(tokenizer))


In [30]:
def tokenize_examples(examples):
    # Tokenize each input text separately
    tokenized_inputs = tokenizer(examples['input_texts'], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    
    # Ensure the label is correctly included
    # Note: Adjust according to how your dataset is structured; this assumes labels are directly accessible
    tokenized_inputs['labels'] = examples['label']
    
    return tokenized_inputs

# Assuming `preprocessed_dataset` is correctly prepared as per your previous step
tokenized_dataset = preprocessed_dataset.map(tokenize_examples)

Map:   0%|          | 0/4957 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [29]:
print(tokenized_dataset['train'][0])

{'id': '7-980', 'question_stem': 'The sun is responsible for', 'choices': {'text': ['puppies learning new tricks', 'children growing up and getting old', 'flowers wilting in a vase', 'plants sprouting, blooming and wilting'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'D', 'input_texts': ['The sun is responsible for puppies learning new tricks', 'The sun is responsible for children growing up and getting old', 'The sun is responsible for flowers wilting in a vase', 'The sun is responsible for plants sprouting, blooming and wilting'], 'label': 3, 'input_ids': [[52, 195, 203, 302, 301, 484, 80, 675, 551, 234, 274, 224, 85, 354, 346, 409, 217, 78, 223, 664, 194, 523, 75, 83, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,

### Causal LM

In [26]:
# from transformers import AutoModelForCausalLM, GPTJModel

# # model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
# model = GPTJModel.from_pretrained("hf-internal-testing/tiny-random-gptj")

tokenizer_config.json:   0%|          | 0.00/225 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

In [31]:
from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=3,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=10,
#     do_train=True,
#     do_eval=True,
# )

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [33]:
trainer.train()

IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [2], [2, 4]

# Use Trainer API to fine-tune RWKV model