In [30]:
# Import necessary libraries for GPT-2 with Dataset json files
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import DataLoader
import torch
import json
from datasets import Dataset

# Load the pre-trained tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Assign the padding token to the EOS (End of Sequence) token
tokenizer.pad_token = tokenizer.eos_token

# Resize the token embeddings if a new token is added
model.resize_token_embeddings(len(tokenizer))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the file paths for the training and test datasets (train and dev respectively)
train_file_path = './train-v1.1.json'
test_file_path = './dev-v1.1.json'

# Load datasets
def load_squad_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

train_data = load_squad_data(train_file_path)
test_data = load_squad_data(test_file_path)

# Prepare training and test data
def prepare_data(squad_data):
    contexts, questions, answers = [], [], []
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                
                #first answer display
                answer = qa['answers'][0]['text']
                contexts.append(context)
                questions.append(question)
                answers.append(answer)
    return contexts, questions, answers

# Prepare datasets
train_contexts, train_questions, train_answers = prepare_data(train_data)
test_contexts, test_questions, test_answers = prepare_data(test_data)

#print for proof of completion
print("data is prepped")

data is prepped


In [2]:
# Create a list of formatted training examples
train_formatted_data = []
for context, question, answer in zip(train_contexts, train_questions, train_answers):
    train_formatted_data.append(f"Context: {context}\nQuestion: {question}\nAnswer: {answer}")

# Create a list of formatted test examples
test_formatted_data = []
for context, question, answer in zip(test_contexts, test_questions, test_answers):
    test_formatted_data.append(f"Context: {context}\nQuestion: {question}\nAnswer: {answer}")

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict({"text": train_formatted_data})
test_dataset = Dataset.from_dict({"text": test_formatted_data})

# Tokenize the dataset and set 'labels' for loss computation
def tokenize_function(examples):
    encoding = tokenizer(
        examples["text"],
        max_length=384,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    encoding["input_ids"] = encoding["input_ids"].to(device)
    encoding["attention_mask"] = encoding["attention_mask"].to(device)
    encoding["labels"] = encoding["input_ids"].clone()
    return encoding

# Apply tokenization to the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer
)

# Training
trainer.train()

# Save the model and tokenizer after training (files show up in gpt2-qa folder for use)
model.save_pretrained('./gpt2-qa')
tokenizer.save_pretrained('./gpt2-qa')

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,1.412,1.464174
2,1.1998,1.47564
3,1.2574,1.480558


('./gpt2-qa\\tokenizer_config.json',
 './gpt2-qa\\special_tokens_map.json',
 './gpt2-qa\\vocab.json',
 './gpt2-qa\\merges.txt',
 './gpt2-qa\\added_tokens.json')