In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import DataLoader
import torch
import json
from datasets import Dataset

# Load the pre-trained tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Assign the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Resize the token embeddings if a new token is added (only needed if you added new tokens)
model.resize_token_embeddings(len(tokenizer))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the file paths for the training and test datasets
train_file_path = './train-v1.1.json'
test_file_path = './dev-v1.1.json'

# Load the training and test datasets
def load_squad_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

train_data = load_squad_data(train_file_path)
test_data = load_squad_data(test_file_path)

# Prepare training and test data
def prepare_data(squad_data):
    contexts, questions, answers = [], [], []
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer = qa['answers'][0]['text']  # Using the first answer for simplicity
                contexts.append(context)
                questions.append(question)
                answers.append(answer)
    return contexts, questions, answers

# Prepare the train and test datasets
train_contexts, train_questions, train_answers = prepare_data(train_data)
test_contexts, test_questions, test_answers = prepare_data(test_data)

print("data is prepped")






data is prepped


In [2]:
# Create a list of formatted training examples
train_formatted_data = []
for context, question, answer in zip(train_contexts, train_questions, train_answers):
    train_formatted_data.append(f"Context: {context}\nQuestion: {question}\nAnswer: {answer}")

# Create a list of formatted test examples
test_formatted_data = []
for context, question, answer in zip(test_contexts, test_questions, test_answers):
    test_formatted_data.append(f"Context: {context}\nQuestion: {question}\nAnswer: {answer}")

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict({"text": train_formatted_data})
test_dataset = Dataset.from_dict({"text": test_formatted_data})

# Tokenize the dataset and set 'labels' for loss computation
def tokenize_function(examples):
    encoding = tokenizer(
        examples["text"],
        max_length=384,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    encoding["input_ids"] = encoding["input_ids"].to(device)
    encoding["attention_mask"] = encoding["attention_mask"].to(device)
    encoding["labels"] = encoding["input_ids"].clone()
    return encoding

# Apply tokenization to the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # Enable evaluation after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,  # Add this to control evaluation batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,  # Include evaluation dataset
    tokenizer=tokenizer
)

# Start training
trainer.train()

# Save the model and tokenizer after training
model.save_pretrained('./gpt2-qa')
tokenizer.save_pretrained('./gpt2-qa')

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,1.412,1.464174
2,1.1998,1.47564
3,1.2574,1.480558


('./gpt2-qa\\tokenizer_config.json',
 './gpt2-qa\\special_tokens_map.json',
 './gpt2-qa\\vocab.json',
 './gpt2-qa\\merges.txt',
 './gpt2-qa\\added_tokens.json')

In [22]:
# Install necessary libraries
!pip install transformers --quiet

# Import libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import math

# Load the fine-tuned tokenizer and model using raw strings
tokenizer = GPT2Tokenizer.from_pretrained(r'C:\Users\sassy\Downloads\Final Project\gpt2-qa')
model = GPT2LMHeadModel.from_pretrained(r'C:\Users\sassy\Downloads\Final Project\gpt2-qa')

# Ensure the model is in evaluation mode
model.eval()

# Define the response generation function with response length limitation
def generate_response(conversation_history, max_length=1000):
    # Join the conversation history into a single string
    prompt = '\n'.join(conversation_history) + '\nBot:'
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=1024)
    
    # Generate a response with a limited max length
    output = model.generate(
        input_ids=inputs['input_ids'],  # Correct access to input_ids
        attention_mask=inputs['attention_mask'],
        max_length=inputs['input_ids'].shape[1] + 50,  # Limit response length to avoid large outputs
        pad_token_id=tokenizer.pad_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_p=0.9,
        temperature=0.8
    )
    
    # Decode and extract the bot's response
    response_text = tokenizer.decode(output[0], skip_special_tokens=True)
    generated_text = response_text[len(prompt):].strip()
    for stop_token in ['\nUser:', '\nBot:']:
        generated_text = generated_text.split(stop_token)[0]
    return generated_text.strip()

In [24]:
# Function to truncate conversation history if too long
def truncate_conversation(conversation_history, max_length=1024):
    prompt = '\n'.join(conversation_history) + '\nBot:'
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')
    total_length = tokenized_prompt.input_ids.shape[1]
    while total_length > max_length and len(conversation_history) > 1:
        conversation_history = conversation_history[2:]  # Remove oldest user and bot turn
        tokenized_prompt = tokenizer('\n'.join(conversation_history) + '\nBot:', return_tensors='pt')
        total_length = tokenized_prompt.input_ids.shape[1]
    return conversation_history

# Function to compute log-transformed perplexity to avoid large jumps
def compute_perplexity(model, tokenizer, text):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt')
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        perplexity = torch.exp(loss).item()
    return math.log(perplexity + 1)  # Use log scale for perplexity

# Initialize conversation history
conversation_history = []

In [26]:
print("Welcome to the Chatbot! Type 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() in ['exit', 'quit']:
        print("Chatbot session ended.")
        break
    
    # Add user input to conversation history
    conversation_history.append(f"User: {user_input}")
    
    # Truncate conversation history if necessary
    conversation_history = truncate_conversation(conversation_history, max_length=1024)
    
    # Generate bot response
    bot_response = generate_response(conversation_history)
    
    # Add bot response to conversation history
    conversation_history.append(f"Bot: {bot_response}")
    
    # Print bot response
    print(f"Bot: {bot_response}")
    
    # Compute Perplexity for the bot response (log-transformed)
    perplexity = compute_perplexity(model, tokenizer, bot_response)
    
    # Print Perplexity
    print(f"\nPerplexity Score (log scale): {perplexity:.2f}")
    print("-" * 50)

Welcome to the Chatbot! Type 'exit' to quit.


You:  What are the average grades of differnt students


Bot: B - Average grade in the computer science program.
B: C - Average GPA in the engineering program.

Perplexity Score (log scale): 5.18
--------------------------------------------------


KeyboardInterrupt: Interrupted by user