In [15]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from transformers import DataCollatorForLanguageModeling
import numpy as np

# Load fine-tuned GPT2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("/Users/jameslim/Downloads/dataset/GPT2_tokenizer")
model1 = GPT2LMHeadModel.from_pretrained("/Users/jameslim/Downloads/dataset/GPT2_finetuned_model_setting1")
model2 = GPT2LMHeadModel.from_pretrained("/Users/jameslim/Downloads/dataset/GPT2_finetuned_model_setting2")

tokenizer3 = GPT2Tokenizer.from_pretrained("gpt2")
model3 = GPT2LMHeadModel.from_pretrained('gpt2')

# # Function to generate text
# def generate_text(prompt, model, tokenizer, max_length=200):
#     input_ids = tokenizer.encode(prompt, return_tensors="pt")
#     # Generate text
#     output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
#     # Decode and return the generated text
#     generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
#     return generated_text

def generate_text(prompt, model, tokenizer, max_length=200, max_new_tokens=50):
    # Ensure the tokenizer's pad token is set to EOS token
    tokenizer.pad_token = tokenizer.eos_token

    # Encode the prompts to input ids and attention masks
    encoding = tokenizer.encode_plus(prompt, return_tensors="pt", max_length=max_length, truncation=True, padding='max_length')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Generate text with attention mask and specify max_new_tokens instead of max_length for generation control
    output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, num_return_sequences=1)

    # Decode and return the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


# Generate text using the fine-tuned model
prompt = "What's the capital city of France?"
generated_text = generate_text(prompt, model3, tokenizer3)
print(generated_text)

# Reference text (actual answer)
reference_text = "Paris"

# Compute Perplexity
input_ids = tokenizer.encode(generated_text, return_tensors="pt")
with torch.no_grad():
    outputs = model(input_ids=input_ids)
    loss = outputs.loss
perplexity = np.exp(loss.item())

# Compute BLEU Score
bleu_score = corpus_bleu([[reference_text.split()]], [generated_text.split()])

# Compute ROUGE Score
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference_text, generated_text)
rouge_score = {key: score.fmeasure for key, score in scores.items()}

# Compute METEOR Score
meteor_score_val = meteor_score([reference_text], generated_text)

# Print the computed metrics
print("Perplexity:", perplexity)
print("BLEU Score:", bleu_score)
print("ROUGE Score:", rouge_score)
print("METEOR Score:", meteor_score_val)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What's the capital city of France?

France is the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It's the capital of France. It


In [4]:
generated_text

'What is the capital of France?. The company is a leading global producer of automotive seating and other innovative technologies. The company is a leading global producer of automotive seating and other innovative technologies. The company is a leading global producer of automotive seating and other innovative technologies. The company is a leading global producer of automotive seating and other innovative technologies. Show more Show less. The hiring company is ecocareers. The job is located at London, England, United Kingdom in country United Kingdom. This job posting comes from CS5344GROUP08LINKEDIN dataset. The job is located at London, England, United Kingdom in country United Kingdom. Here is the job summary. We are looking for a capital of France? to join our team! We are looking for a capital of France? to join our team! We are looking for a capital of France? to join our team! Show less. The hiring company is ecocareers. The job is located at London, England, United'