In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import numpy as np
import pandas as pd
import torch
import os

In [7]:
# The tokenizer files and the model weights are both read from the
# "baseline-gpt" folder that sits in the notebook's working directory.
current_dir = os.getcwd()
baseline_dir = current_dir + "/baseline-gpt"

tokenizer = GPT2Tokenizer.from_pretrained(baseline_dir)
model = GPT2LMHeadModel.from_pretrained(baseline_dir)

In [8]:
def score(tokens_tensor, as_perplexity=False):
    """Score a token sequence with the notebook-level ``model``.

    The sequence is passed to ``model`` as both the input and the labels, and
    the ``.loss`` of the result is read back. For ``GPT2LMHeadModel`` that is
    the language-modeling loss, i.e. the log of the perplexity, so a lower
    score means the model finds the text more probable and vice versa.

    Args:
        tokens_tensor: Token ids as produced by
            ``tokenizer.encode(..., return_tensors="pt")``.
        as_perplexity: When True, return ``exp(loss)`` (the perplexity itself)
            instead of the raw loss. Defaults to False, which keeps the value
            this function has always returned.

    Returns:
        float: The loss, or its exponential when ``as_perplexity`` is True.
    """
    # Scoring never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        loss = model(tokens_tensor, labels=tokens_tensor).loss
    if as_perplexity:
        return torch.exp(loss).item()
    return loss.item()

# Scoring example: print each sentence next to the value score() gives it,
# so that contrasting phrasings can be compared side by side.

texts = [
    'james bond is a very bad movie',
    'james bond is a very good movie',
    'i hated james bond',
    'i loved james bond',
]

for text in texts:
    tokens_tensor = tokenizer.encode(
        text,
        add_special_tokens=False,
        return_tensors="pt",
    )
    text_score = score(tokens_tensor)
    print(text, text_score)

james bond is a very bad movie 6.239948272705078
james bond is a very good movie 5.853633880615234
i hated james bond 9.393083572387695
i loved james bond 8.7401123046875


In [9]:
# Top K Sampling Example
#
# Continues the prompt "james bond" by sampling, at every step, from the
# 10 most likely next tokens, up to a total length of 50 tokens.

# Sampling is random; seed torch so a fresh "Run All" reproduces the same text.
SAMPLING_SEED = 0
torch.manual_seed(SAMPLING_SEED)

input_id = tokenizer.encode("james bond", add_special_tokens=False, return_tensors="pt")

sample_output = model.generate(
    input_id,
    do_sample=True,
    max_length=50,
    top_k=10,
    # generate() falls back to the EOS id for padding anyway; passing it
    # explicitly avoids the "Setting `pad_token_id` to `eos_token_id`" warning.
    pad_token_id=tokenizer.eos_token_id,
)

print("Output:\n" + 100 * '-')
output = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
james bond is not considered as a major part of the overall story. 
 In The Simpsons Movie : Bart vs. the Space Mutants, Homer and Lisa have a romantic relationship while the episode's events are narrated by the voice of David


In [10]:
# Read the sentence pairs from the CSV in the working directory and
# preview the first rows.
sentences_csv = current_dir + "/Pos-Neg-Bond-Sentences.csv"
sentence_df = pd.read_csv(sentences_csv)
sentence_df.head()

Unnamed: 0,Positive,Negative
0,james bond is a very good movie,james bond is a very bad movie
1,i loved james bond,i hated james bond
2,james bond respects people,james bond takes advantage of people
3,bond is great,bond is terrible
4,james bond is a symbol of bravery,james bond is a symbol of British Colonialism


In [11]:
def score_sentences(sentences):
    """Return the ``score`` of every sentence, in input order.

    Each sentence is encoded with the notebook-level ``tokenizer`` (no special
    tokens, PyTorch tensor output) and passed to ``score``.

    Args:
        sentences: Iterable of strings to score.

    Returns:
        list: One score per sentence.
    """
    return [
        score(tokenizer.encode(sentence, add_special_tokens=False, return_tensors="pt"))
        for sentence in sentences
    ]


# Score both columns with the same helper instead of two copy-pasted loops.
pos_perplexities = score_sentences(sentence_df["Positive"].values)
neg_perplexities = score_sentences(sentence_df["Negative"].values)

# Column names and insertion order are unchanged: Positive first, then Negative.
sentence_df["Positive Perplexities"] = pos_perplexities
sentence_df["Negative Perplexities"] = neg_perplexities

sentence_df.head()

Unnamed: 0,Positive,Negative,Positive Perplexities,Negative Perplexities
0,james bond is a very good movie,james bond is a very bad movie,5.853634,6.239948
1,i loved james bond,i hated james bond,8.740112,9.393084
2,james bond respects people,james bond takes advantage of people,9.649253,7.036253
3,bond is great,bond is terrible,6.017255,6.852209
4,james bond is a symbol of bravery,james bond is a symbol of British Colonialism,5.663447,5.030104
