In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    StoppingCriteriaList,
    MaxLengthCriteria,
    AutoModelForSeq2SeqLM,
)
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

# prompt = "Today I believe we can finally"
prompt = "It might be possible to"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Greedy Search TASK 1

outputs = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30,return_dict_in_generate=True, output_scores=True)
transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True)
print(tokenizer.batch_decode(outputs[0], skip_special_tokens=True))

input_length = 1 if model.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]

perplexity = 0
likelihood = 0

for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # | token | token string | logits | probability
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.detach().numpy():.3f} | {np.exp(score.detach().numpy()):.2%}")
    likelihood += np.log(-1 * score.detach().numpy())
    perplexity += np.log(np.exp(score.detach().numpy()))

print(f"Length of the output: {generated_tokens.shape[1]}")
print(f"Perplexity: {np.exp((-1/ generated_tokens.shape[1]) * perplexity)}")
print(f"likelihood: {likelihood}")



["It might be possible to get a better understanding of the nature of the problem, but it's not clear how to do that.\n\nThe problem"]
|   651 |  get     | -3.341 | 3.54%
|   257 |  a       | -1.936 | 14.43%
|  1365 |  better  | -3.269 | 3.80%
|  4547 |  understanding | -1.486 | 22.63%
|   286 |  of      | -0.151 | 86.03%
|   262 |  the     | -1.247 | 28.73%
|  3450 |  nature  | -4.258 | 1.41%
|   286 |  of      | -0.083 | 92.02%
|   262 |  the     | -1.289 | 27.57%
|  1917 |  problem | -4.050 | 1.74%
|    11 | ,        | -1.583 | 20.54%
|   475 |  but     | -0.740 | 47.72%
|   340 |  it      | -1.707 | 18.14%
|   338 | 's       | -1.193 | 30.33%
|   407 |  not     | -1.242 | 28.88%
|  1598 |  clear   | -2.580 | 7.58%
|   703 |  how     | -1.511 | 22.07%
|   284 |  to      | -1.713 | 18.03%
|   466 |  do      | -1.116 | 32.77%
|   326 |  that    | -1.147 | 31.77%
|    13 | .        | -0.663 | 51.54%
|   198 | 
        | -1.139 | 32.01%
|   198 | 
        | -0.002 | 99.84%
|   464 | The

  beam_indices[beam_indices_mask] = 0
  beam_indices[beam_indices_mask] = 0


In [3]:
#Beam Search TASK 1

outputs = model.generate(
    input_ids,
    num_beams=3,
    early_stopping=True,
    max_length=30,
    return_dict_in_generate=True,
    output_scores=True,
)
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
)
# If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
# Tip: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
# use case, you might want to recompute it with `normalize_logits=True`.
output_length = input_length + np.sum(transition_scores.numpy() < 0, axis=1)
length_penalty = model.generation_config.length_penalty
reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)

print(tokenizer.batch_decode(outputs[0], skip_special_tokens=True))

# input_length = 1 if model.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]

perplexity = 0
likelihood = 0

for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # | token | token string | logits | probability
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.detach().numpy():.3f} | {np.exp(score.detach().numpy()):.2%}")
    likelihood += np.log(-1 * score.detach().numpy())
    perplexity += np.log(np.exp(score.detach().numpy()))

print(f"Length of the output: {generated_tokens.shape[1]}")
print(f"Perplexity: {np.exp((-1/ generated_tokens.shape[1]) * perplexity)}")
print(f"likelihood: {likelihood}")

["It might be possible to get a better understanding of how the system works, but it's not going to be easy.\n\nIn the meantime,"]
|   651 |  get     | -3.341 | 3.54%
|   257 |  a       | -1.936 | 14.43%
|  1365 |  better  | -3.269 | 3.80%
|  4547 |  understanding | -1.486 | 22.63%
|   286 |  of      | -0.151 | 86.02%
|   703 |  how     | -1.795 | 16.61%
|   262 |  the     | -1.791 | 16.67%
|  1080 |  system  | -4.110 | 1.64%
|  2499 |  works   | -0.338 | 71.32%
|    11 | ,        | -1.423 | 24.09%
|   475 |  but     | -0.623 | 53.65%
|   340 |  it      | -1.656 | 19.08%
|   338 | 's       | -0.925 | 39.67%
|   407 |  not     | -1.277 | 27.90%
|  1016 |  going   | -2.657 | 7.02%
|   284 |  to      | -0.006 | 99.38%
|   307 |  be      | -1.001 | 36.75%
|  2562 |  easy    | -1.028 | 35.76%
|    13 | .        | -0.485 | 61.58%
|   198 | 
        | -1.203 | 30.02%
|   198 | 
        | -0.001 | 99.91%
|   818 | In       | -3.445 | 3.19%
|   262 |  the     | -1.601 | 20.17%
| 14324 |  meanti

In [4]:
#Top-K Sampling TASK 1

outputs = model.generate(input_ids, do_sample=True, max_length=30, return_dict_in_generate=True, output_scores=True, top_k=50)

print(tokenizer.batch_decode(outputs[0], skip_special_tokens=True))

input_length = 1 if model.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]

transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True)

perplexity = 0
likelihood = 0

for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # | token | token string | logits | probability
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.detach().numpy():.3f} | {np.exp(score.detach().numpy()):.2%}")
    likelihood += np.log(-1 * score.detach().numpy())
    perplexity += np.log(np.exp(score.detach().numpy()))

print(f"Length of the output: {generated_tokens.shape[1]}")
print(f"Perplexity: {np.exp((-1/ generated_tokens.shape[1]) * perplexity)}")
print(f"likelihood: {likelihood}")

["It might be possible to make them fit the bill.\n\nI don't know if you are familiar with the concept of a machine as part of"]
|   787 |  make    | -2.534 | 7.93%
|   606 |  them    | -4.176 | 1.54%
|  4197 |  fit     | -4.550 | 1.06%
|   262 |  the     | -2.868 | 5.68%
|  2855 |  bill    | -2.032 | 13.10%
|    13 | .        | -1.738 | 17.58%
|   198 | 
        | -1.354 | 25.81%
|   198 | 
        | -0.001 | 99.94%
|    40 | I        | -4.203 | 1.50%
|   836 |  don     | -2.558 | 7.74%
|   470 | 't       | -0.001 | 99.95%
|   760 |  know    | -1.129 | 32.34%
|   611 |  if      | -1.584 | 20.51%
|   345 |  you     | -2.672 | 6.91%
|   389 |  are     | -3.962 | 1.90%
|  5385 |  familiar | -1.937 | 14.42%
|   351 |  with    | -0.018 | 98.25%
|   262 |  the     | -0.682 | 50.56%
|  3721 |  concept | -2.215 | 10.91%
|   286 |  of      | -0.337 | 71.42%
|   257 |  a       | -1.128 | 32.36%
|  4572 |  machine | -4.832 | 0.80%
|   355 |  as      | -4.114 | 1.63%
|   636 |  part    | -4.549 |

In [5]:
#Top-P Sampling TASK 1 
#Need to figure out a good value for top_p
#top_p = 4 gave good values but it's supposed to be bounded (0,1)

outputs = model.generate(input_ids, top_p = 0.92, top_k=0, do_sample=True, max_length=30, return_dict_in_generate=True, output_scores=True)
print(tokenizer.batch_decode(outputs[0], skip_special_tokens=True))

input_length = 1 if model.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]

transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True)

perplexity = 0
likelihood = 0

for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # | token | token string | logits | probability
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.detach().numpy():.3f} | {np.exp(score.detach().numpy()):.2%}")
    likelihood += np.log(-1 * score.detach().numpy())
    perplexity += np.log(np.exp(score.detach().numpy()))

print(f"Length of the output: {generated_tokens.shape[1]}")
print(f"Perplexity: {np.exp((-1/ generated_tokens.shape[1]) * perplexity)}")
print(f"likelihood: {likelihood}")

['It might be possible to explain it by having servants work on the code.\n\nWhy does it take 2 hours to code a Feather4U in']
|  4727 |  explain | -5.907 | 0.27%
|   340 |  it      | -3.166 | 4.22%
|   416 |  by      | -1.353 | 25.86%
|  1719 |  having  | -4.316 | 1.34%
| 17523 |  servants | -9.796 | 0.01%
|   670 |  work    | -4.432 | 1.19%
|   319 |  on      | -1.751 | 17.36%
|   262 |  the     | -1.252 | 28.59%
|  2438 |  code    | -6.577 | 0.14%
|    13 | .        | -1.566 | 20.90%
|   198 | 
        | -1.844 | 15.82%
|   198 | 
        | 0.000 | 100.00%
|  5195 | Why      | -4.929 | 0.72%
|   857 |  does    | -2.846 | 5.81%
|   340 |  it      | -1.883 | 15.21%
|  1011 |  take    | -2.318 | 9.85%
|   362 |  2       | -5.077 | 0.62%
|  2250 |  hours   | -1.571 | 20.79%
|   284 |  to      | -0.645 | 52.46%
|  2438 |  code    | -4.366 | 1.27%
|   257 |  a       | -1.374 | 25.31%
| 34501 |  Feather | -9.831 | 0.01%
|    19 | 4        | -8.790 | 0.02%
|    52 | U        | -4.515 | 1.09

  likelihood += np.log(-1 * score.detach().numpy())


**************************************************
Task 2 starts here

In [6]:
#TASK 2
#Load the dataset

from datasets import load_dataset
import torch
dataset = load_dataset("cnn_dailymail", "3.0.0")

max_length = 20

Found cached dataset cnn_dailymail (C:/Users/zebzi/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 3/3 [00:00<00:00, 10.57it/s]


In [7]:
#TASK 2 (switched from a downstream BERT because things were failing)
from evaluate import load
bertscore = load("bertscore")

tokenizer2 = AutoTokenizer.from_pretrained("t5-base")
model2 = AutoModelForSeq2SeqLM.from_pretrained("t5-base", pad_token_id=tokenizer.eos_token_id)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
# input_ids = dataset["test"][:50]
tokenizerInputs =  []
inputGroundTruths = []
for i in range(50):
    tokenizerInputs.append(dataset["test"][i]["article"])
    inputGroundTruths.append(dataset["test"][i]["highlights"])
print(tokenizerInputs[0])

encoder_input_ids = torch.LongTensor()

for i in range(50):
    encoder_input_ids = torch.cat((encoder_input_ids, tokenizer2(tokenizerInputs[i], return_tensors="pt", padding='max_length', truncation=True).input_ids))
print(tokenizer2.decode(encoder_input_ids[0]))

(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, sa

In [9]:
#Task 2 
outputs2 = []
#Greedy Search
outputs2.append(model2.generate(encoder_input_ids, num_beams=1, do_sample=False, max_length=max_length))



In [10]:
#Beam Search
outputs2.append(model2.generate(encoder_input_ids, num_beams=3, early_stopping=True, max_length=max_length))

In [11]:
#Top-K Sampling
outputs2.append(model2.generate(encoder_input_ids, do_sample=True, top_k=50, max_length=max_length))

#max_length = 50 with top_k>=40 gave the "index out of range in self" error

In [12]:
#Top-P Sampling
outputs2.append(model2.generate(encoder_input_ids, do_sample=True, top_p=0.92, top_k=0, max_length=max_length)) 

#max_length = 30 with top_p>=0.8 gave the "index out of range in self" error
#max_length = 50 with top_p>=0.4 gave the "index out of range in self" error

In [13]:
print(tokenizer2.batch_decode(outputs2[0], skip_special_tokens=True)[0])
print(tokenizer2.batch_decode(outputs2[1], skip_special_tokens=True)[0])
print(tokenizer2.batch_decode(outputs2[2], skip_special_tokens=True)[0])
print(tokenizer2.batch_decode(outputs2[3], skip_special_tokens=True)[0])

the Palestinians became the 123rd member of the international criminal court on Wednesday 
the Palestinian Authority officially became the 123rd member of the international criminal court on Wednesday
the transition to international law took place at the Hague on Wednesday. a move
it "brings us closer to our shared goals of justice and peace" formally


In [15]:
references = []
predictions = []
results = []

with open("generatedText6.txt", "w") as f:
    for i in range(len(outputs2[0])):
        references.append([inputGroundTruths[i], inputGroundTruths[i], inputGroundTruths[i], inputGroundTruths[i]])
        predictions.append([tokenizer2.batch_decode(outputs2[0], skip_special_tokens=True)[i], tokenizer2.batch_decode(outputs2[1], skip_special_tokens=True)[i], tokenizer2.batch_decode(outputs2[2], skip_special_tokens=True)[i], tokenizer2.batch_decode(outputs2[3], skip_special_tokens=True)[i]])
        results.append([bertscore.compute(predictions=predictions[i], references=references[i], lang="en")])

        f.write(f"Ground Truth: {inputGroundTruths[i]} \n")
        f.write(f"Greedy Search: {tokenizer2.batch_decode(outputs2[0], skip_special_tokens=True)[i]} \n")
        f.write(f"Beam Search: {tokenizer2.batch_decode(outputs2[1], skip_special_tokens=True)[i]} \n")
        f.write(f"Top-K Sampling: {tokenizer2.batch_decode(outputs2[2], skip_special_tokens=True)[i]} \n")
        f.write(f"Top-P Sampling: {tokenizer2.batch_decode(outputs2[3], skip_special_tokens=True)[i]} \n")
        f.write("\n \n")

#generatedText is with no capped max_length so it defaulted to 20 I think top_p=0.92 and top_k=50
#generatedText2 is with max_length = 30
#generatedText3 is also max_length = 30 but top_p=0.7 ~ 6mins of runtime
#generatedText4 is at max_length = 40 with top_p=0.7 ~5mins of runtime thanks to CPU acceleration
#generatedText5 is at max_length = 50 with top_p=0.3 and top_k=25 ~Xmins of runtime but model_max_length = 1024

Downloading (…)lve/main/config.json: 100%|██████████| 482/482 [00:00<00:00, 284kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:01<00:00, 503kB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 487kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 1.43G/1.43G [13:01<00:00, 1.82MB/s]


In [20]:
for i in range(10):
    print(results[i])

[{'precision': [0.8502079248428345, 0.8599529266357422, 0.8599265217781067, 0.8428065180778503], 'recall': [0.8301665782928467, 0.8464600443840027, 0.8243862986564636, 0.8299463391304016], 'f1': [0.8400676846504211, 0.8531530499458313, 0.8417814373970032, 0.8363269567489624], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.26.1)'}]
[{'precision': [0.8497933149337769, 0.8756840825080872, 0.8550580143928528, 0.849199652671814], 'recall': [0.8177963495254517, 0.832705020904541, 0.8195192813873291, 0.8176928758621216], 'f1': [0.8334878087043762, 0.8536539673805237, 0.8369115591049194, 0.8331485390663147], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.26.1)'}]
[{'precision': [0.8543307185173035, 0.8487553596496582, 0.8374242782592773, 0.8458214402198792], 'recall': [0.8133378624916077, 0.830542802810669, 0.8214253187179565, 0.8297159671783447], 'f1': [0.8333304524421692, 0.8395503759384155, 0.8293476700782776, 0.8376913070678711], 'hashcode': 'roberta-lar