In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration
from transformers.models.auto.processing_auto import AutoTokenizer
import pandas as pd
import time
import os
from google.colab import drive
drive.mount('/content/drive')
import torch

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
class QuestionDataset(Dataset):
    def __init__(self, tokenizer, data_dir, type_path, max_len=30):
        self.path = os.path.join(data_dir, type_path + '.csv')
        self.ans = 'answers'
        self.inp = 'context'
        self.out = 'question'
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _build(self):
        for idx in range(len(self.data)):
            answer_text, input_text, output_text = self.data.loc[idx,self.ans], self.data.loc[idx, self.inp],self.data.loc[idx, self.out]
            input_ = "answer: %s  context: %s" % (answer_text, input_text)
            target = "%s" % (output_text)

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=200, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=20, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')
dataset_val = QuestionDataset(tokenizer, '/content/drive/MyDrive/AIN311Project/data/QuestionG', 'valid',  max_len=400)
loader = DataLoader(dataset_val, batch_size=32, shuffle=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [5]:
it = iter(loader)

In [6]:
batch = next(it)
batch["source_ids"].shape

torch.Size([32, 200])

In [7]:
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/AIN311Project/Models/t5-base-question-generation_custom_20")


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [9]:
def greedy_decoding (inp_ids,attn_mask):
    greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask, max_length=256)
    Question =  tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
    return Question.strip().capitalize()

In [10]:
article = '''context: Artificial intelligence was founded as an academic discipline in 1956, and in the years since has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success and renewed funding.'''

In [11]:
start = time.time()
encoding = tokenizer.encode_plus(article, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
print(article)
output = greedy_decoding(input_ids,attention_masks)
print ("Generated Question: \n ",output)
end = time.time()
print ("\nTime elapsed ", end-start)
print ("\n")

context: Artificial intelligence was founded as an academic discipline in 1956, and in the years since has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success and renewed funding.
Generated Question: 
  What caused the "ai winter?"

Time elapsed  1.8447039127349854




In [12]:
outs = model.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=40)

dec = [tokenizer.decode(ids , skip_special_tokens=True,clean_up_tokenization_spaces=True) for ids in outs]

texts = [tokenizer.decode(ids , skip_special_tokens=True,clean_up_tokenization_spaces=True) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids , skip_special_tokens=True,clean_up_tokenization_spaces=True) for ids in batch['target_ids']]

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print ("device ",device)
model = model.to(device)

In [14]:
for i in range(len(texts)):
    lines = textwrap.wrap("\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nActual question: %s" % dec[i])
    print("Predicted question: %s" % targets[i])
    print("=====================================================================\n")

 answer: New England Patriots context: The Panthers finished the regular season with a 15–1 record,
and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona
Cardinals 49–15 in the NFC Championship Game and advanced to their second Super Bowl appearance
since the franchise was founded in 1995. The Broncos finished the regular season with a 12–4 record,
and denied the New England Patriots a chance to defend their title from Super Bowl XLIX by defeating
them 20–18 in the AFC Championship Game. They joined the Patriots, Dallas Cowboys, and Pittsburgh
Steelers as one of four teams that have made eight appearances in the Super Bowl.

Actual question: Who did the Broncos deny their chance to defend their title from Super Bowl XLIX?
Predicted question: Who did Denver beat in the AFC championship?

 answer: San Francisco context: The league eventually narrowed the bids to three sites: New Orleans'
Mercedes-Benz Superdome, Miami's Sun Life Stadium, and t

In [34]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
reference  = []
candidate =[]
for i in range(len(dec)):
  a = dec[i].split(" ")
  reference.append(a)
print(reference)
scores = []
for i in targets:
  candidate = i.split()
  score = sentence_bleu(reference, candidate)
  scores.append(score)
  print(candidate)
  print(score)

[['Who', 'did', 'the', 'Broncos', 'deny', 'their', 'chance', 'to', 'defend', 'their', 'title', 'from', 'Super', 'Bowl', 'XLIX?'], ['Where', 'did', 'the', 'Bay', "Area's", "Levi's", 'Stadium', 'get', 'its', 'name?'], ['How', 'many', 'teams', 'finished', 'the', 'regular', 'season', 'with', 'a', '12–4', 'record?'], ['What', 'did', 'the', 'Broncos', 'recover', 'for', 'a', 'touchdown?'], ['What', 'was', 'the', 'title', 'of', 'the', 'game', 'at', 'which', 'time?'], ['When', 'did', 'the', 'Florida', 'legislature', 'refuse', 'to', 'approve', 'the', 'funding', 'plan', 'to', 'pay', 'for', 'the', 'renovations?'], ['In', 'what', 'year', 'did', 'the', 'National', 'Football', 'League', '(NFL)', 'determine', 'which', 'season?'], ['Which', 'stadium', 'in', 'Miami', 'was', 'the', 'most', 'likely', 'candidate', 'for', 'a', 'spot', 'in', 'the', 'NFL?'], ['Which', 'team', 'did', 'the', 'Panthers', 'defeat', '49–15', 'in', 'the', 'NFC', 'Championship', 'Game?'], ['Which', 'team', 'did', 'the', 'Panthers', 

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [35]:
print(np.mean(scores))

0.07667392897348302


In [36]:
bleu1 = []
bleu2 = []
bleu3 = []
bleu4 = []
for i in targets:
  candidate = i.split()
  bleu1.append(sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
  bleu2.append(sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
  bleu3.append(sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0)))
  bleu4.append(sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)))


  scores.append(score)
print("bleu1 = " , np.mean(bleu1))
print("bleu2 = " , np.mean(bleu2))
print("bleu3 = " , np.mean(bleu3))
print("bleu4 = " , np.mean(bleu4))

bleu1 =  0.6874034821677186
bleu2 =  0.4461930050933538
bleu3 =  0.28341977247618044
bleu4 =  0.07667392897348302


In [18]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert_score
  Downloading bert_score-0.3.12-py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 3.0 MB/s 
Installing collected packages: bert-score
Successfully installed bert-score-0.3.12


In [21]:
from evaluate import load
bertscore = load("bertscore")

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

In [23]:
targets

['Who did Denver beat in the AFC championship?',
 'What was the third city that was considered?',
 "What were the win/loss game stats for the Denver Bronco's regular",
 'Which Newton turnover resulted in seven points for Denver?',
 'If Roman numerals were used, what would Super Bowl 50 have been called?',
 'On what date did the Florida legislature decide against the plan to renovate the Miami stadium?',
 'Super Bowl 50 determined the NFL champion for what season?',
 'What is the name of the stadium in Miami that was considered?',
 'Who did the Panthers beat to become the NFC champs?',
 'Who did the Panthers beat in the NFC Championship Game?',
 'What venue in Miami was a candidate for the site of Super Bowl 50?',
 'Who did Carolina beat in the NFC championship game?',
 'What is the name of the stadium in San Francisco Bay Area?',
 'How many times was Cam Newton sacked?',
 'When were the finalists announced?',
 'Who won Super Bowl 50?',
 'When were the two finalists for hosting Super Bo

In [24]:
dec

['Who did the Broncos deny their chance to defend their title from Super Bowl XLIX?',
 "Where did the Bay Area's Levi's Stadium get its name?",
 'How many teams finished the regular season with a 12–4 record?',
 'What did the Broncos recover for a touchdown?',
 'What was the title of the game at which time?',
 'When did the Florida legislature refuse to approve the funding plan to pay for the renovations?',
 'In what year did the National Football League (NFL) determine which season?',
 'Which stadium in Miami was the most likely candidate for a spot in the NFL?',
 'Which team did the Panthers defeat 49–15 in the NFC Championship Game?',
 'Which team did the Panthers defeat 49–15 in the NFC Championship Game?',
 'Which stadium in Miami was the most likely candidate for a spot in the NFL?',
 'Which team did the Panthers defeat 49–15 in the NFC Championship Game?',
 'Which stadium in the San Francisco Bay Area was the most likely to be the winner?',
 'How many times did the Broncos sack 

In [25]:
predictions = targets
references = dec
results = bertscore.compute(predictions=predictions, references=references, lang="en")

In [32]:
print("Mean Precision = " , np.mean(results["precision"]))
print("Mean Recall = " , np.mean(results["recall"]))
print("Mean f1 = " , np.mean(results["f1"]))

Mean Precision =  0.9202339667826891
Mean Recall =  0.9116644002497196
Mean f1 =  0.9157126639038324
