In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 18.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 70.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 71.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration
from transformers.models.auto.processing_auto import AutoTokenizer
import pandas as pd
import time
import os
from google.colab import drive
drive.mount('/content/drive')
import torch

Mounted at /content/drive


In [None]:
class QuestionDataset(Dataset):
    def __init__(self, tokenizer, data_dir, type_path, max_len=30):
        self.path = os.path.join(data_dir, type_path + '.csv')
        self.ans = 'answers'
        self.inp = 'context'
        self.out = 'question'
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _build(self):
        for idx in range(len(self.data)):
            answer_text, input_text, output_text = self.data.loc[idx,self.ans], self.data.loc[idx, self.inp],self.data.loc[idx, self.out]
            input_ = "answer: %s  context: %s" % (answer_text, input_text)
            target = "%s" % (output_text)

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=200, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=20, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')
dataset_val = QuestionDataset(tokenizer, '/content/drive/MyDrive/AIN311Project/data/QuestionG', 'valid',  max_len=400)
loader = DataLoader(dataset_val, batch_size=32, shuffle=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
it = iter(loader)

In [None]:
batch = next(it)
batch["source_ids"].shape

torch.Size([32, 200])

In [None]:
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/AIN311Project/Models/t5-base-question-generation_custom_20")


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
def greedy_decoding (inp_ids,attn_mask):
    greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask, max_length=256)
    Question =  tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
    return Question.strip().capitalize()

In [None]:
article = '''context: Artificial intelligence was founded as an academic discipline in 1956, and in the years since has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success and renewed funding.'''

In [None]:
start = time.time()
encoding = tokenizer.encode_plus(article, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
print(article)
output = greedy_decoding(input_ids,attention_masks)
print ("Generated Question: \n ",output)
end = time.time()
print ("\nTime elapsed ", end-start)
print ("\n")

context: Artificial intelligence was founded as an academic discipline in 1956, and in the years since has experienced several waves of optimism, followed by disappointment and the loss of funding (known as an "AI winter"), followed by new approaches, success and renewed funding.
Generated Question: 
  What caused the "ai winter?"

Time elapsed  4.429373264312744




In [None]:
outs = model.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=40)

dec = [tokenizer.decode(ids , skip_special_tokens=True,clean_up_tokenization_spaces=True) for ids in outs]

texts = [tokenizer.decode(ids , skip_special_tokens=True,clean_up_tokenization_spaces=True) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids , skip_special_tokens=True,clean_up_tokenization_spaces=True) for ids in batch['target_ids']]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print ("device ",device)
model = model.to(device)

In [None]:
for i in range(len(texts)):
    lines = textwrap.wrap("\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nActual question: %s" % dec[i])
    print("Predicted question: %s" % targets[i])
    print("=====================================================================\n")

 answer: Florida legislature context: The league announced on October 16, 2012, that the two
finalists were Sun Life Stadium and Levi's Stadium. The South Florida/Miami area has previously
hosted the event 10 times (tied for most with New Orleans), with the most recent one being Super
Bowl XLIV in 2010. The San Francisco Bay Area last hosted in 1985 (Super Bowl XIX), held at Stanford
Stadium in Stanford, California, won by the home team 49ers. The Miami bid depended on whether the
stadium underwent renovations. However, on May 3, 2013, the Florida legislature refused to approve
the funding plan to pay for the renovations, dealing a significant blow to Miami's chances.

Actual question: Which state refused to approve the funding plan for the renovations at Levi's Stadium?
Predicted question: What was the entity that stepped in and caused Miami's Sun Life Stadium to no longer

 answer: Levi's Stadium context: Super Bowl 50 was an American football game to determine the
champion of the Na

In [None]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
reference  = []
candidate =[]
for i in range(len(dec)):
  a = dec[i].split(" ")
  reference.append(a)
print(reference)
scores = []
for i in targets:
  candidate = i.split()
  score = sentence_bleu(reference, candidate)
  scores.append(score)
  print(candidate)
  print(score)

[['Which', 'state', 'refused', 'to', 'approve', 'the', 'funding', 'plan', 'for', 'the', 'renovations', 'at', "Levi's", 'Stadium?'], ['Where', 'was', 'the', 'Super', 'Bowl', '50', 'game', 'played?'], ['When', 'did', 'the', 'Florida', 'legislature', 'refuse', 'to', 'approve', 'the', 'funding', 'plan', 'to', 'pay', 'for', 'the', 'renovations?'], ['Which', 'was', 'the', 'most', 'watched', 'U.S.', 'broadcast', 'ever?'], ['When', 'did', 'the', 'league', 'announce', 'that', 'the', 'two', 'finalists', 'were', 'Sun', 'Life', 'Stadium', 'and', "Levi's", 'Stadium?'], ['What', 'was', 'the', 'theme', 'of', 'the', '50th', 'Super', 'Bowl?'], ['In', 'what', 'year', 'did', 'the', 'San', 'Francisco', 'Bay', 'Area', 'last', 'host', 'a', 'Super', 'Bowl?'], ['Which', 'city', 'did', 'the', 'league', 'narrow', 'down', 'its', 'bids', 'to?'], ['Which', 'team', 'did', 'the', 'Panthers', 'defeat', '49–15', 'in', 'the', 'NFC', 'Championship', 'Game?'], ['How', 'many', 'sacks', 'did', 'Von', 'Miller', 'have', 'to'

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
