This notebook is mainly used to demonstrate out project

In [1]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, Adafactor
from transformers import T5ForConditionalGeneration,T5Tokenizer
import seaborn as sns
import matplotlib.pyplot as plt
import random
import torchvision.models as models

  from .autonotebook import tqdm as notebook_tqdm


# Question generation

In [2]:
# Lead the model
model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_boolean_questions')
tokenizer = T5Tokenizer.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        
# generator type       
def topkp_decoding (inp_ids,attn_mask):
    topkp_output = model.generate(input_ids=inp_ids,
            attention_mask=attn_mask,
            max_length=256,
            do_sample=True,
            top_k=40,
            top_p=0.80,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True
           )
    Questions = [tokenizer.decode(out, skip_special_tokens=True,clean_up_tokenization_spaces=True) for out in topkp_output]
    return [Question.strip().capitalize() for Question in Questions]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
recipe = r"""
In a blender, puree all the vinaigrette ingredients until the mixture is smooth and well incorporated. Toss the arugula in a large bowl with the vinaigrette until the leaves are well coated. Arrange the persimmon slices on two serving plates, then top them with the dressed greens and serve."""

print("Recipe name: Arugula and Persimmon Salad with Pistachio Vinaigrette")
print(recipe)

set_seed(100)
Qs = []
# generate two qs for each answer
for truefalse in ['yes', 'no']:
    text = "truefalse: %s passage: %s </s>" % (truefalse, recipe)
    encoding = tokenizer.encode_plus(text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    output = topkp_decoding(input_ids,attention_masks)
    Qs.append(output)
    print ("Question", output, "answer: ",truefalse)


Recipe name: Arugula and Persimmon Salad with Pistachio Vinaigrette

In a blender, puree all the vinaigrette ingredients until the mixture is smooth and well incorporated. Toss the arugula in a large bowl with the vinaigrette until the leaves are well coated. Arrange the persimmon slices on two serving plates, then top them with the dressed greens and serve.


  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


Question ['Is arugula and persimmon the same thing?'] answer:  yes
Question ['Is persimmon vinaigrette the same as dressing?'] answer:  no


# Question generation tricky case

In [4]:
recipe2 = r"""
Bring 9 cups water to a boil in a medium pot over high. Add salt and reduce heat to medium-low. Stirring constantly with a wooden spoon, gradually stream in polenta. Cook, stirring often, until thick and creamy, 30â€“35 minutes. Remove from heat and stir in Parmesan and pepper.salt and red pepper flakes and scatter herbs over.
"""
print("Recipe name: Big-Batch Parmesan Polenta")
print(recipe2)

Recipe name: Big-Batch Parmesan Polenta

Bring 9 cups water to a boil in a medium pot over high. Add salt and reduce heat to medium-low. Stirring constantly with a wooden spoon, gradually stream in polenta. Cook, stirring often, until thick and creamy, 30â€“35 minutes. Remove from heat and stir in Parmesan and pepper.salt and red pepper flakes and scatter herbs over.



In [5]:
for seeds in [41, 43]:
    set_seed(seeds)
    
    for truefalse in ['yes', 'no']:
        text = "truefalse: %s passage: %s </s>" % (truefalse, recipe2)
        encoding = tokenizer.encode_plus(text, return_tensors="pt")
        input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

        output = topkp_decoding(input_ids,attention_masks)
        print ("Question", output, "answer: ",truefalse)
    print()

Question ['Does polenta have to be simmered to thicken?'] answer:  yes
Question ['Does the polenta have to be melted?'] answer:  no

Question ['Does polenta have to be cooked to thaw?'] answer:  yes
Question ['Is polenta the same as pfizer polar?'] answer:  no



# Question answering

In [6]:
def encode_data(tokenizer, questions, passages, max_length):
    """Encode the question/passage pairs into features than can be fed to the model."""
    input_ids = []
    attention_masks = []

    for question, passage in zip(questions, passages):
        encoded_data = tokenizer.encode_plus(question, passage, max_length=max_length, pad_to_max_length=True, truncation_strategy="longest_first")
        encoded_pair = encoded_data["input_ids"]
        attention_mask = encoded_data["attention_mask"]

        input_ids.append(encoded_pair)
        attention_masks.append(attention_mask)

    return np.array(input_ids), np.array(attention_masks)

In [11]:
# model.load_state_dict(torch.load('model/2ksamplesAcc55.h5'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
answering_model = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/roberta-base-boolq")
answering_model.load_state_dict(torch.load('../models/robertaBool6175.h5', map_location=device))
tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/roberta-base-boolq")
answering_model.to(device)

def predict(question, passage):
    sequence = tokenizer.encode_plus(question, passage, return_tensors="pt")['input_ids'].to(device)
  
    logits = answering_model(sequence)[0]
    probabilities = torch.softmax(logits, dim=1).detach().cpu().tolist()[0]
    proba_yes = round(probabilities[1], 2)
    proba_no = round(probabilities[0], 2)

    print(f"Question: {question}, Yes: {proba_yes}, No: {proba_no}")

print(recipe, "\n")
for i in Qs:
    predict(i[0], recipe)


In a blender, puree all the vinaigrette ingredients until the mixture is smooth and well incorporated. Toss the arugula in a large bowl with the vinaigrette until the leaves are well coated. Arrange the persimmon slices on two serving plates, then top them with the dressed greens and serve. 

Question: Is arugula and persimmon the same thing?, Yes: 0.4, No: 0.6
Question: Is persimmon vinaigrette the same as dressing?, Yes: 0.31, No: 0.69


<h3>Your questions about the recipe </h3>

In [12]:
while True:
    q = input("Enter your questions? ")
    if(q == "quit"):
        print("Good bye!!")
        break
    predict(q, recipe)
    print()

Enter your questions? do i need a blender
Question: do i need a blender, Yes: 0.41, No: 0.59

Enter your questions? Do you need a blender
Question: Do you need a blender, Yes: 0.47, No: 0.53

Enter your questions? blender
Question: blender, Yes: 0.52, No: 0.48

Enter your questions? is Arrange the persimmon slices on two serving plates
Question: is Arrange the persimmon slices on two serving plates, Yes: 0.53, No: 0.47

Enter your questions? are the leaves well coated
Question: are the leaves well coated, Yes: 0.43, No: 0.57

Enter your questions? well coated
Question: well coated, Yes: 0.45, No: 0.55

Enter your questions? coated
Question: coated, Yes: 0.47, No: 0.53

Enter your questions? well
Question: well, Yes: 0.43, No: 0.57

Enter your questions? quit
Good bye!!
