<h2>This models generate questions from recipes</h2>

In [1]:
import time
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set up the model

def set_seed(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# set_seed(42)

model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_boolean_questions')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print ("device ",device)
model = model.to(device)


# diffrent methods for generation
def greedy_decoding (inp_ids,attn_mask):
  greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask, max_length=256)
  Question =  tokenizer.decode(greedy_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
  return Question.strip().capitalize()


def beam_search_decoding (inp_ids,attn_mask):
  beam_output = model.generate(input_ids=inp_ids,
                                 attention_mask=attn_mask,
                                 max_length=256,
                               num_beams=10,
                               num_return_sequences=1,
                               no_repeat_ngram_size=2,
                               early_stopping=True
                               )
  Questions = [tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True) for out in
               beam_output]
  return [Question.strip().capitalize() for Question in Questions]


def topkp_decoding (inp_ids,attn_mask):
  topkp_output = model.generate(input_ids=inp_ids,
                                 attention_mask=attn_mask,
                                 max_length=256,
                               do_sample=True,
                               top_k=40,
                               top_p=0.80,
                               num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                early_stopping=True
                               )
  Questions = [tokenizer.decode(out, skip_special_tokens=True,clean_up_tokenization_spaces=True) for out in topkp_output]
  return [Question.strip().capitalize() for Question in Questions]


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
# Load pandas
import pandas as pd

# Read CSV file into DataFrame df
FileName = '../data/Food Ingredients and Recipe Dataset with Image Name Mapping.csv'
df = pd.read_csv(FileName, index_col=0)

# Keep recipes of length between 256 and 512, and clean data
cleaned_recipes = [recipe for recipe in df['Instructions'] if not isinstance(recipe, float)]
data = [recipe for recipe in cleaned_recipes if len(recipe) < 512 and len(recipe) > 256]
print(type(data))

<class 'list'>


In [8]:
len(data)

2018

In [9]:
question1 = [] # 'Yes' questions
question2 = [] # 'No' questions

# Selects the numner of quiestions to generate for each {"Yes", "No"}
NumberOfQuestions = 10
max_len = 256

for recipe in data[:NumberOfQuestions]:  
  # Generate a question that would be answered 'yes'
    truefalse = 'yes'
    text = "truefalse: %s passage: %s </s>" % (truefalse, recipe)
    encoding = tokenizer.encode_plus(text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    output = topkp_decoding(input_ids,attention_masks)
    # print ("Output", output, truefalse)
    question1.append([output[0], truefalse])

    # Generate a question that would be answered 'no'
    truefalse = 'no'
    text = "truefalse: %s passage: %s </s>" % (truefalse, recipe)
    encoding = tokenizer.encode_plus(text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    output = topkp_decoding(input_ids,attention_masks)
    print ("Output", output, truefalse)
    question2.append([output[0], truefalse])
  

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


Output ['Is bourbon and british whiskey the same thing?'] no
Output ['Is chamomile tea the same as agave nectar?'] no
Output ['Do you need to use grand marnier for summer?'] no
Output ['Do mangoes go in a ketchup bowl?'] no
Output ['Do you have to put shrimp in sauce?'] no
Output ['Can you make spiced syrup 1 week ahead?'] no
Output ['Does the salmon have to be cooked to cook?'] no
Output ['Is the tomato purée the same as the roe?'] no
Output ['Is it necessary to fry onions in oil?'] no
Output ['Do you have to boil the coconut cream in a rice cooker?'] no


In [10]:
# Saves the questions into vsv files
from collections import Counter

dict2 = {'Recipe': data[:NumberOfQuestions], 'Question': [q[0] for q in question1], 'label':[ans[1] for ans in question1]}
dict3 = {'Recipe': data[:NumberOfQuestions], 'Question': [q[0] for q in question2], 'label':[ans[1] for ans in question2]}
combined = {**dict2, **dict3}
df = pd.DataFrame(dict2)
df2 = pd.DataFrame(dict3)
mydict = [dict2,dict3]
# df = pd.DataFrame(mydict).to_csv('out.csv', index=False)
# saving the dataframe
df.to_csv('../data/generatedQuestionsYes.csv')
df2.to_csv('../data/generatedQuestionsNo.csv')

The yes and no questions were then manually combined after being converted to 0 for no and 1 for yes