# Run the whole model on some examples in the NewsQA Dataset

## i. Preprocess the dataset and get rid of bad entries

In [None]:
import pandas as pd

# Load the raw NewsQA question table; the unfiltered frame keeps its own name
# so the filtering below can be re-run or inspected without re-reading the CSV.
newsqa_raw = pd.read_csv("newsqa-data-v1.csv")

# Coerce both flag columns to numbers before comparing. The previous filter
# matched `is_question_bad` against the string '0.0', which only works for rows
# pandas parsed as text; any row parsed as a float was dropped silently.
# Values that are not numeric become NaN and therefore never equal 0.
question_bad_flag = pd.to_numeric(newsqa_raw["is_question_bad"], errors="coerce")
answer_absent_flag = pd.to_numeric(newsqa_raw["is_answer_absent"], errors="coerce")

# Keep rows that have validated answers, a question not flagged as bad,
# and an answer that is present in the story.
df = newsqa_raw[
    newsqa_raw["validated_answers"].notna()
    & (question_bad_flag == 0)
    & (answer_absent_flag == 0)
]
df

Unnamed: 0,story_id,question,answer_char_ranges,is_answer_absent,is_question_bad,validated_answers
0,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,What was the amount of children murdered?,294:297|None|None,0.0,0.0,"{""none"": 1, ""294:297"": 2}"
2,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,who did say South Africa did not issue a visa ...,103:127|114:127|839:853,0.0,0.0,"{""839:853"": 1, ""103:127"": 2}"
4,./cnn/stories/13012604e3203c18df09289dfedd14cd...,What frightened the families?,690:742|688:791|630:646,0.0,0.0,"{""688:791"": 2, ""690:742"": 1}"
6,./cnn/stories/d312173b8c95cc6c206a32cc0acd8a92...,Who is hiring?,"334:345|292:297,372:379|4045:4079|301:324",0.0,0.0,"{""301:324"": 2}"
9,./cnn/stories/318f71eba1831f330d423043827aa24e...,Who is Radu Mazare?,"196:228|196:202,217:228|196:205,217:228",0.0,0.0,"{""196:228"": 2}"
...,...,...,...,...,...,...
119606,./cnn/stories/5619cb4eb06709d10613f010fb88ec55...,What does reduce the risk of HPV infection?,"0:4,102:134,12:56,4:8,56:68,79:95,8:12,95:102|...",0.0,0.0,"{""611:636"": 2}"
119607,./cnn/stories/5245fa11b56ff0161822c9b2bc038a7f...,Where did the study take place?,None|1054:1088|239:249,0.0,0.0,"{""1054:1088"": 2}"
119623,./cnn/stories/68f3087b6d588d77d02726e206fa0305...,What did she tell CNN?,"707:772|2463:2475,2987:3004|707:772,812:986",0.0,0.0,"{""707:772"": 1, ""812:986"": 2}"
119626,./cnn/stories/5e7c990b12d43b077d476413a16c05fa...,what does Soufan's book argue against?,2682:2806|2700:2806|2709:2840,0.0,0.0,"{""2709:2840"": 2}"


## ii. Get 5 articles, questions and answers for demonstration

In [None]:
import os 
import ast
import random
import io 

def _slice_span(text, span_key):
    """Return the text covered by a character-span key, or None if it is not a span.

    `span_key` is either "start:end" or several such ranges joined by commas
    (the form used in `answer_char_ranges`); the slices of all sub-ranges are
    concatenated. Labels that are not numeric ranges (e.g. "none") yield None.
    """
    pieces = []
    for sub_range in span_key.split(","):
        start, sep, end = sub_range.partition(":")
        if not sep or not start.strip().isdigit() or not end.strip().isdigit():
            return None
        pieces.append(text[int(start):int(end)])
    return "".join(pieces)


def _most_voted_answer(text, validated_answers):
    """Return the answer text whose span received the most validator votes.

    `validated_answers` is the string form of a {span_or_label: votes} mapping.
    Ties keep the first span encountered; "" is returned when no key is a span.
    """
    best_votes = 0
    best_answer = ""
    for span_key, votes in ast.literal_eval(validated_answers).items():
        if votes <= best_votes:
            continue
        candidate = _slice_span(text, span_key)
        if candidate is None:
            # Non-span labels carry no text to extract.
            continue
        best_votes, best_answer = votes, candidate
    return best_answer


random.seed(14)  # fixed seed so the same demonstration rows are drawn each run
num_examples = 5

articles = []
questions = []
answers = []

for _ in range(num_examples):
    # Sample a row position from the filtered frame. The bound is derived from
    # the frame itself rather than a hard-coded row count.
    row = df.iloc[random.randrange(len(df))]
    story_path = row["story_id"]

    with io.open(story_path, mode="r", encoding="utf-8") as story_file:
        story_text = story_file.read()

    # Same newline substitution as before; answer offsets are applied to this text.
    story_text = story_text.replace('\n', '  ')

    # The article is everything before the first "@highlight" marker. When the
    # marker is missing, find() returns -1 and slicing with it would drop the
    # last character, so fall back to the full text.
    highlight_idx = story_text.find("@highlight")
    body = story_text if highlight_idx == -1 else story_text[:highlight_idx]

    articles.append(" ".join(body.split()))
    questions.append(row["question"])
    # Option 1: take the answer from `validated_answers` (most votes wins).
    answers.append(_most_voted_answer(story_text, row["validated_answers"]))

    # Option 2 (not used): recover every crowd-worker answer instead by
    # iterating over row["answer_char_ranges"].split('|'), skipping "None",
    # and concatenating the story_text[start:end] slice of each
    # comma-separated sub-range.

## iii. For each article, use the fine-tuned T5 model to generate a summary

In [None]:
import os
import openai
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

# Summariser weights come from the "t5_small_news" checkpoint; the tokenizer is
# the stock "t5-small" one.
model = TFT5ForConditionalGeneration.from_pretrained("t5_small_news")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Decoding settings shared by every article (beam search with 4 beams).
# NOTE(review): temperature/top_k are sampling settings; with do_sample left
# off they presumably have no effect here - confirm against the installed
# transformers version before relying on them.
generation_options = dict(
    min_length=80,
    max_length=180,
    temperature=0.8,
    top_k=45,
    no_repeat_ngram_size=4,
    num_beams=4,
    early_stopping=True,
)

summaries = []

for article in articles:
    # T5 expects a task prefix; inputs longer than 1024 tokens are truncated.
    encoded = tokenizer(
        "summarize: " + article,
        max_length=1024,
        truncation=True,
        return_tensors='tf',
    )

    generated_ids = model.generate(input_ids=encoded['input_ids'], **generation_options)

    # Only the first returned sequence is decoded.
    summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    summaries.append(summary)

    # Show the first 300 characters of the article next to its summary.
    print(f"article = {article[:300]} ...")
    print(f"summary = {summary}\n")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5_small_news.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


article = (CNN) -- Football legend Diego Maradona had his earrings seized by Italian authorities Friday to help pay off his back taxes, according to media reports. Diego Maradona sports expensive-looking earrings during a recent football match. The current coach of Argentina's national team was staying at a c ...
summary = Diego Maradona's earrings seized by Italian authorities to help pay off his back taxes. The football legend was staying at a clinic in northern Italy to lose weight when finance police swooped. The 48-year-old says the Serie A club should have paid the taxes. He is under intense pressure after a dismal World Cup qualifying campaign

article = DIR VALLEY, Pakistan (CNN) -- The rugged beauty of the expansive Dir Valley can mask the detail of what is happening on the ground. Pakistani soldiers look on from a mountain during a patrol in the troubled area of Maidan. In June, the Pakistani military organized a media tour to areas of the Lower  ...
summary = Pakistani milit

## iv. Reformat the summary and question to pass into the GPT-3 model

In [None]:
# Identifier of the fine-tuned completion model used for question answering.
ft_qa = "ada:ft-personal-2022-05-07-22-50-48"


def apply_ft_qa_answer(context, question, answering_model):
    """Ask a completion model to answer `question` given `context`.

    The prompt is the context, then "Question: <question>", then "Answer:",
    each on its own line. Decoding uses temperature 0, at most 30 tokens, and
    stops at the first '.' or newline.

    Args:
        context: Text the answer should be drawn from.
        question: The question to answer.
        answering_model: Model identifier passed to the completion endpoint.

    Returns:
        The raw text of the first completion choice (not stripped).
    """
    request = {
        "model": answering_model,
        "prompt": f"{context}\nQuestion: {question}\nAnswer:",
        "max_tokens": 30,
        "temperature": 0,
        "top_p": 1,
        "n": 1,
        "stop": ['.', '\n'],
    }
    response = openai.Completion.create(**request)
    first_choice = response['choices'][0]
    return first_choice['text']

In [None]:
# Answer each question from its article's summary (not the full article),
# pairing summaries and questions by position.
outputs = [
    apply_ft_qa_answer(summary, question, ft_qa)
    for summary, question in zip(summaries, questions)
]

In [None]:
# Print each reference answer next to the model's answer, one blank line
# between pairs.
for answer, output in zip(answers, outputs):
    print(
        f"actual answer: {answer}",
        f"model's answer: {output}\n",
        sep="\n",
    )

actual answer: the Serie A club should have paid the taxes. 
model's answer:  the Serie A club should have paid the taxes

actual answer: Sufi Muhammad 
model's answer:  Sufi Muhammad brokered a short-lived deal between the Taliban and the government

actual answer: Rashid Rauf, 
model's answer:  The men arrested range in age from a youth in his mid- to late teens to a 41-year-old

actual answer: Erin Andrews 
model's answer:  Erin Andrews

actual answer: he regrets describing her as "wacko." 
model's answer:  he regrets describing her as "wacko" NEW: The defense attorney says he regrets describing her as "wacko" NEW: He says

