In [1]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

qa_tok = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
# qa_tok = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2")
# qa_model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

def answer_question(question, context, tokenizer=qa_tok, model=qa_model):
    input_str = f"[CLS] {question} [SEP] {context} [SEP]" 
    inputs = tokenizer(input_str, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    answer = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
    
    return {"question": question, "answer": answer, "start_index": answer_start_index}



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import pandas as pd

QANTA_TRAIN = open("qanta.train.2018.04.18.json")
WIKI_LOOKUP = open("wiki_lookup.json")

data = json.load(QANTA_TRAIN)
wiki = json.load(WIKI_LOOKUP)
df = pd.json_normalize(data["questions"])

In [26]:
from nltk.tokenize import sent_tokenize, word_tokenize
from keybert import KeyBERT
from textsearch import TextSearch

CONTEXT_SIZE = 512
CONTEXT_STRIDE = 430

def chunkz(text, size=CONTEXT_SIZE, stride=CONTEXT_STRIDE):
    chunks = []
    start, end = 0, size
    
    while end < len(text):
        chunks.append(text[start:end])
        start += stride
        end += stride
    
    chunks.append(text[start:-1])
    return chunks

kw_model = KeyBERT()

def query_chunks(query, chunks, num_hits=5):
    keywords = kw_model.extract_keywords(query)
    hits = []
    
    if keywords == []: return []
    for i, chunk in enumerate(chunks):
        ts = TextSearch(case="ignore", returns="match")
        for keyword, prob in keywords: ts.add(keyword)
        res = ts.findall(chunk)
        
        hits.append((len(res), i))
    
    return sorted(hits, key=lambda tup: tup[0]-tup[1]/100, reverse=True)[:num_hits]
        
from transformers import AutoModelWithLMHead, AutoTokenizer
import string

toQ_tok = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
toQ_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")

def get_question(answer, context, tokenizer=toQ_tok, model=toQ_model, max_length=200):
    input_text = "answer: %s  context: %s </s>" % (answer, context)
    features = tokenizer([input_text], return_tensors='pt')

    output = model.generate(input_ids=features['input_ids'], 
               attention_mask=features['attention_mask'],
               max_length=max_length)

    return tokenizer.decode(output[0])[15:-4]

def is_correct_answer(real_answer, model_answer):
    real_answer = real_answer.translate(str.maketrans('', '', string.punctuation))
    model_answer = model_answer.translate(str.maketrans('', '', string.punctuation))
    if real_answer in model_answer or model_answer in real_answer:
        return True
    return False

import openai
openai.api_key = "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"

def get_gpt_clues(topic):
    prompt = f"Give a numbered list of 5 sentences about '{topic}' consisting of 5 clues (1 sentence per clue) without mentioning '{topic}' in any of them. '{topic}' must not be mentioned. The first clue should be very hard such that an expert may not know it, and the last clue should still be somewhat hard but easier. Thus, clues should be in descending order of difficulty. Make sure that the numbered list is in the format 1: 'clue'"
    completions = openai.Completion.create(engine="text-davinci-003", prompt=prompt, max_tokens=2048, n=1,stop=None,temperature=0.2)
    result = completions.choices[0].text
    return [ele[3:] for ele in result.split("\n") if ele != ""]
# print(get_gpt_clues("The Last Supper"))

def evaluate_question(clues, contexts):
    ret = []
    for i, clue in enumerate(clues):
#         clue_as_question = get_question(answer=df_answer, context=clue)
        print(f"\tClue {i}: {clue}")
#         print(f"\tClue as Question: {clue_as_question}")
        
        hits = query_chunks(clue, contexts, num_hits=8)
#         print(hits)

        found_answer = False
        for contained_kw, chunk_num in hits:
            results = answer_question(clue, contexts[chunk_num])
            answer = results["answer"]
#             print(chunk_num, answer)
            
#             if answer != "" and is_correct_answer(real_answer=df_wiki_title, model_answer=answer):
            if answer != "":
                found_answer = True
                start_idx = results["start_index"] + chunk_num*CONTEXT_STRIDE
                print(f"\t\tGuessed Answer: {answer}") #| Expected Answer: {df_answer}")
                print(f"\t\tStart Index: {start_idx}")
                print(f"\t\tChunk: {contexts[chunk_num]}")
                ret.append({"answer":answer, "start_index": int(start_idx)})
                break
        if found_answer == False: ret.append({"answer":"", "start_index": ""})
    return ret

def evaluate_row(row):
    df_wiki_title = df["page"][row]
    df_question = df["text"][row]
    df_answer = df["answer"][row]
    
    print(f"================================ Row: {row} ================================")
    print(f"Answer: {df_answer}")
    print(f"Question:{df_question}")
    print(f"Wiki page: {df_wiki_title}")

    full_wiki = wiki[df_wiki_title]["text"]
    contexts = chunkz(full_wiki)
    
#     print(context)
    clues = sent_tokenize(df_question)
#     print("================================")
    human_result = evaluate_question(clues, contexts)
    topic = df_wiki_title.translate(str.maketrans('', '', string.punctuation))
    gpt_clues = get_gpt_clues(topic)
    computer_results = evaluate_question(gpt_clues, contexts)
    return {"human": human_result, "computer": computer_results}


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [27]:
ms_df = df[df["difficulty"] == "MS"]
drop_indices = []
for index, row in ms_df.iterrows():
    if row["page"] not in wiki:
        drop_indices.append(index)
ms_df = ms_df.drop(index=drop_indices)

new_columns = [f"{author}_answer_{i}" for i in range(6) for author in ["human", "computer"]] + \
                [f"{author}_start_index_{i}" for i in range(6) for author in ["human", "computer"]]
ms_df = ms_df.reindex(columns = ms_df.columns.tolist() + new_columns)


count = 0
for index, row in ms_df.iterrows():
    count += 1
    print(index)
    if count % 10 == 0: print(f"============== {count} ==============")
    results = evaluate_row(index)
    for author in results:
        for i, result_dict in enumerate(results[author]):
            ms_df.loc[index, f"{author}_answer_{i}"] = result_dict["answer"]
            ms_df.loc[index, f"{author}_start_index_{i}"] = result_dict["start_index"]


18737
Answer: {osmosis} [prompt on {diffusion}]
Question:A pressure associated with this process is a colligative ("co-LIG-uh-tiv") property proportional to mass and temperature, and is defined as the hydrostatic ("HY-dro-STAT-ick") pressure necessary to counteract this process. A "reverse" form of this process is sometimes used to purify water. For 10 points, name this movement of water across a semi-permeable membrane from an area of high concentration to an area of low concentration.
Wiki page: Osmosis
	Clue 0: A pressure associated with this process is a colligative ("co-LIG-uh-tiv") property proportional to mass and temperature, and is defined as the hydrostatic ("HY-dro-STAT-ick") pressure necessary to counteract this process.
		Guessed Answer: osmotic
		Start Index: 534
		Chunk: oncentrations. Osmosis can be made to do work.

Osmotic pressure is defined as the external pressure required to be applied so that there is no net movement of solvent across the membrane. Osmotic pressu

ServiceUnavailableError: The server is overloaded or not ready yet.

In [6]:
ms_df.columns.to_list()

['text',
 'answer',
 'page',
 'category',
 'subcategory',
 'tournament',
 'difficulty',
 'year',
 'proto_id',
 'qdb_id',
 'dataset',
 'qanta_id',
 'tokenizations',
 'first_sentence',
 'gameplay',
 'fold',
 'human_answer_0',
 'computer_answer_0',
 'human_answer_1',
 'computer_answer_1',
 'human_answer_2',
 'computer_answer_2',
 'human_answer_3',
 'computer_answer_3',
 'human_answer_4',
 'computer_answer_4',
 'human_answer_5',
 'computer_answer_5',
 'human_start_index_0',
 'computer_start_index_0',
 'human_start_index_1',
 'computer_start_index_1',
 'human_start_index_2',
 'computer_start_index_2',
 'human_start_index_3',
 'computer_start_index_3',
 'human_start_index_4',
 'computer_start_index_4',
 'human_start_index_5',
 'computer_start_index_5']

In [7]:
columns = ['text',
 'answer',
 'page',
 'category',
 'difficulty',
 'qanta_id',
 'fold',
 'human_answer_0',
 'computer_answer_0',
 'human_answer_1',
 'computer_answer_1',
 'human_answer_2',
 'computer_answer_2',
 'human_answer_3',
 'computer_answer_3',
 'human_answer_4',
 'computer_answer_4',
 'human_answer_5',
 'computer_answer_5',
 'human_start_index_0',
 'computer_start_index_0',
 'human_start_index_1',
 'computer_start_index_1',
 'human_start_index_2',
 'computer_start_index_2',
 'human_start_index_3',
 'computer_start_index_3',
 'human_start_index_4',
 'computer_start_index_4',
 'human_start_index_5',
 'computer_start_index_5']

ms_df.head(580).to_csv(path_or_buf="ms.csv", columns=columns)

In [25]:
ms_df.head(10)

Unnamed: 0,text,answer,page,category,subcategory,tournament,difficulty,year,proto_id,qdb_id,...,human_start_index_1,computer_start_index_1,human_start_index_2,computer_start_index_2,human_start_index_3,computer_start_index_3,human_start_index_4,computer_start_index_4,human_start_index_5,computer_start_index_5
18737,A pressure associated with this process is a c...,{osmosis} [prompt on {diffusion}],Osmosis,Science,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550bca1,,...,2656.0,3026.0,3043.0,5211.0,,2206.0,,16.0,,
18801,This artist's statues of a dying slave and a h...,{Michelangelo} di Lodovico {Buonarroti} Simoni...,Michelangelo,Fine Arts,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550bce4,,...,34911.0,19.0,11272.0,18.0,937.0,28049.0,4833.0,2586.0,,
19169,The size of these objects is given by the Schw...,{black holes},Black_hole,Science,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550be74,,...,50350.0,32.0,24946.0,25041.0,897.0,4374.0,,43023.0,,
19683,This character was named after the wife of Kin...,Hermione Granger [accept either],Hermione_Granger,Literature,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550c0de,,...,1349.0,23698.0,35.0,474.0,23732.0,16012.0,,19841.0,,
19771,Three square windows can be seen in the back o...,The {Last Supper},Last_Supper,Fine Arts,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550c140,,...,6484.0,12943.0,21572.0,455.0,,4324.0,,78.0,,
19772,"The ""law of"" these objects states that the tor...",levers,Lever,Science,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550c141,,...,4308.0,27.0,99.0,4390.0,,2196.0,,3482.0,,4393.0
19773,Landmarks on this island include the El Yunque...,Puerto Rico,Puerto_Rico,Geography,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550c142,,...,43002.0,1756.0,34845.0,475.0,68840.0,7755.0,,9479.0,,
19774,"While travelling back to his hometown, this ma...",{Oedipus},Oedipus,Literature,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550c143,,...,3947.0,2236.0,4901.0,891.0,2252.0,9485.0,,9898.0,,
19775,"Policies during this war included the ""strateg...",{Vietnam} War,Vietnam_War,History,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550c144,,...,8603.0,25.0,39601.0,18.0,,504.0,,104.0,,
19776,"The DNA in this organelle (""or-guh-NELL"") is i...",{mitochondria} (“ MY-toe-KON-dree-uh ”) [or {m...,Mitochondrion,Science,,Collaborative MS Tournament,MS,2010,5476992eea23cca90550c145,,...,11706.0,33147.0,49094.0,9976.0,33144.0,1760.0,,23.0,,


In [19]:
ms_df[["computer_start_index_0", "computer_start_index_1", "computer_start_index_2", "computer_start_index_3", "computer_start_index_4", "computer_start_index_5"]].head(580).describe(include="all")

Unnamed: 0,computer_start_index_0,computer_start_index_1,computer_start_index_2,computer_start_index_3,computer_start_index_4,computer_start_index_5
count,580.0,580.0,579.0,579.0,579.0,72.0
unique,,,427.0,440.0,420.0,67.0
top,,,21.0,15.0,16.0,15.0
freq,,,15.0,11.0,16.0,3.0
mean,6460.103448,6661.686207,,,,
std,13392.165147,11588.909732,,,,
min,2.0,8.0,,,,
25%,20.0,23.0,,,,
50%,101.0,1146.0,,,,
75%,5280.25,7757.5,,,,


In [24]:
ms_df[["human_start_index_0", "human_start_index_1", "human_start_index_2", "human_start_index_3", "human_start_index_4", "human_start_index_5"]].head(580).describe(include="all")

Unnamed: 0,human_start_index_0,human_start_index_1,human_start_index_2,human_start_index_3,human_start_index_4,human_start_index_5
count,580.0,580.0,575.0,418.0,114.0,9.0
unique,503.0,522.0,499.0,,100.0,
top,23.0,2.0,35.0,,19.0,
freq,6.0,7.0,7.0,,4.0,
mean,,,,9903.889952,,6575.777778
std,,,,14279.960692,,10903.969172
min,,,,2.0,,13.0
25%,,,,52.75,,23.0
50%,,,,3903.5,,505.0
75%,,,,14249.25,,9920.0


In [21]:
ms_df["human_start_index_5"].head(580)

18737   NaN
18801   NaN
19169   NaN
19683   NaN
19771   NaN
         ..
40068   NaN
40069   NaN
40070   NaN
40071   NaN
40072   NaN
Name: human_start_index_5, Length: 580, dtype: float64

In [None]:
import openai
openai.api_key = "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"

def get_gpt_clues(topic):
    prompt = f"Give a numbered list of 5 sentences about '{topic}' consisting of 5 clues (1 sentence per clue) without mentioning '{topic}' in any of them. '{topic}' must not be mentioned. The first clue should be very hard such that an expert may not know it, and the last clue should still be somewhat hard but easier. Thus, clues should be in descending order of difficulty. Make sure that the numbered list is in the format 1: 'clue'"
    completions = openai.Completion.create(engine="text-davinci-003", prompt=prompt, max_tokens=2048, n=1,stop=None,temperature=0.2)
    result = completions.choices[0].text
    return [ele[3:] for ele in result.split("\n") if ele != ""]
# print(get_gpt_clues("The Last Supper"))