In [21]:
import json
import requests

def get_evidences(question, max_evidences=3, endpoint='http://localhost:8000/answers'):
    """Ask the question-answering service for evidences related to `question`.

    Args:
        question: Natural-language question, sent as the `q` query parameter.
        max_evidences: Maximum number of answers to request; sent as the
            string-valued `max` query parameter.
        endpoint: URL of the answers service. Defaults to the local instance.
            The notebook previously also pointed at
            'http://drugs4covid.oeg.fi.upm.es/qa/answers'.

    Returns:
        The decoded JSON body of the response, or an empty list when the
        request or the decoding fails (the error is printed, not raised).
    """
    # Only the 'd4c' source is enabled; 'wiki' and 'dbpedia' are switched off.
    params = {
        'q': question,
        'max': str(max_evidences),
        'wiki': "false",
        'dbpedia': 'false',
        'd4c': "true",
    }
    try:
        r = requests.get(endpoint, params=params)
        return r.json()
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller continue
        # with no evidences instead of aborting the whole run.
        print("error getting evidences: " + str(e))
        return []

# Smoke test: request up to 5 evidences for a sample question and
# pretty-print the JSON reply (non-ASCII characters are kept as-is).
r = get_evidences("What is the name of the vaccine that will speed up immunization?",5)
print(json.dumps(r, ensure_ascii=False, indent=4))


[
    {
        "type": "literal",
        "answer": "Covid-19",
        "confidence": 0.5,
        "evidence": "Four databases were linked to construct a surveillance-based cohort of adults 18 years of age or older residing in New York State. 10 The Citywide Immunization Registry (CIR) collects and stores all data on Covid-19 vaccine administration for persons residing in New York City, and the New York State Immunization Information System (NYSIIS) collects data for the rest of the state (excluding data that are reported directly to the federal system, such as data for veterans and military personnel and data from the American Indian Health Program). The Electronic Clinical Laboratory Reporting System (ECLRS) collects all reportable Covid-19 test results (nucleic acid amplification testing or antigen testing) in New York State. 19 The Health Electronic Response Data System (HERDS) includes a statewide, daily electronic survey of all inpatient facilities in New York State, which colle

In [2]:
from transformers import pipeline

# Summarization pipeline shared by summarize(); the model is loaded once here
# so that every call reuses it.
summarizer = pipeline('summarization', model="sshleifer/distilbart-cnn-12-6")

def summarize(text,max_size=150,min_size=50):
    """Return a summary of `text` produced by the module-level `summarizer`.

    The length of `text` is measured in single-space-separated tokens.

    * More than 1024 tokens: the literal string "empty" is returned.
    * Fewer than 10 tokens: `text` is returned unchanged.
    * Otherwise the summarizer is called with `do_sample=False`. When the text
      is shorter than `max_size`, `max_size` is reduced to the token count
      minus a quarter of it, and `min_size` is capped at `max_size`.

    Returns:
        The stripped `summary_text` of the first summarizer result.
    """
    word_count = len(text.split(" "))
    if word_count > 1024:
        return "empty"
    if word_count < 10:
        return text
    if word_count < max_size:
        # Ask for a summary that is about three quarters of the input length.
        max_size = word_count - int(word_count/4)
    # The minimum length may never exceed the maximum length.
    min_size = min(min_size, max_size)
    results = summarizer(text, max_length=max_size, min_length=min_size, do_sample=False)
    best = results[0]
    return best['summary_text'].strip()

# Smoke test: the input has fewer than 10 space-separated words, so
# summarize() returns it unchanged without calling the model.
print(summarize('discover possible antiviral treatment for COVID-19',20))

discover possible antiviral treatment for COVID-19


In [4]:
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer

# Question-generation model and its tokenizer, loaded once and used by get_question().
q_model_name = "allenai/t5-small-squad2-question-generation"
q_tokenizer = T5Tokenizer.from_pretrained(q_model_name)
q_model = T5ForConditionalGeneration.from_pretrained(q_model_name)

def get_question(input_string, **generator_args):
    """Generate question text from `input_string` with the module-level model.

    Args:
        input_string: Text that is encoded with `q_tokenizer`.
        **generator_args: Forwarded unchanged to `q_model.generate`.

    Returns:
        The result of `q_tokenizer.batch_decode` on the generated output,
        decoded with special tokens skipped.
    """
    encoded = q_tokenizer.encode(input_string, return_tensors="pt")
    generated = q_model.generate(encoded, **generator_args)
    return q_tokenizer.batch_decode(generated, skip_special_tokens=True)

# Smoke test: generate a question for one sample sentence.
print(get_question("We Covering exactly 20 years, the survey offers a unique opportunity to understand."))

['What is the survey of the survey?']


In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Named-entity-recognition pipeline used by get_entities().
# NOTE: `pipeline` is not imported in this cell; it comes from the earlier
# `from transformers import pipeline`, so that cell must have been run first.
ner_tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
ner_model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
ner_nlp = pipeline('ner', model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

def get_entities(text):
    """Run the module-level NER pipeline on `text`.

    Returns an empty list for empty input. Otherwise returns the pipeline's
    result with surrounding whitespace removed from each entry's 'word' value
    (the entries are updated in place).
    """
    if len(text) == 0:
        return []
    entities = ner_nlp(text)
    for entity in entities:
        # Normalise the surface form so callers can compare words exactly.
        entity['word'] = entity['word'].strip()
    return entities

# Smoke test: extract the entities of a two-word sample.
sample_text = "British researchers"
print(get_entities(sample_text))


[{'entity_group': 'MISC', 'score': 0.9998902, 'word': 'British', 'start': 0, 'end': 7}]


In [5]:
def get_contexts(template):
    """Build one context sentence per 'What'/'How' concept of a 5W1H template.

    Args:
        template: Mapping from 5W1H labels ('What', 'How', 'Where', 'When', ...)
            to lists of strings. The mapping and its lists are not modified.

    Returns:
        A list with one stripped sentence per concept, shaped as
        "<concept> in <Where joined by ' and '> <When joined by ' and '>",
        where the 'Where' and 'When' parts appear only if those keys exist.
        An empty list is returned when the template has no 'What' key.
        'Who' values are not used.
    """
    if 'What' not in template:
        return []
    # Copy first: extending template['What'] directly would silently add the
    # 'How' values to the caller's list.
    concepts = list(template['What'])
    if 'How' in template:
        concepts.extend(template['How'])
    # The location/time suffix is the same for every concept, so build it once.
    suffix_parts = []
    if 'Where' in template:
        suffix_parts.append("in " + " and ".join(template['Where']))
    if 'When' in template:
        suffix_parts.append(" and ".join(template['When']))
    # Joining with a single space keeps 'Where' and 'When' separated when both exist.
    return [" ".join([concept] + suffix_parts).strip() for concept in concepts]

# Smoke test: the template has a 'Who' entry, but get_contexts() only builds
# sentences from the 'What' concept here.
input_template = {'Who': ['British'], 'What': ['discover possible antiviral treatment for COVID-19']}
print(get_contexts(input_template))


['discover possible antiviral treatment for COVID-19']


In [15]:
def get_questions(contexts):
    """Generate distinct questions for every context sentence.

    For each context two candidates are produced: one from its summary
    (`summarize` with `max_size=50`) and one from the context itself. A
    candidate is kept only if it contains a '?' and has not been collected yet.

    Returns:
        The kept questions in the order they were first produced.
    """
    collected = []
    for context in contexts:
        condensed = summarize(context, max_size=50)
        # The summary-based question comes first, then the one from the raw context.
        candidates = (get_question(condensed)[0], get_question(context)[0])
        for candidate in candidates:
            if '?' in candidate and candidate not in collected:
                collected.append(candidate)
    return collected
        
# Smoke test: two near-identical contexts, the second one missing the place name.
print(get_questions([
    'The main cause of death in Mexico is hospital saturation',
    'The main cause of death in is hospital saturation'
                    ]))

["What is Mexico's main cause of death?", 'What is the main cause of death in Mexico?', 'What is the main cause of death in hospital?']


In [7]:
import spacy
import en_core_web_sm
# spaCy pipeline used by is_valid() to split evidences into sentences.
spacy_nlp = en_core_web_sm.load()
# Alternative way of loading the same model by name:
#spacy_nlp = spacy.load("en_core_web_sm")

def is_valid(evidence,query_template):
    """Check that `evidence` mentions one of the entities required by the template.

    Args:
        evidence: Evidence text to validate.
        query_template: Mapping from 5W1H labels to lists of strings. Only the
            'Who', 'Where' and 'When' entries act as constraints.

    Returns:
        True when the template has none of the constraining keys, or when some
        entity found by `get_entities` in a sentence of the evidence is exactly
        equal to one of the constraint values. False otherwise; in that case a
        diagnostic line is printed.
    """
    constrained_slots = [slot for slot in ("Who", "Where", "When") if slot in query_template]
    if not constrained_slots:
        # Nothing to check: skip the sentence splitting and NER work entirely.
        return True
    ref_entities = []
    for slot in constrained_slots:
        ref_entities.extend(query_template[slot])
    # Parse only now that validation is known to be required.
    doc = spacy_nlp(evidence)
    for sent in doc.sents:
        for entity in get_entities(sent.text):
            if entity['word'] in ref_entities:
                return True
    print("INVALID evidence:",evidence,"for",query_template)
    return False

# Smoke test: a long evidence paragraph validated against a template whose
# only constraint is the 'Who' value "Egyptian".
print(is_valid("We Covering exactly 20 years, the survey offers a unique opportunity to understand the long-term dynamics of the Egyptian labour market and its reactions to policy changes (in our case trade policy). The survey is composed of three sections: (i) households; (ii) individuals; (iii) income. The first section, the household questionnaire, is administrated only by the household's head or by the head's spouse. It contains questions on basic demographic characteristics of the members of the household, movements of the household's members as well as questions regarding the ownership of assets and durable goods. The second section, the individual questionnaire, includes questions to which each person answers individually, concerning the educational background, employment and unemployment conditions and its reasons, average wage, job characteristics, mobility, job search activities, migration stories and a complete section on women's work, their condition in the households and fertility. The 2018 wave dedicates more attention to measures of job stability, given the recent trends of the country towards higher precariousness. The third section encompasses all possible sort of income sources, from family-run agricultural and non-agricultural enterprises to transfers and remittances. The survey is representative at the national level. The ELMPS covers of the whole country, dividing it into six different macro-regions: Greater Cairo, Alexandria, Urban Lower Egypt, Urban Upper Egypt, Rural Lower Egypt and Rural Upper Egypt, with the only exception of the Frontier governorates. The final sample included 15,746 households and 61,231 individuals.",{"Who":["Egyptian"]}))



True


In [16]:
import numpy as np
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by get_embedding().
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embedding(sentence):
    """Return the embedding of a single sentence.

    The sentence is wrapped in a one-element list for `sentence_model.encode`,
    and the first (only) row of the result is returned.
    """
    return sentence_model.encode([sentence])[0]

def get_cosine(u, v):
    """Return the cosine similarity of vectors `u` and `v`.

    Computed as the dot product divided by the product of the two norms.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return dot_product / norm_product

# Smoke test: compare a statement (v1) against two candidate questions (v2, v3).
v1 = get_embedding('Janssen asks the EU to approve a single-dose vaccine that will speed up immunization')
v2 = get_embedding('What is Janssen asking the EU to approve?')
v3 = get_embedding('What is the name of the vaccine that will speed up immunization?')

# Similarity of the statement with each question.
print(get_cosine(v1,v2))
print(get_cosine(v1,v3))

0.6088888
0.64697105


In [14]:
def get_evidences_from_annotated_new_item(annotation):
    """Build and print variants of an annotated text with one 5W1H span removed.

    Args:
        annotation: Mapping with the text under 'text_en' and a list of 5W1H
            entries under '5w1h_value'; each entry provides '5w1h_label' and
            '5w1h_text_en'.

    Returns:
        The list of statements that is also printed: one per entry whose label
        is not 'What', obtained by deleting that entry's text from the
        annotated text.
    """
    new_text = annotation['text_en']
    # Read the entries from the argument itself rather than from a global
    # variable, so the function works no matter what the caller's loop
    # variable is called.
    statements = [
        new_text.replace(w['5w1h_text_en'], "")
        for w in annotation['5w1h_value']
        if w['5w1h_label'] != 'What'
    ]
    print(statements)
    return statements

In [None]:
import json 
import time
from random import randint
from time import sleep

# Dry run over the annotated claims: for every annotation, print the
# statements obtained by removing each non-'What' 5W1H span.
input_name = "covid_5w1h_dataset_170_news_english"
# The context manager closes the file even when reading or parsing fails.
with open('../claims/'+input_name+'.json', encoding='utf-8') as input_file:
    data = json.load(input_file)
total = len(data)
counter = 0
try:
    for i in data:
        print("[",counter,"/",total,"]",i['id'],":")
        counter += 1
        for a in i['annotation_values']:
            get_evidences_from_annotated_new_item(a)
except Exception as e:
    # Report the failure and stop iterating; `except e:` would itself raise a
    # NameError and hide the real error.
    print("Error on execution: " + str(e))

In [None]:
import json 
import time
from random import randint
from time import sleep

# Main run: for every annotated claim, generate questions, query the QA
# service for evidences, and write the enriched items as a JSON array.
#
# NOTE: `input_name` is not assigned in this cell; it must already exist in
# the kernel. Alternative dataset name kept from the original notebook:
#input_name = "covid_5w1h_english"

# Minimum cosine similarity between a question and the annotated text for the
# question to be sent to the QA service.
question_similarity_threshold = 0.6

with open('../claims/'+input_name+'.json', encoding='utf-8') as input_file:
    data = json.load(input_file)
total = len(data)
counter = 0

with open("../evidences/"+input_name+"_evidences.json", "w", encoding='utf-8') as output_file:
    output_file.write("[\n")
    records_written = 0
    try:
        for item in data:
            print("[",counter,"/",total,"]",item['id'],":")
            counter += 1
            total_annotations = len(item['annotation_values'])
            for current_annotation, annotation in enumerate(item['annotation_values'], start=1):
                ref_text = annotation['text_en']
                print("\t annotation",current_annotation,"/",total_annotations,":",ref_text)
                ref_embedding = get_embedding(ref_text)
                query_template = {}
                # question -> similarity with the annotated text (insertion-ordered).
                questions = {}

                # Questions generated from the full annotated text.
                for q in get_questions([ref_text]):
                    if q not in questions:
                        questions[q] = get_cosine(ref_embedding, get_embedding(q))

                for w in annotation['5w1h_value']:
                    question_type = w['5w1h_label']
                    question_values = query_template.get(question_type)
                    if question_values is None:
                        question_values = []
                    value = w['5w1h_text_en'].replace(".","")
                    if question_type == "Who":
                        # Keep only organisations and persons as 'Who' constraints.
                        for entity in get_entities(value):
                            if entity['entity_group'] in ('ORG', 'PER'):
                                question_values.append(entity['word'])
                    elif question_type == "Where":
                        # Keep only locations as 'Where' constraints.
                        for entity in get_entities(value):
                            if entity['entity_group'] == 'LOC':
                                question_values.append(entity['word'])
                    else:
                        question_values.append(value)
                    if len(question_values) > 0:
                        query_template[question_type] = question_values

                    # Questions generated from the text with this 5W1H span removed.
                    query_sentence = ref_text.replace(w['5w1h_text_en'],'')
                    for q in get_questions([query_sentence]):
                        if q not in questions:
                            questions[q] = get_cosine(ref_embedding, get_embedding(q))

                # Most similar questions first; the threshold is applied once, here.
                sorted_questions = sorted(questions, key=questions.get, reverse=True)
                annotation['questions'] = [
                    q for q in sorted_questions
                    if questions[q] > question_similarity_threshold
                ]

                annotation['evidences'] = []
                for question in annotation['questions']:
                    print("\t\t\tquestion:",question,questions[question])
                    for evidence in get_evidences(question,2):
                        evidence_text = evidence['evidence']
                        if is_valid(evidence_text, query_template):
                            if evidence_text not in annotation['evidences']:
                                annotation['evidences'].append(evidence_text)
                                print("-> [",evidence['confidence'],"]",evidence_text)
                    # Random 1-10 s pause between questions
                    # (presumably to avoid overloading the QA service - TODO confirm).
                    sleep(randint(1,10))

            # Write the separator before every record except the first, so an
            # interrupted run never leaves a trailing comma in the array.
            if records_written:
                output_file.write(",\n")
            output_file.write(json.dumps(item, ensure_ascii=False, indent=4))
            records_written += 1
    except Exception as e:
        # `except e:` would raise a NameError here and hide the real failure.
        print("Error on execution: " + str(e))
    finally:
        # Always close the JSON array so the records written so far stay parseable.
        output_file.write("\n\n]" if records_written else "\n]")