# Preamble

In [1]:
import json
import os
import re
from xml.sax.saxutils import quoteattr

import nltk
import requests
import stanfordnlp
import torch
from nltk import word_tokenize

In [2]:
# Input: MS MARCO v2.1 train / dev dumps (JSON), read by load_dataset.
PATH_MSMARCO_TRAIN = "../../data/external/msmarco/train_v2.1.json"
PATH_MSMARCO_VALID = "../../data/external/msmarco/dev_v2.1.json"

# Directories the NLTK and StanfordNLP models are downloaded to / loaded from.
PATH_NLTK_RESOURCES = "../../data/external/nltk/"
PATH_STANFORD_RESOURCES = "../../data/external/stanfordnlp/"

# Output directory; `save` concatenates the file name directly onto this
# string, so the trailing slash is required.
PATH_SAVE_DATASET = "../../data/question-answering/"

In [3]:
# Download the 'punkt' resource into the project-local directory and add that
# directory to NLTK's search path.
nltk.download('punkt', PATH_NLTK_RESOURCES)
nltk.data.path.append(PATH_NLTK_RESOURCES)

[nltk_data] Downloading package punkt to
[nltk_data]     ../../data/downloads/nltk_data/...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Download the English models and build a tokenize + POS pipeline.
# tokenize_pretokenized=True: the pipeline is fed text that is already split
# into tokens (the batch `pos_tagging` joins NLTK tokens with single spaces).
# NOTE(review): use_gpu=True is hard-coded - confirm behaviour on CPU-only hosts.
stanfordnlp.download('en', PATH_STANFORD_RESOURCES)
stanford_nlp = stanfordnlp.Pipeline(processors='tokenize,pos',
                                    tokenize_pretokenized=True,
                                    models_dir=PATH_STANFORD_RESOURCES,
                                    treebank='en_ewt', use_gpu=True)

Using the default treebank "en_ewt" for language "en".
Use device: gpu
	Torch-GPU-ID: 0
---
Loading: tokenize
With settings: 
{'model_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'pretokenized': True, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [5]:
def set_seed(seed):
    """Seed PyTorch and switch cuDNN to its deterministic configuration.

    See https://pytorch.org/docs/stable/notes/randomness.html

    Args:
        seed: integer seed; passed to ``torch.manual_seed`` and
            ``torch.cuda.manual_seed`` and exported as ``PYTHONHASHSEED``.
    """
    # Deterministic cuDNN kernels, auto-tuner disabled.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # NOTE(review): the environment variable is only written here; whether the
    # already running interpreter picks it up should be confirmed.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

In [6]:
# Fix the seed once, before any model is run.
set_seed(42)

# Load MS MARCO Dataset

In [7]:
def load_dataset(path):
    """Read a JSON file and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes (the previous version never closed it), and it is decoded
    as UTF-8, the encoding JSON interchange files use, instead of the
    platform default.

    Args:
        path: location of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


def simple_format(dataset):
    """Turn the column-oriented dump into one record per question.

    Args:
        dataset: mapping with the columns ``query_id``, ``query``, ``answers``
            and ``query_type``, each keyed by the same sample keys.

    Returns:
        dict mapping every key of ``dataset['query_id']`` to
        ``{'question', 'answer', 'type'}``; question marks are stripped from
        the question text.
    """
    return {
        key: {
            'question': dataset['query'][key].replace('?', ''),
            'answer': dataset['answers'][key],
            'type': dataset['query_type'][key],
        }
        for key in dataset['query_id'].keys()
    }

In [8]:
# Load both splits and convert them to {key: {'question', 'answer', 'type'}}.
train = simple_format(load_dataset(PATH_MSMARCO_TRAIN))
valid = simple_format(load_dataset(PATH_MSMARCO_VALID))

In [9]:
# Report the number of questions per split.
print("MS MARCO size:")
print(f"\tTraining set: {len(train):,}")
print(f"\tValidation set: {len(valid):,}")

MS MARCO size:
	Training set: 808,731
	Validation set: 101,093


## POS-Tagging

In [10]:
def get_offset_of_tags_in_sentence(sentence, tags):
    """Return the character offset of every tagged token inside ``sentence``.

    Tokens are located left to right: each one is searched for starting at the
    end of the previously located token, so repeated words map to successive
    occurrences.

    A token whose text does not occur in the rest of the sentence (for example
    because a tokenizer rewrote it) used to make ``str.find`` return ``-1``,
    which was then used as a real index and shifted both the reported offset
    and the remaining search window. Such a token is now given the position of
    the next non-whitespace character after the previous token, and the search
    position is left untouched so following tokens are still found correctly.

    Args:
        sentence: the original, untokenized string.
        tags: sequence of tuples whose first element is the token text.

    Returns:
        list of int offsets, one per entry of ``tags``.
    """
    offsets = []
    cursor = 0  # index of the first character that has not been consumed yet
    for tag in tags:
        label = tag[0]
        position = sentence.find(label, cursor)

        if position == -1:
            # Best effort for tokens that are not literally in the sentence.
            position = cursor
            while position < len(sentence) and sentence[position].isspace():
                position += 1
            offsets.append(position)
            continue

        offsets.append(position)
        cursor = position + len(label)
    return offsets


def get_pos_tags_of_sentence(sentence):
    """Collect ``(text, pos)`` for every word of every token in ``sentence``.

    Args:
        sentence: object exposing ``tokens``; each token exposes ``words`` and
            each word exposes ``text`` and ``pos``.

    Returns:
        list of ``(word.text, word.pos)`` tuples in reading order.
    """
    return [(word.text, word.pos)
            for token in sentence.tokens
            for word in token.words]


def calculate_pos_tags_for_string(doc):
    """Return the POS tags of ``doc`` grouped by sentence.

    Args:
        doc: object exposing ``sentences``.

    Returns:
        list with one entry per sentence, each a new list holding the items
        produced by ``get_pos_tags_of_sentence`` for that sentence.
    """
    return [list(get_pos_tags_of_sentence(sentence))
            for sentence in doc.sentences]

In [11]:
def pos_tagging(strings):
    """POS-tag a batch of raw strings and attach character offsets.

    Every string is tokenized with ``word_tokenize``; the tokens are joined by
    single spaces and the strings by a blank line before the whole batch is
    sent through ``stanford_nlp`` in one call.

    NOTE(review): a second ``pos_tagging(doc)`` with a different signature is
    defined further down in this notebook and shadows this one once its cell
    has run; anything calling the batch version must run before that cell.

    Args:
        strings: sequence of raw question strings.

    Returns:
        list (one entry per input string) of lists of
        ``(token_text, pos_tag, offset_as_str)`` tuples.

    Raises:
        AssertionError: if the number of tagged sentences differs from the
            number of input strings.
    """
    tokenized = [' '.join(word_tokenize(string)) for string in strings]
    doc = stanford_nlp('\n\n'.join(tokenized))
    tags = calculate_pos_tags_for_string(doc)

    assert len(tags) == len(strings)

    result = []
    for string, sentence_tags in zip(strings, tags):
        offsets = get_offset_of_tags_in_sentence(string, sentence_tags)
        result.append([(tag[0], tag[1], str(offsets[idx]))
                       for idx, tag in enumerate(sentence_tags)])

    return result

In [12]:
def clean_sample(sample):
    """Tell whether ``sample`` has a usable question.

    Returns:
        True when ``sample['question']`` is neither ``None`` nor blank.
    """
    question = sample['question']
    return question is not None and len(question.strip()) != 0


def calculate_pos_tags(dataset):
    """Annotate, in place, every sample that has a usable question.

    The questions of all samples accepted by ``clean_sample`` are tagged in a
    single ``pos_tagging`` call and the result is stored under
    ``'question:POS'``. Samples with a missing or blank question are left
    without that key.

    Args:
        dataset: mapping of key -> sample dict (mutated).
    """
    usable = [sample for sample in dataset.values() if clean_sample(sample)]
    tagged = pos_tagging([sample['question'] for sample in usable])

    for sample, question_tags in zip(usable, tagged):
        sample['question:POS'] = question_tags

In [13]:
# Adds 'question:POS' in place to every sample that has a non-blank question.
calculate_pos_tags(train)
calculate_pos_tags(valid)

# Question Filtering

In [14]:
def is_question_without_answer(sample):
    """Tell whether ``sample`` carries no real answer.

    Returns:
        True when ``'No Answer Present.'`` is among ``sample['answer']`` or
        when every answer is the empty string (which includes the case of no
        answers at all); False otherwise.
    """
    answers = sample['answer']
    if 'No Answer Present.' in answers:
        return True
    return all(answer == '' for answer in answers)

In [15]:
def all_answers_match(regex, list_of_answers):
    """Check that ``re.match(regex, answer)`` succeeds for every answer.

    Every answer is tested (no early exit). An empty list yields True.
    """
    outcomes = [bool(re.match(regex, answer)) for answer in list_of_answers]
    return all(outcomes)

In [16]:
def answer_is_yes(sample):
    """Tell whether every answer of ``sample`` is an affirmative one.

    An answer counts as affirmative when, lower-cased, it starts with the
    whole word "yes". The previous pattern ``^yes,?.*$`` accepted any answer
    that merely began with those three letters (e.g. "Yesterday ..."); the
    word boundary rules those out while still accepting "yes", "yes, ...",
    "yes." and "yes it does".

    Returns:
        False when ``is_question_without_answer(sample)`` holds, otherwise the
        result of matching all lower-cased answers against the pattern.
    """
    if is_question_without_answer(sample):
        return False
    all_answers = [s.lower() for s in sample['answer']]
    return all_answers_match(r"^yes\b.*$", all_answers)


def answer_is_no(sample):
    """Tell whether every answer of ``sample`` is a negative one.

    A lower-cased answer is negative when it is exactly "no" or starts with
    "no" followed by a comma, a period or whitespace.

    Returns:
        False when ``is_question_without_answer(sample)`` holds, otherwise the
        result of matching all lower-cased answers against the pattern.
    """
    if not is_question_without_answer(sample):
        lowered = [s.lower() for s in sample['answer']]
        return all_answers_match(r"^(no$)|(no[,\.].*$)|(no\s.*$)", lowered)
    return False


def is_binary(sample):
    """Tell whether ``sample`` is answered with yes or with no."""
    affirmative = answer_is_yes(sample)
    return affirmative if affirmative else answer_is_no(sample)

In [17]:
def pos_tagging(doc):
    """Return the POS tags of all sentences of ``doc`` as one flat list.

    NOTE(review): this redefines ``pos_tagging``; an earlier cell of this
    notebook defines a batch version taking a list of strings, which
    ``calculate_pos_tags`` relies on. After this cell has run,
    ``calculate_pos_tags`` can no longer be re-executed without re-running the
    earlier definition first.

    Args:
        doc: object exposing ``sentences``.

    Returns:
        list of the items yielded by ``get_pos_tags_of_sentence`` for each
        sentence, concatenated in order.
    """
    return [pos
            for sentence in doc.sentences
            for pos in get_pos_tags_of_sentence(sentence)]

In [18]:
def generalize(question_pattern, template):
    """Compile a question pattern such as ``"can X cause X"`` into a regex.

    The pattern is tagged with ``stanford_nlp``; every ``X`` token becomes a
    capturing ``(.*)`` and every other token becomes ``<token>/<any POS>``.
    The pieces are separated by single spaces and anchored with ``^``/``$``,
    i.e. the regex is meant for strings of ``token/POS`` items.

    Args:
        question_pattern: pattern string; ``X`` marks a concept slot.
        template: value stored unchanged as the last tuple element.

    Returns:
        ``(question_pattern, compiled_regex, pattern_pos_tags, template)``.
    """
    question_pos_tags = pos_tagging(stanford_nlp(question_pattern))

    pieces = ["(.*) " if tag[0] == "X" else tag[0] + r"/[^\s]* "
              for tag in question_pos_tags]
    # Drop the separator that trails the last piece before anchoring the end.
    regex_string = ("^" + "".join(pieces))[:-1] + "$"

    return (question_pattern, re.compile(regex_string),
            question_pos_tags, template)

In [19]:
def simple_question(question_pos_tags, pattern):
    """Tell whether a tagged question matches ``pattern`` with simple slots.

    The question is rendered as space-separated ``token/POS`` items (tokens
    lower-cased, ``/`` inside a token replaced by ``_``) and searched with
    ``pattern``. The question is rejected when there is no match or when any
    item captured by a group carries one of the POS tags
    IN, CC, TO, WDT, WRB or WP.

    Args:
        question_pos_tags: sequence of tuples ``(token, pos, ...)``.
        pattern: compiled regex, e.g. the second element of a ``generalize``
            result.

    Returns:
        bool
    """
    forbidden_pos = ['IN', 'CC', 'TO', 'WDT', 'WRB', 'WP']
    prepared_question = ' '.join(
        tag[0].replace("/", "_").lower() + '/' + tag[1]
        for tag in question_pos_tags)

    match = pattern.search(prepared_question)
    if match is not None:
        for captured in match.groups():
            for element in captured.split(" "):
                _, pos = element.split('/')
                if pos in forbidden_pos:
                    return False
        return True
    return False

## Pattern Definitions 

In [20]:
# Template ids stored with every pattern. `create_queries` turns FORWARDS into
# (concepts[0], concepts[1]) and BACKWARDS into (concepts[1], concepts[0]); the
# entity-linking cell treats the first query element as the cause.
ASKING_FOR_RELATION_FORWARDS = 0
ASKING_FOR_RELATION_BACKWARDS = 1

In [21]:
# (question pattern, template) pairs in their original order; "X" marks a
# concept slot. Each pair is compiled with `generalize`.
_MAY_CAUSE_PATTERN_SPECS = [
    # Can-Questions
    ("can X cause X", ASKING_FOR_RELATION_FORWARDS),
    # NOTE(review): passive form - the first slot reads as the effect, like
    # "is X caused by X" (BACKWARDS) below; FORWARDS looks inconsistent - confirm.
    ("can X be caused by X", ASKING_FOR_RELATION_FORWARDS),

    # Do/Does-Questions
    ("do X cause X", ASKING_FOR_RELATION_FORWARDS),
    ("does X cause X", ASKING_FOR_RELATION_FORWARDS),
    ("did X cause X", ASKING_FOR_RELATION_FORWARDS),

    # is questions
    ("is X caused by X", ASKING_FOR_RELATION_BACKWARDS),
    ("are X caused by X", ASKING_FOR_RELATION_BACKWARDS),
    ("is X causing X", ASKING_FOR_RELATION_FORWARDS),
    ("is X a symptom of X", ASKING_FOR_RELATION_BACKWARDS),
    # NOTE(review): in the next patterns marked (*) the first slot reads as the
    # cause, like "does X cause X" (FORWARDS); BACKWARDS looks inconsistent -
    # confirm before relying on the direction of the generated queries.
    ("is X a cause of X", ASKING_FOR_RELATION_BACKWARDS),   # (*)
    ("is X causes X", ASKING_FOR_RELATION_BACKWARDS),       # (*)
    ("is X caused from X", ASKING_FOR_RELATION_BACKWARDS),
    ("is X cause for X", ASKING_FOR_RELATION_BACKWARDS),    # (*)

    # will questions
    ("will X cause X", ASKING_FOR_RELATION_FORWARDS),

    # would could
    ("would X cause X", ASKING_FOR_RELATION_FORWARDS),
    ("could X cause X", ASKING_FOR_RELATION_FORWARDS),

    # others
    ("X caused by X", ASKING_FOR_RELATION_BACKWARDS),
    ("X causes X", ASKING_FOR_RELATION_BACKWARDS),          # (*)
]

may_cause_patterns = [generalize(question_pattern, template)
                      for question_pattern, template
                      in _MAY_CAUSE_PATTERN_SPECS]

In [22]:
# Sanity check: number of compiled patterns.
len(may_cause_patterns)

18

In [23]:
# Longest pattern string first: get_causal_questions_by_regex stops at the
# first pattern that matches, so patterns are tried in this order.
may_cause_patterns.sort(key=lambda x: len(x[0]), reverse=True)

## Question Encoding

In [24]:
def extract_concept(question_pos, pattern):
    """Find the token spans of a question that are not part of the pattern.

    Each question token is marked 0 (pattern word, or a DT / PRP$ token other
    than "no" that opens the question or directly follows a 0-token) or 1
    (everything else). Every maximal run of 1-tokens is reported as a span.

    Args:
        question_pos: sequence of tuples ``(token, pos, ...)``.
        pattern: tuple as returned by ``generalize``; element 2 holds the
            tagged pattern tokens.

    Returns:
        list of ``[first_index, last_index]`` pairs (inclusive token indices).
    """
    pattern_tokens = [t[0] for t in pattern[2]]
    pattern_tokens.remove('X')

    tags = []
    for i, pos in enumerate(question_pos):
        word = pos[0].lower()
        droppable = pos[1] in ['DT', 'PRP$'] and word != 'no'
        after_non_concept = i == 0 or tags[i - 1] == 0

        if word in pattern_tokens or (after_non_concept and droppable):
            tags.append(0)
        else:
            tags.append(1)

    annotation = []
    run_start = None
    for i, flag in enumerate(tags):
        if flag == 1:
            if run_start is None:
                run_start = i
        elif run_start is not None:
            annotation.append([run_start, i - 1])
            run_start = None
    # A run that reaches the last token is closed here.
    if run_start is not None:
        annotation.append([run_start, len(tags) - 1])
    return annotation

In [25]:
def get_causal_questions_by_regex(dataset):
    """Select the samples whose question matches one of the causal patterns.

    ``may_cause_patterns`` is tried in order and only the first matching
    pattern is used. The matching sample gets ``'template'`` and ``'concepts'``
    set in place; it is part of the result only when exactly two concept spans
    were found.

    Samples without a ``'question:POS'`` annotation are skipped:
    ``calculate_pos_tags`` leaves questions that are ``None`` or blank
    untagged, and indexing the missing key used to raise ``KeyError``.

    Args:
        dataset: mapping of key -> sample dict (matching samples are mutated).

    Returns:
        dict with the selected ``key -> sample`` entries (same sample objects).
    """
    result = {}

    for key, sample in dataset.items():
        question_pos = sample.get('question:POS')
        if not question_pos:
            # Untagged (or token-less) question: no pattern can match.
            continue

        for pattern in may_cause_patterns:
            if simple_question(question_pos, pattern[1]):
                sample['template'] = pattern[3]
                sample['concepts'] = extract_concept(question_pos, pattern)

                if len(sample['concepts']) == 2:
                    result[key] = sample
                break

    return result

In [26]:
# Keep only the questions that match a causal pattern with two concept spans.
questions_train = get_causal_questions_by_regex(train)
questions_valid = get_causal_questions_by_regex(valid)

In [27]:
# Selected questions: training, validation, total.
print(len(questions_train))
print(len(questions_valid))
print(len(questions_train) + len(questions_valid))

3960
489
4449


In [28]:
def create_queries(dataset):
    """Store the ordered concept pair of every sample under ``'query'``.

    FORWARDS templates keep the concept order, BACKWARDS templates swap it.
    Samples with any other template value are left without a ``'query'``.

    Args:
        dataset: mapping of key -> sample dict with ``'template'`` and
            ``'concepts'`` (mutated in place).
    """
    for sample in dataset.values():
        template = sample['template']
        if template == ASKING_FOR_RELATION_FORWARDS:
            first, second = 0, 1
        elif template == ASKING_FOR_RELATION_BACKWARDS:
            first, second = 1, 0
        else:
            continue
        sample['query'] = (sample['concepts'][first],
                           sample['concepts'][second])

In [29]:
# Adds 'query' in place to the selected questions.
create_queries(questions_train)
create_queries(questions_valid)

# Answer Encoding

In [30]:
def extract_answers(dataset):
    """Label every sample as "Yes", "No" or "No Answer Present.".

    Every sample gets a fresh ``'answer:Extracted'`` list. The checks are
    tried in the order yes / no / no answer; the label of the first check that
    holds is appended and the sample is kept. Samples for which no check holds
    keep an empty list and are not part of the result.

    Args:
        dataset: mapping of key -> sample dict (mutated in place).

    Returns:
        list of the labelled sample dicts, in iteration order.
    """
    checks = (
        (answer_is_yes, "Yes"),
        (answer_is_no, "No"),
        (is_question_without_answer, "No Answer Present."),
    )

    new_dataset = []
    for sample in dataset.values():
        extracted = sample['answer:Extracted'] = []
        for applies, label in checks:
            if applies(sample):
                extracted.append(label)
                new_dataset.append(sample)
                break

    return new_dataset

In [31]:
# Lists of the samples that received an extracted answer label.
final_qa_dataset_train = extract_answers(questions_train)
final_qa_dataset_valid = extract_answers(questions_valid)

In [32]:
# Count the samples (both splits) that are answered with yes or no.
number_binary = sum(
    1 for sample in final_qa_dataset_train + final_qa_dataset_valid
    if is_binary(sample))

print(number_binary)

2169


In [33]:
# Total number of samples in the final dataset (both splits).
len(final_qa_dataset_train) + len(final_qa_dataset_valid)

4287

# Entity Linking

In [34]:
def disambiguate(text, timeout=60):
    """Send ``text`` to the local disambiguation endpoint and decode the reply.

    The request now carries a timeout: without one, ``requests`` waits
    indefinitely, so a stalled server would hang the entity-linking loop.

    Args:
        text: value of the ``text`` query parameter (``entity_linking`` passes
            an XML annotation).
        timeout: seconds to wait for the server; ``None`` restores the former
            unbounded wait.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    url = 'http://localhost:2222/rest/disambiguate/'
    headers = {'accept': 'application/json'}
    payload = {'text': text, 'confidence': '0.4', 'support': "0"}
    response = requests.get(url, params=payload, headers=headers,
                            timeout=timeout)
    return response.json()


def entity_linking(text, offset):
    """Link a surface form to an entity URI via ``disambiguate``.

    The annotation XML is built with ``quoteattr`` so that characters such as
    ``&``, ``<`` or ``"`` inside ``text`` no longer produce malformed XML; for
    text without such characters the query is unchanged. A reply without any
    resource (missing key or empty list) yields ``None`` instead of raising
    ``IndexError``.

    Args:
        text: surface form to link; also used as the annotation text.
        offset: value written to the ``offset`` attribute of the surface form.

    Returns:
        The ``'@URI'`` of the first entry of ``result['Resources']``, or
        ``None`` when the reply contains no resources.
    """
    quoted_text = quoteattr(text)
    query = "<annotation text=" + quoted_text + ">\n"
    query += "\t<surfaceForm name=" + quoted_text
    query += " offset=" + quoteattr(str(offset)) + " />\n"
    query += "</annotation>"

    result = disambiguate(query)
    if not isinstance(result, dict):
        return None
    resources = result.get('Resources')
    if not resources:
        return None
    return resources[0]['@URI']

In [35]:
def get_query(sample, concept_range):
    """Cut the text of a concept span out of the original question.

    Args:
        sample: dict with ``'question'`` and ``'question:POS'`` (tuples of
            ``(token, pos, offset_as_str)``).
        concept_range: ``[first_index, last_index]`` token indices, inclusive.

    Returns:
        ``(text, start)``: the question substring from the first token's
        offset to the end of the last token, and that start offset.
    """
    tagged = sample['question:POS']
    first_token = tagged[concept_range[0]]
    last_token = tagged[concept_range[1]]

    start = int(first_token[2])
    end = int(last_token[2]) + len(last_token[0])
    return sample['question'][start:end], start

In [36]:
'''
    Start dbpedia spotlight
    (more instructions in README.md)

    $ cd ../../data/downloads/dbpedia-spotlight/
    $ java -jar -Xmx30G -Xms30G dbpedia-spotlight-1.0.0.jar en \
    http://localhost:2222/rest
'''
# Smoke test for the entity linker; it issues a request to the local server
# described above, so that server has to be running before this cell.
expected_entity = 'http://dbpedia.org/resource/Tobacco_smoking'
assert entity_linking("Tobacco Smoking", 0) == expected_entity

In [37]:
# Link both query concepts of every sample; the first query element is stored
# as the cause entity, the second as the effect entity.
for sample in final_qa_dataset_train + final_qa_dataset_valid:
    sample['entities:dbo'] = [
        entity_linking(*get_query(sample, sample['query'][position]))
        for position in (0, 1)
    ]

# Save

In [38]:
def save(dataset, name):
    """Serialize ``dataset`` as JSON into ``PATH_SAVE_DATASET + name``.

    The dataset is serialized before the file is opened, so a serialization
    error leaves an existing file untouched. The file is written inside a
    ``with`` block so the handle is closed even when the write fails (the
    previous version leaked it in that case).

    Args:
        dataset: JSON-serializable object.
        name: file name appended to ``PATH_SAVE_DATASET``.
    """
    jsonarray = json.dumps(dataset)
    with open(PATH_SAVE_DATASET + name, "w") as dataset_file:
        dataset_file.write(jsonarray)

In [39]:
# Write both splits as JSON files below PATH_SAVE_DATASET.
save(final_qa_dataset_train, "causality-qa-training.json")
save(final_qa_dataset_valid, "causality-qa-validation.json")