# Preamble

In [1]:
from flair.models import SequenceTagger
from flair.data import Sentence, Token
from nltk import word_tokenize, pos_tag
import stanfordnlp
import nltk
import json
import re

In [2]:
# Tab-separated extraction file that is read in the loading section.
PATH_WIKIPEDIA_EXTRACTION = (
    "../../data/causality-graphs/extraction/"
    "wikipedia/wikipedia-extraction.tsv-backup"
)
# Folder holding the Flair SequenceTagger model file ('final-model.pt').
PATH_FLAIR_FOLDER = "../../data/flair-models/infoboxes/"

# Target folders for the NLTK and stanfordnlp resource downloads.
PATH_NLTK_RESOURCES = "../../data/external/nltk/"
PATH_STANFORD_RESOURCES = "../../data/external/stanfordnlp/"

# JSON file the resulting infobox graph is written to.
PATH_OUTPUT_GRAPH = (
    "../../data/causality-graphs/spotting/"
    "wikipedia/infobox-graph.json"
)

In [3]:
# Fetch the tokenizer and POS-tagger data into the project-local folder,
# then register that folder so NLTK can locate the resources later on.
for nltk_package in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package, PATH_NLTK_RESOURCES)
nltk.data.path.append(PATH_NLTK_RESOURCES)

[nltk_data] Downloading package punkt to ../../data/external/nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ../../data/external/nltk/...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
stanfordnlp.download('en', PATH_STANFORD_RESOURCES)

# Only the tokenizer and the POS tagger are loaded; the models are read from
# the folder they were downloaded to.
stanford_pipeline_settings = {
    'processors': 'tokenize,pos',
    'models_dir': PATH_STANFORD_RESOURCES,
    'treebank': 'en_ewt',
    'use_gpu': True,
}
stanford_nlp = stanfordnlp.Pipeline(**stanford_pipeline_settings)

Using the default treebank "en_ewt" for language "en".
Use device: gpu
	Torch-GPU-ID: 0
---
Loading: tokenize
With settings: 
{'model_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


# Loading infobox data

In [5]:
# Collect every 'wikipedia_infobox' row of the tab-separated extraction file.
infobox_set = []

# The context manager closes the file handle even when a row fails the
# column-count assertion or its JSON value cannot be decoded.
with open(PATH_WIKIPEDIA_EXTRACTION, encoding="utf-8") as extraction_file:
    for line in extraction_file:
        parts = line.strip().split('\t')
        # rows of any other record type are ignored
        if parts[0] != 'wikipedia_infobox':
            continue

        # an infobox row is expected to have exactly nine columns
        assert len(parts) == 9
        infobox_data = {
            # the last column is JSON-encoded
            "value": json.loads(parts[8]),
            "type": "wikipedia_infobox",
            "payload": {
                "wikipedia_page_id": parts[1],
                "wikipedia_page_title": parts[2],
                "wikipedia_revision_id": parts[3],
                "wikipedia_revision_timestamp": parts[4],
                # [1:-1] drops the first and last character of the column
                # (presumably surrounding quotes -- TODO confirm)
                "infobox_template": parts[5][1:-1],
                "infobox_title": parts[6][1:-1],
                "infobox_argument": parts[7][1:-1]
            }
        }
        infobox_set.append(infobox_data)

# Preprocessing

In [6]:
def is_valid_article(title):
    """Tell whether a page title is free of all forbidden title parts.

    The check is a case-sensitive substring test, so a forbidden part
    anywhere in the title disqualifies it.

    Args:
        title: Page title to inspect.

    Returns:
        False if the title contains any forbidden part, True otherwise.
    """
    forbidden_title_parts = ('Wikipedia:', 'Template:', 'File:',
                             'Portal:', 'Category:', 'Draft:',
                             'List of', 'disambiguation')
    return not any(part in title for part in forbidden_title_parts)

In [7]:
def is_causal_infobox(infobox_template):
    """Tell whether an infobox template is not on the exclusion list.

    The template name is lower-cased before the lookup; it has to equal an
    excluded name exactly to be rejected.

    Args:
        infobox_template: Name of the infobox template.

    Returns:
        False for an excluded template, True for every other one.
    """
    excluded_templates = {
        'infobox former arab villages in palestine',
        'infobox soap character',
        'infobox serial killer',
        'infobox criminal',
        'infobox mass murderer',
        'infobox murderer',
    }
    normalized = infobox_template.lower()
    if normalized in excluded_templates:
        return False
    return True

In [8]:
# Infoboxes that pass all filters, plus the page ids seen per (lower-cased)
# template name.
infobox_data_filtered = []
statistics = {}

for infobox in infobox_set:
    payload = infobox['payload']

    # The checks run in this order and stop at the first one that fails.
    keep = (is_valid_article(payload['wikipedia_page_title'])
            and is_causal_infobox(payload['infobox_template'])
            and payload['infobox_argument'] != "result")
    if not keep:
        continue

    template = payload['infobox_template'].lower()
    if template not in statistics:
        statistics[template] = []
    statistics[template].append(payload['wikipedia_page_id'])

    infobox_data_filtered.append(infobox)

In [9]:
# Keep the templates that occur on at least ten distinct pages. The result is
# a set, so no ordering of the candidates is needed.
considered_infoboxes = {
    template_name
    for template_name, page_ids in statistics.items()
    if len(set(page_ids)) >= 10
}
considered_infoboxes

{'infobox birth control',
 'infobox bus accident',
 'infobox civil conflict',
 'infobox event',
 'infobox medical condition (new)',
 'infobox military conflict',
 'infobox news event',
 'infobox oil spill',
 'infobox rail accident',
 'infobox wildfire'}

In [10]:
def preprocess(string):
    """Normalise a raw infobox value into a single cleaned line.

    Steps, in order:
      1. Non-empty lines are stripped and joined with '; '.
      2. Parenthesised groups are removed, each one individually. The
         previous greedy pattern removed everything from the first '(' to
         the last ')' and also swallowed the whitespace on both sides,
         gluing the neighbouring words together.
      3. Headings of the form "word:" are removed, at the start and
         within the string.

    Args:
        string: Raw infobox value.

    Returns:
        The cleaned string; it may be empty.
    """
    # remove newlines
    string = '; '.join([s.strip()
                        for s in string.split('\n')
                        if s.strip() != ''])

    # remove brackets: drop innermost "(...)" groups repeatedly so nested
    # groups vanish too; only the whitespace *before* a group is consumed,
    # which keeps the words around it separated
    previous = None
    while previous != string:
        previous = string
        string = re.sub(r"\s?\([^()]*\)", "", string)
    string = string.strip()

    # Remove leading list headings like "Initially: ... Later: ..."
    # heading at beginning
    string = re.sub(r"^\w+:\s?", "", string).strip()
    # heading within the string
    string = re.sub(r"\w+:\s?", "", string).strip()

    return string

In [11]:
# Build the spotting samples for the considered templates.
infoboxes_for_spotting = []

for infobox in infobox_data_filtered:
    template = infobox['payload']['infobox_template'].lower()
    if template not in considered_infoboxes:
        continue

    # Keep the cleaned text in a local variable instead of writing it back
    # into the shared infobox dict: the upstream data stays untouched and
    # re-running this cell yields the same result.
    cleaned_value = preprocess(infobox['value'])

    # values that are empty after cleaning are dropped
    if len(cleaned_value) == 0:
        continue

    result = {'value': cleaned_value,
              "sources": [{'type': infobox['type'],
                           'payload': infobox['payload']}]}
    infoboxes_for_spotting.append(result)

# POS-Tagging

In [12]:
def get_offset_of_tags_in_sentence(sentence, tags):
    """Locate the start offset of every token text within ``sentence``.

    The tokens are searched from left to right; each search starts right
    after the previously matched token.

    NOTE: a function with the same name is defined again in the
    post-processing section of this notebook.

    Args:
        sentence: The string the tokens were taken from.
        tags: Sequence of tuples whose first element is the token text.

    Returns:
        A list with one integer offset per entry of ``tags``.
    """
    offsets = []
    cursor = 0
    for tag in tags:
        label = tag[0]
        position = sentence.find(label, cursor)

        if position == -1:
            # The token text does not occur verbatim (e.g. a tokenizer that
            # rewrites characters). Previously the -1 was used as an offset
            # and corrupted the slicing for the following tokens. Fall back
            # to the next non-whitespace position and consume nothing.
            position = cursor
            while position < len(sentence) and sentence[position].isspace():
                position += 1
            cursor = position
        else:
            cursor = position + len(label)

        offsets.append(position)
    return offsets


def get_pos_tags_of_sentence(sentence):
    """Collect ``(text, pos)`` pairs for all words of a parsed sentence.

    Args:
        sentence: Object exposing ``tokens``; every token exposes ``words``
            and every word exposes ``text`` and ``pos``.

    Returns:
        A list of ``(word.text, word.pos)`` tuples in token/word order.
    """
    return [(word.text, word.pos)
            for token in sentence.tokens
            for word in token.words]


def calculate_pos_tags_for_string(doc):
    """Return the POS tags of a parsed document, grouped by sentence.

    Args:
        doc: Object exposing ``sentences``.

    Returns:
        A list with one list of ``(text, pos)`` tuples per sentence.
    """
    return [list(get_pos_tags_of_sentence(sentence))
            for sentence in doc.sentences]


def pos_tagging(sentence):
    """POS-tag a string and attach the character offset of every token.

    The tags of all sentences the pipeline detects are used. Previously
    only the first detected sentence was tagged, so any later sentence of
    the value was silently dropped, and a document without sentences
    raised an IndexError.

    Args:
        sentence: Text to tag.

    Returns:
        A list of ``(text, pos, offset)`` tuples, where ``offset`` is the
        token's start position in ``sentence``, given as a string.
    """
    doc = stanford_nlp(sentence)

    # flatten the per-sentence tag lists into one token sequence
    tags = [tag
            for sentence_tags in calculate_pos_tags_for_string(doc)
            for tag in sentence_tags]

    offsets = get_offset_of_tags_in_sentence(sentence, tags)
    return [(tag[0], tag[1], str(offset))
            for tag, offset in zip(tags, offsets)]

In [13]:
# Store the (text, POS tag, offset) triples of each value next to the value.
for sample in infoboxes_for_spotting:
    sample.update({'value:POS': pos_tagging(sample['value'])})

# Infobox Spotter: Prediction

In [15]:
def get_index(chunk):
    """Return the ``[start, end]`` character range covered by a span.

    ``start`` is the 'idx' tag value of the first token; ``end`` is the
    'idx' tag value of the last token plus the length of its text.

    Args:
        chunk: Span exposing ``tokens``; each token carries an 'idx' tag
            and a ``text`` attribute.

    Returns:
        A two-element list of integers ``[start, end]``.
    """
    first_token = chunk.tokens[0]
    last_token = chunk.tokens[-1]

    start = int(first_token.get_tag('idx').value)
    end = int(last_token.get_tag('idx').value) + len(last_token.text)
    return [start, end]

In [4]:
# Load the sequence tagger from the configured Flair model folder.
flair_model_path = PATH_FLAIR_FOLDER + 'final-model.pt'
classifier = SequenceTagger.load(flair_model_path)

2020-10-16 11:43:29,318 loading file ../../data/flair-models/infoboxes/final-model.pt


In [17]:
for sample in infoboxes_for_spotting:
    # Assemble the sentence token by token from the stored POS triples, so
    # every token carries its POS tag and its character offset ('idx').
    sentence = Sentence(use_tokenizer=False)
    for text, pos_label, char_offset in sample['value:POS']:
        token = Token(text)
        token.add_tag('POS', pos_label)
        token.add_tag('idx', char_offset)
        sentence.add_token(token)

    classifier.predict(sentence)

    # Map every predicted span back to the substring of the value it covers.
    span_ranges = [get_index(chunk)
                   for chunk in sentence.get_spans('chunk_BIO')]
    sample['extraction'] = [sample['value'][start:end]
                            for start, end in span_ranges]

# Post-processing

In [18]:
def get_relation(page_title, infobox_title,
                 infobox_type, infobox_argument, value):
    """Order an extracted value and its subject as a two-element tuple.

    The subject is ``infobox_title`` unless that equals the string 'None',
    in which case ``page_title`` is used. The caller in this notebook
    treats the first tuple element as the cause and the second as the
    effect.

    Args:
        page_title: Title of the page.
        infobox_title: Title of the infobox, or the string 'None'.
        infobox_type: Template name; only consulted for 'risks'.
        infobox_argument: Infobox argument the value belongs to.
        value: The extracted value.

    Returns:
        ``(subject, value)`` for 'symptoms' and for 'risks' of
        'infobox birth control'; ``(value, subject)`` for 'cause',
        'causes' and for 'risks' of 'infobox medical condition (new)'.

    Raises:
        Exception: For every other argument/template combination.
    """
    subject = page_title if infobox_title == 'None' else infobox_title

    # True -> value comes first, False -> subject comes first,
    # None -> combination is not covered.
    value_first = None
    if infobox_argument == 'symptoms':
        value_first = False
    elif infobox_argument in ('cause', 'causes'):
        value_first = True
    elif infobox_argument == 'risks':
        risks_orientation = {
            'infobox medical condition (new)': True,
            'infobox birth control': False,
        }
        value_first = risks_orientation.get(infobox_type.lower())

    if value_first is None:
        raise Exception("Not handled.")

    return (value, subject) if value_first else (subject, value)

In [19]:
def get_offset_of_tags_in_sentence(sentence, tags):
    """Locate the start offset of every token text within ``sentence``.

    The tokens are searched from left to right; each search starts right
    after the previously matched token.

    NOTE: a function with the same name is already defined in the
    POS-tagging section of this notebook; once this cell has run, this
    definition replaces the earlier one.

    Args:
        sentence: The string the tokens were taken from.
        tags: Sequence of tuples whose first element is the token text.

    Returns:
        A list with one integer offset per entry of ``tags``.
    """
    offsets = []
    cursor = 0
    for tag in tags:
        label = tag[0]
        position = sentence.find(label, cursor)

        if position == -1:
            # The token text does not occur verbatim (e.g. a tokenizer that
            # rewrites characters). Previously the -1 was used as an offset
            # and corrupted the slicing for the following tokens. Fall back
            # to the next non-whitespace position and consume nothing.
            position = cursor
            while position < len(sentence) and sentence[position].isspace():
                position += 1
            cursor = position
        else:
            cursor = position + len(label)

        offsets.append(position)

    return offsets


def post_process_value(value):
    """Trim unwanted tokens from both ends of an extracted value.

    Tokens tagged 'CC' or 'DT' are removed from the left end; tokens tagged
    '.', ',', ';' or 'CC' are removed from the right end. A value that
    consists of a single token is returned unchanged.

    Args:
        value: Extracted text span.

    Returns:
        The trimmed substring of ``value``. An empty string is returned
        when the value has no tokens or when every token is trimmed away;
        previously these cases raised an IndexError or returned an
        unrelated slice of the value.
    """
    tags = pos_tag(word_tokenize(value))

    # no tokens at all (empty or whitespace-only input)
    if not tags:
        return ""

    if len(tags) == 1:
        return value

    offset = get_offset_of_tags_in_sentence(value, tags)

    # index of the first token that is kept
    left = 0
    for tag in tags:
        if tag[1] in ['CC', 'DT']:
            left += 1
        else:
            break

    # index of the last token that is kept
    right = len(tags) - 1
    for tag in reversed(tags):
        if tag[1] in ['.', ',', ';', 'CC']:
            right -= 1
        else:
            break

    # the boundaries crossed: nothing is left after trimming
    if left > right:
        return ""

    return value[offset[left]:offset[right] + len(tags[right][0])]

In [20]:
# One graph entry per non-empty, post-processed extraction.
infobox_graph = []

for sample in infoboxes_for_spotting:
    for extracted in sample['extraction']:
        value = post_process_value(extracted)

        if len(value) > 0:
            # the first source's payload describes where the value came from
            payload = sample['sources'][0]['payload']
            cause, effect = get_relation(payload['wikipedia_page_title'],
                                         payload['infobox_title'],
                                         payload['infobox_template'],
                                         payload['infobox_argument'],
                                         value)

            infobox_graph.append({
                'causal_relation': {
                    'cause': {'concept': cause},
                    'effect': {'concept': effect},
                },
                'sources': sample['sources'],
            })

# Save Infobox-graph

In [None]:
# Serialise before opening the file, so a serialisation error cannot leave a
# truncated output file behind.
jsonarray = json.dumps(infobox_graph)

# The context manager closes the file even if writing fails. The file is
# only written, so plain "w" replaces "w+"; the encoding is stated
# explicitly, matching the way the input file is read.
with open(PATH_OUTPUT_GRAPH, "w", encoding="utf-8") as file_list_graph:
    file_list_graph.write(jsonarray)