# Preamble

In [1]:
from flair.models import SequenceTagger
from flair.data import Sentence, Token
import stanfordnlp
import numpy as np
import json

In [2]:
# Input: TSV extraction file that the sentence-loading cell iterates over.
PATH_CLUEWEB_EXTRACTION = (
    "../../data/causality-graphs/extraction/"
    "clueweb12/clueweb12-extraction.tsv"
)
# Folder holding the trained flair tagger ('final-model.pt' is appended
# where the classifier is loaded).
PATH_FLAIR_FOLDER = "../../data/flair-models/sentences/"

# Directory handed to stanfordnlp both as download target and models_dir.
PATH_STANFORD_RESOURCES = "../../data/external/stanfordnlp/"

# Output: JSON file the final text graph is written to.
PATH_OUTPUT_GRAPH = (
    "../../data/causality-graphs/spotting/"
    "clueweb12/clueweb-graph.json"
)

In [3]:
# Download the English stanfordnlp resources into PATH_STANFORD_RESOURCES
# and build the global `stanford_nlp` pipeline (tokenize + pos processors)
# that pos_tagging() calls.
stanfordnlp.download('en', PATH_STANFORD_RESOURCES)
# NOTE(review): tokenize_pretokenized=True presumably makes the pipeline keep
# the whitespace/newline tokenisation of its input instead of re-tokenising;
# pos_tagging() joins tokens with ' ' and sentences with a blank line on that
# assumption - confirm against the stanfordnlp docs.
# use_gpu=True requests GPU execution for the pipeline.
stanford_nlp = stanfordnlp.Pipeline(processors='tokenize,pos',
                                    tokenize_pretokenized=True,
                                    models_dir=PATH_STANFORD_RESOURCES,
                                    treebank='en_ewt',
                                    use_gpu=True)

Using the default treebank "en_ewt" for language "en".
Use device: gpu
	Torch-GPU-ID: 0
---
Loading: tokenize
With settings: 
{'model_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'pretokenized': True, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '../../data/downloads/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


# Loading sentence data

In [4]:
# Build `sentence_set`: one record per causal match found in a
# 'clueweb12_sentence' row of the extraction TSV.
#
# Columns of such a row, as consumed below:
#   0 record type, 1 page id, 2 page reference, 3 page timestamp,
#   4 sentence surface (JSON), 5 tokens (JSON), 6 dependencies (JSON),
#   7 list of matches (JSON), each match carrying a 'Pattern' entry.
sentence_set = []

# The context manager guarantees the file handle is closed, even if a row
# fails to parse halfway through the file.
with open(PATH_CLUEWEB_EXTRACTION, encoding="utf-8") as extraction_file:
    for line in extraction_file:
        parts = line.strip().split('\t')
        if parts[0] != 'clueweb12_sentence':
            # other record types are not handled by this notebook
            continue
        assert len(parts) == 8, "unexpected column count in sentence row"

        for match in json.loads(parts[7]):
            # The JSON columns are parsed again for every match on purpose:
            # each record gets its own objects, so later in-place edits
            # (e.g. adding 'POS') never leak between records.
            sentence_data = {
                "causal_relation": match,
                "sources": [{
                    "type": "clueweb12_sentence",
                    "payload": {
                        "clueweb12_page_id": parts[1],
                        "clueweb12_page_reference": parts[2],
                        "clueweb12_page_timestamp": parts[3],
                        "sentence": {
                            "surface": json.loads(parts[4]),
                            "tokens": json.loads(parts[5]),
                            "dependencies": json.loads(parts[6])
                            },
                        "path_pattern": match['Pattern']
                        }
                    }
                ]
            }
            sentence_set.append(sentence_data)

# POS-Tagging

In [5]:
def get_offset_of_tags_in_sentence(sentence, tags):
    """Locate each tag's text inside `sentence`, scanning left to right.

    Args:
        sentence: string to search in.
        tags: sequence of indexable entries whose first element is the
            token text to look for.

    Returns:
        List with one character offset (relative to the start of the
        original `sentence`) per tag. Each search starts right after the
        previous hit, so repeated tokens map to successive occurrences.
        When a text is not found, `str.find` yields -1 and that value is
        folded into the reported offset unchanged.
    """
    remaining = sentence
    consumed = 0
    positions = []
    for entry in tags:
        surface_form = entry[0]
        relative = remaining.find(surface_form)
        positions.append(consumed + relative)

        # drop everything up to and including the token just located
        step = relative + len(surface_form)
        remaining = remaining[step:]
        consumed += step
    return positions


def get_pos_tags_of_sentence(sentence):
    """Flatten one parsed sentence into a list of (word text, word pos) pairs.

    Walks `sentence.tokens` in order and, within each token, its `words`
    in order.
    """
    return [(word.text, word.pos)
            for token in sentence.tokens
            for word in token.words]


def calculate_pos_tags_for_string(doc):
    """Collect the POS pairs of every sentence in a parsed document.

    Returns:
        One list per entry of `doc.sentences`, each holding the
        (text, pos) pairs produced by get_pos_tags_of_sentence().
    """
    return [list(get_pos_tags_of_sentence(parsed))
            for parsed in doc.sentences]

In [6]:
def pos_tagging(sentences_to_predict):
    """POS-tag a batch of samples in place.

    For every sample, the token list under
    sources[0].payload.sentence.tokens is tagged with the global
    `stanford_nlp` pipeline and the result is stored next to it under the
    key 'POS' as a list of (text, pos, offset) tuples, where offset is the
    token's character position in the sentence surface, kept as a string.

    Raises:
        AssertionError: if the pipeline does not return exactly one
            sentence per input sample.
    """
    sentence_records = [sample['sources'][0]['payload']['sentence']
                        for sample in sentences_to_predict]

    # one pipeline call for the whole batch is faster than one per sentence;
    # sentences are separated by a blank line, tokens by a single space
    joined = '\n\n'.join(' '.join(record['tokens'])
                         for record in sentence_records)
    tags = calculate_pos_tags_for_string(stanford_nlp(joined))

    assert len(tags) == len(sentences_to_predict)

    for record, sentence_tags in zip(sentence_records, tags):
        offsets = get_offset_of_tags_in_sentence(record['surface'],
                                                 sentence_tags)
        record['POS'] = [(tag[0], tag[1], str(offset))
                         for tag, offset in zip(sentence_tags, offsets)]

# Text-Spotter: Prediction

In [7]:
def prepare(batch):
    """Turn POS-tagged samples into flair Sentence objects.

    Each entry of a sample's 'POS' list becomes one Token carrying the tags
    'POS' (entry[1]) and 'idx' (entry[2]). Samples with more than 200 tokens
    are left out because of GPU memory limits, so the result can be shorter
    than `batch`; get_relations() applies the same rule to stay aligned.
    """
    prepared = []

    for sample in batch:
        flair_sentence = Sentence(use_tokenizer=False)

        sentence_info = sample['sources'][0]['payload']['sentence']
        tokens = sentence_info['tokens']
        pos_entries = sentence_info['POS']
        if len(tokens) > 200:
            # too long for the GPU - skip
            continue

        for entry in pos_entries:
            flair_token = Token(entry[0])
            flair_token.add_tag('POS', entry[1])
            flair_token.add_tag('idx', entry[2])
            flair_sentence.add_token(flair_token)

        prepared.append(flair_sentence)
    return prepared

In [8]:
def predict(sentences, mini_batches):
    """Run the global `classifier` on `sentences` and read out its spans.

    Args:
        sentences: flair sentences; they are tagged in place by the
            classifier.
        mini_batches: passed through to `classifier.predict` as second
            positional argument.

    Returns:
        One `[texts, indices]` pair per sentence. `indices` holds, for each
        'chunk_BIO' span, the positions of its tokens (`token.idx - 1`, the
        value used to index `sentence.tokens`); `texts` holds the
        space-joined token texts of the same spans.
    """
    classifier.predict(sentences, mini_batches)

    results = []
    for tagged in sentences:
        span_indices = []
        span_texts = []
        for span in tagged.get_spans('chunk_BIO'):
            positions = [token.idx - 1 for token in span.tokens]
            span_indices.append(positions)
            span_texts.append(
                ' '.join(tagged.tokens[position].text
                         for position in positions))
        results.append([span_texts, span_indices])

    return results

In [9]:
def find_match(relation, indices):
    """Find the spans that contain the cause and the effect of a relation.

    Args:
        relation: mapping with 'Cause' and 'Effect' entries; element [1] of
            each is converted with int() and treated as a token index.
        indices: list of spans, each a collection of token indices.

    Returns:
        `[cause_position, effect_position]` - positions within `indices` of
        the spans holding the cause and the effect index - or `[]` when
        either index is in no span or both fall into the same span.
    """
    cause_index = int(relation['Cause'][1])
    effect_index = int(relation['Effect'][1])

    cause_match = None
    effect_match = None

    # enumerate() yields the position directly; looking it up again with
    # indices.index() costs a linear search per hit and would report the
    # wrong span if two spans happened to compare equal.
    for position, index_range in enumerate(indices):
        if cause_index in index_range:
            cause_match = position
        if effect_index in index_range:
            effect_match = position

        if (cause_match is not None
                and effect_match is not None
                and cause_match != effect_match):
            return [cause_match, effect_match]
    return []


def get_relations(batch, prediction):
    """Combine pattern-based relations with the tagger's spans.

    Args:
        batch: the samples that were passed to prepare().
        prediction: output of predict() for the prepared sentences; it has
            no entry for samples longer than 200 tokens, which prepare()
            skipped.

    Returns:
        List of dicts with a 'causal_relation' (cause/effect, each with the
        span text as 'concept' and its (text, pos, offset) tuples as 'POS',
        offsets being relative to the concept string) and the sample's
        'sources'. Samples for which find_match() returns no pair are
        dropped.
    """
    def with_local_offsets(concept, pos_entries):
        # re-anchor the character offsets on the concept string
        local = get_offset_of_tags_in_sentence(concept, pos_entries)
        return [(entry[0], entry[1], str(local[k]))
                for k, entry in enumerate(pos_entries)]

    extracted = []
    consumed_predictions = 0

    for sample in batch:
        sentence_info = sample['sources'][0]['payload']['sentence']
        tokens = sentence_info['tokens']
        sentence_pos = sentence_info['POS']
        if len(tokens) > 200:
            # prepare(batch) skipped this sample (GPU memory limitation),
            # so there is no prediction to consume for it
            continue

        path_pattern_extraction = sample['causal_relation']
        spotting_extraction, indices = prediction[consumed_predictions]
        consumed_predictions += 1

        match = find_match(path_pattern_extraction, indices)
        if len(match) < 2:
            # the tagger failed to isolate both concepts - disregard
            continue
        cause_match, effect_match = match

        cause = spotting_extraction[cause_match]
        effect = spotting_extraction[effect_match]

        # concept POS (kept for the later post-processing step)
        cause_pos = with_local_offsets(
            cause, [sentence_pos[j] for j in indices[cause_match]])
        effect_pos = with_local_offsets(
            effect, [sentence_pos[j] for j in indices[effect_match]])

        extracted.append({
            'causal_relation': {
                'cause': {'concept': cause, 'POS': cause_pos},
                'effect': {'concept': effect, 'POS': effect_pos},
            },
            'sources': sample['sources'],
        })
    return extracted

In [10]:
# Load the trained flair sequence tagger from PATH_FLAIR_FOLDER; `classifier`
# is the module-level object that predict() calls.
classifier = SequenceTagger.load(PATH_FLAIR_FOLDER + 'final-model.pt')

2020-10-16 11:42:33,845 loading file ../../data/flair-models/sentences/final-model.pt


In [11]:
# Run the full pipeline (POS tagging -> flair prediction -> relation
# matching) batch by batch and collect every relation in `text_graph`.
text_graph = []

batch_size = 512 if len(sentence_set) > 512 else 32

# Integer ceil division: np.array_split needs a positive integer number of
# sections. The former float `len(sentence_set)/batch_size` was truncated to
# 0 for fewer than `batch_size` sentences (ValueError) and, being floored,
# allowed batches larger than `batch_size`.
num_batches = (len(sentence_set) + batch_size - 1) // batch_size
# nothing to split (and nothing to tag) when no sentence was loaded
batches = np.array_split(sentence_set, num_batches) if num_batches else []

for batch in batches:
    pos_tagging(batch)  # adds 'POS' to every sample in place
    prepared_sentences = prepare(batch)
    prediction = predict(prepared_sentences, mini_batches=32)
    batch_relations = get_relations(batch, prediction)

    text_graph.extend(batch_relations)

# Postprocessing

In [12]:
def post_process(value, pos_tags):
    """Strip leading/trailing function words and punctuation from a concept.

    Args:
        value: the concept string.
        pos_tags: sequence of (text, pos, offset) entries for the concept's
            tokens, where offset is the token's character position in
            `value` (int or numeric string).

    Returns:
        The substring of `value` from the first to the last token whose tag
        is not in the cutoff list. Returns '' when `pos_tags` is empty or
        every token is cut off (e.g. a concept that is only a pronoun);
        indexing past the list used to raise IndexError there.
    """
    # NOTE(review): ';', '(' and ')' are compared against POS tags, not
    # token texts - confirm the tagger really emits these as tag values.
    punctuation = ['.', ',', ';', '(', ')', '``', "''"]
    cutoff = set(['CC', 'DT', 'PRP', 'PRP$'] + punctuation)

    left = 0
    right = len(pos_tags) - 1

    # advance past leading cut-off tokens
    while left <= right and pos_tags[left][1] in cutoff:
        left += 1
    # retreat past trailing cut-off tokens
    while right >= left and pos_tags[right][1] in cutoff:
        right -= 1

    if left > right:
        # nothing survives the trimming
        return ''

    start = int(pos_tags[left][2])
    end = int(pos_tags[right][2]) + len(pos_tags[right][0])
    return value[start:end]

In [13]:
# Replace every relation's cause/effect by its trimmed concept string and
# drop the helper POS data that was only needed for the trimming.
for relation in text_graph:
    extracted = relation['causal_relation']

    cause = post_process(extracted['cause']['concept'],
                         extracted['cause']['POS'])
    effect = post_process(extracted['effect']['concept'],
                          extracted['effect']['POS'])

    relation['causal_relation'] = {
        'cause': {'concept': cause},
        'effect': {'concept': effect},
    }

    # further cleanup: the sentence-level POS list is not part of the output
    sentence_payload = relation['sources'][0]['payload']['sentence']
    del sentence_payload['POS']

# Save Text-graph

In [14]:
# Serialise the whole graph as one JSON array and write it to
# PATH_OUTPUT_GRAPH. The context manager closes the file even if the write
# fails; the explicit encoding keeps the output independent of the
# platform's default.
jsonarray = json.dumps(text_graph)
with open(PATH_OUTPUT_GRAPH, "w", encoding="utf-8") as file_list_graph:
    file_list_graph.write(jsonarray)