# Preamble

In [1]:
from nltk import word_tokenize
import unicodedata
import json
import nltk
import bz2

In [2]:
# JSON graph files that are merged into one graph. The order matters: when the
# same cause/effect pair occurs more than once, the first occurrence is kept
# and the sources of later occurrences are appended to it.
PATH_CAUSALITY_GRAPH_PARTS = [
    # Wikipedia-derived graphs
    "../../data/causality-graphs/spotting/wikipedia/infobox-graph.json",
    "../../data/causality-graphs/spotting/wikipedia/list-graph.json",
    "../../data/causality-graphs/spotting/wikipedia/text-graph.json",
    # ClueWeb12-derived graph
    "../../data/causality-graphs/spotting/clueweb12/clueweb-graph.json",
]

# Directory that NLTK resources are downloaded to and loaded from.
PATH_NLTK_RESOURCES = "../../data/external/nltk/"

# Destination of the merged graph: bz2-compressed, one JSON object per line.
PATH_OUTPUT_GRAPH = "../../data/causality-graphs/integration/causenet-full.jsonl.bz2"

In [3]:
# Fetch the tokenizer data that `word_tokenize` needs into the project-local
# NLTK directory. Older NLTK releases load 'punkt'; releases from 3.8.2 on
# load 'punkt_tab' instead, so both are requested to keep the notebook
# runnable regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, PATH_NLTK_RESOURCES)

# Register the directory so NLTK can locate the downloaded data. The
# membership check keeps repeated runs of this cell from piling up duplicates.
if PATH_NLTK_RESOURCES not in nltk.data.path:
    nltk.data.path.append(PATH_NLTK_RESOURCES)

[nltk_data] Downloading package punkt to
[nltk_data]     ../../data/downloads/nltk_data/...
[nltk_data]   Package punkt is already up-to-date!


# Load and Merge Causality Graphs

In [4]:
# Concatenate the relation lists of all graph parts, in the configured order.
# Each file's top-level JSON value is extended into the list, so every file is
# expected to hold a JSON array of relation objects.
final_graph = []
for path in PATH_CAUSALITY_GRAPH_PARTS:
    # The context manager closes each file as soon as it has been parsed, and
    # the explicit encoding avoids locale-dependent decoding of the JSON text.
    with open(path, encoding="utf-8") as graph_file:
        final_graph.extend(json.load(graph_file))

In [5]:
def normalize(string):
    """Return a canonical, underscore-joined form of a concept string.

    The string is split with NLTK's ``word_tokenize``; every token is
    lower-cased and then put into Unicode NFKC form, and the resulting tokens
    are joined with ``'_'``.

    Per the original author's note this follows ConceptNet's normalization;
    see https://en.wikipedia.org/wiki/Unicode_equivalence#Normalization for
    background on the normalization forms.
    """
    normalized_tokens = []
    for token in word_tokenize(string):
        lowered = token.lower()
        normalized_tokens.append(unicodedata.normalize('NFKC', lowered))
    return '_'.join(normalized_tokens)

In [6]:
# Rewrite the concept string of both endpoints of every relation, in place,
# to its normalized form so that equal concepts compare equal later on.
for relation in final_graph:
    causal_relation = relation['causal_relation']
    for role in ('cause', 'effect'):
        endpoint = causal_relation[role]
        endpoint['concept'] = normalize(endpoint['concept'])

In [7]:
# Maps "cause -> effect" identifiers to the index of their first occurrence
# in `final_graph`.
existing = {}

for position, relation in enumerate(final_graph):
    causal_relation = relation['causal_relation']
    relation_id = " -> ".join((causal_relation['cause']['concept'],
                               causal_relation['effect']['concept']))

    # setdefault records this index when the identifier is new and otherwise
    # returns the index stored for the earlier occurrence.
    first_position = existing.setdefault(relation_id, position)
    is_duplicate = first_position != position

    if is_duplicate:
        # Fold this relation's sources into the first occurrence.
        first_relation = final_graph[first_position]
        first_relation['sources'].extend(relation['sources'])

    # 1 marks the relation for deletion, 0 keeps it.
    relation['can_delete'] = 1 if is_duplicate else 0

In [8]:
# Collect the relations that were not marked for deletion, preserving their
# original order.
causenet = []
for relation in final_graph:
    if relation['can_delete'] == 0:
        causenet.append(relation)

In [9]:
# Drop the temporary bookkeeping flag so it does not end up in the output.
# `pop` with a default (instead of `del`) keeps this cell safe to re-run:
# a second run finds the key already gone and simply does nothing.
for relation in causenet:
    relation.pop('can_delete', None)

# Save CauseNet

In [10]:
# Write CauseNet as bz2-compressed JSON Lines: one relation object per line.
# The context manager guarantees the compressed stream is flushed and closed
# even if serializing a relation fails part-way through, so no file handle
# leaks; the encoding is spelled out rather than taken from the locale.
with bz2.open(PATH_OUTPUT_GRAPH, "wt", encoding="utf-8") as target:
    for line in causenet:
        target.write(json.dumps(line))
        target.write("\n")