# Preamble

In [1]:
from IPython.display import HTML, display
from urllib.parse import urlparse
import operator
import tldextract
import numpy as np
import pandas as pd
import tabulate
import bz2
import json

In [2]:
# Input locations, relative to the notebook's working directory.
PATH_CAUSENET = "../../data/causality-graphs/causenet-full.jsonl.bz2"
PATH_CATEGORIZATION = "../../data/categorization/manual_categorization.csv"
PATH_WIKIPEDIA_EXTRACTION = (
    "../../data/causality-graphs/extraction/"
    "wikipedia/wikipedia-extraction.tsv"
)

# Loading CauseNet

In [3]:
def load_jsonl(path):
    """Load a bz2-compressed JSON Lines file.

    Args:
        path: Path of the ``.jsonl.bz2`` file to read.

    Returns:
        A list with one parsed JSON value per line of the file.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON.
    """
    print("Loading... " + path)
    # The context manager closes the handle even if parsing fails; the
    # explicit encoding keeps decoding independent of the platform locale.
    with bz2.open(path, mode='rt', encoding='utf-8') as document:
        return [json.loads(line) for line in document]

In [None]:
# Parse the whole CauseNet dump into memory: one parsed JSON object per line.
causenet = load_jsonl(PATH_CAUSENET)

Loading... ../../data/causality-graphs/causenet-full.jsonl.bz2


## Determine Support

In [5]:
# Support of a relation = number of distinct path patterns among its sources.
# Sources whose payload has no 'path_pattern' do not contribute.
for relation in causenet:
    distinct_patterns = {
        source['payload']['path_pattern']
        for source in relation['sources']
        if 'path_pattern' in source['payload']
    }
    relation['support'] = len(distinct_patterns)

## Determine Subgraphs

In [6]:
def get_subgraph(graph, source_type):
    """Select the relations that have at least one source of ``source_type``.

    Args:
        graph: Iterable of relation dicts, each with a 'sources' list whose
            entries carry a 'type' key.
        source_type: The source type to filter on.

    Returns:
        A new list holding the matching relation objects in input order.
    """
    return [
        relation
        for relation in graph
        if any(source['type'] == source_type
               for source in relation['sources'])
    ]

In [7]:
source_types = ['clueweb12_sentence', 'wikipedia_sentence',
                'wikipedia_list', 'wikipedia_infobox']

# The full graph first, followed by one subgraph per source type.
causality_graphs = {'causenet': causenet}
causality_graphs.update(
    (source_type, get_subgraph(causenet, source_type))
    for source_type in source_types
)

# Table 2: Top 10 patterns of causal relations

In [8]:
# Maps each path pattern to the (stringified) relations it was observed for.
# Deliberately NOT named `pattern_statistics`: the next cell defines a function
# with that name, and sharing it means re-running this cell would overwrite the
# function and break the later `pattern_statistics(...)` call.
relations_by_pattern = {}

for relation in causality_graphs['causenet']:
    for source in relation['sources']:
        payload = source['payload']
        if 'path_pattern' in payload:
            relations_by_pattern.setdefault(payload['path_pattern'], []).append(
                str(relation['causal_relation']))

# Patterns ordered by the number of distinct relations, most frequent first.
sorted_pattern_statistics = sorted(
    relations_by_pattern.items(), key=lambda x: len(set(x[1])), reverse=True)

In [9]:
def pattern_statistics(sorted_pattern_statistics, output='html_table'):
    """Show one row per pattern: its tab-separated parts plus a relation count.

    Args:
        sorted_pattern_statistics: Sequence of ``(pattern, relation_keys)``
            pairs; ``pattern`` is a tab-separated string.
        output: ``'html_table'`` displays an HTML table; any other value
            prints rows joined with ``'\\t& '`` and terminated by `` \\\\``.
    """
    # Five-part patterns are padded with two empty cells only when more than
    # ten patterns are passed in.
    pad_short_patterns = len(sorted_pattern_statistics) > 10

    rows = []
    for pattern_string, relation_keys in sorted_pattern_statistics:
        cells = [part.replace("[", "").replace("]", "")
                 for part in pattern_string.split("\t")]
        if pad_short_patterns and len(cells) == 5:
            cells.extend(["", ""])
        cells.append("{:,}".format(len(set(relation_keys))))
        rows.append(cells)

    if output != 'html_table':
        # print raw latex
        for cells in rows:
            print('\t& '.join(cells) + " \\\\")
        return

    display(HTML(tabulate.tabulate(rows, tablefmt='html')))

In [10]:
# Display the ten patterns backed by the most distinct relations.
pattern_statistics(sorted_pattern_statistics[:10])

0,1,2,3,4,5
cause/N,-nsubj,cause/VB,+dobj,effect/N,904385
cause/N,-nmod:with,associated/VBN,-acl,effect/N,892908
cause/N,-nsubj,lead/VB,+nmod:to,effect/N,783860
cause/N,-nsubj,led/VBD,+nmod:to,effect/N,724978
cause/N,-nsubjpass,associated/VBN,+nmod:with,effect/N,692666
cause/N,-nmod:by,caused/VBN,-acl,effect/N,598639
cause/N,-nsubj,result/VB,+nmod:in,effect/N,552352
cause/N,-nsubj,causes/VBZ,+dobj,effect/N,496426
cause/N,-nsubj,leads/VBZ,+nmod:to,effect/N,491340
cause/N,-nsubj,resulted/VBD,+nmod:in,effect/N,473298


# Table 3: Overview of CauseNet, Overlap and Sources 

## Overview

In [11]:
columns = ['CauseNet', '|V|', '|E|']
df_overview = pd.DataFrame(columns=columns).set_index('CauseNet')

# |V| = distinct cause/effect concepts, |E| = number of relations.
for graph_name, relations in causality_graphs.items():
    concepts = set()
    for sample in relations:
        causal_relation = sample['causal_relation']
        concepts.add(causal_relation['cause']['concept'])
        concepts.add(causal_relation['effect']['concept'])

    df_overview.loc[graph_name] = [len(concepts), len(relations)]

In [12]:
# Render the overview with thousands separators.
df_overview.style.format("{0:,}")

Unnamed: 0_level_0,|V|,|E|
CauseNet,Unnamed: 1_level_1,Unnamed: 2_level_1
causenet,12186310,11609890
clueweb12_sentence,11368371,10872313
wikipedia_sentence,1070686,793593
wikipedia_list,8295,10612
wikipedia_infobox,7201,7880


## Intersections

In [13]:
# For every source-specific subgraph, count how many of its relations are also
# backed by each of the other source types.
intersections = {}
for graph_name, relations in causality_graphs.items():
    if graph_name == 'causenet':
        continue

    overlap_counts = {}
    for relation in relations:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        other_types = dict.fromkeys(
            source['type']
            for source in relation['sources']
            if source['type'] != graph_name
        )
        for other_type in other_types:
            overlap_counts[other_type] = overlap_counts.get(other_type, 0) + 1

    intersections[graph_name] = overlap_counts

In [14]:
# Build a frame indexed by the (E1, E2) pair with one overlap-count column.
columns = ['E1', 'E2', r'|E1$\cap$E2|']
df_intersections = pd.DataFrame(columns=columns).set_index(['E1', 'E2'])

for e1 in intersections:
    for e2 in intersections[e1]:
        # Skip a pair whose mirrored (e2, e1) row was already added, so every
        # unordered pair appears once.
        if (e2, e1) in df_intersections.index:
            continue
        df_intersections.loc[(e1, e2), r'|E1$\cap$E2|'] = intersections[e1][e2]

# Cast the count column to int so it can be formatted with "{0:,}" below.
df_intersections = df_intersections.astype({r'|E1$\cap$E2|': int})

In [15]:
# Largest overlaps first, rendered with thousands separators.
df_intersections.sort_values(
    by=r'|E1$\cap$E2|', ascending=False).style.format("{0:,}")

Unnamed: 0_level_0,Unnamed: 1_level_0,|E1$\cap$E2|
E1,E2,Unnamed: 2_level_1
clueweb12_sentence,wikipedia_sentence,72937
clueweb12_sentence,wikipedia_infobox,939
clueweb12_sentence,wikipedia_list,506
wikipedia_sentence,wikipedia_infobox,209
wikipedia_list,wikipedia_infobox,93
wikipedia_sentence,wikipedia_list,70


## Sources

In [16]:
# Collect, per source type, the page identifier of every source:
# the Wikipedia page id for 'wiki*' types, the ClueWeb12 page reference otherwise.
source_counter = {}
for relation in causality_graphs['causenet']:
    for source in relation['sources']:
        source_type = source['type']
        if source_type not in source_types:
            continue
        if 'wiki' in source_type:
            page_key = 'wikipedia_page_id'
        else:
            page_key = 'clueweb12_page_reference'
        source_counter.setdefault(source_type, []).append(
            source['payload'][page_key])

In [17]:
# Replace each list of page identifiers by its number of distinct entries.
for source_type, page_ids in list(source_counter.items()):
    source_counter[source_type] = len(set(page_ids))

In [18]:
columns = ['Source', 'Pages/Articles']
df_sources = pd.DataFrame(columns=columns).set_index(['Source'])

# One row per source type holding its distinct page/article count.
for source, page_count in source_counter.items():
    df_sources.loc[source] = page_count

In [19]:
# Most pages first, rendered with thousands separators.
df_sources.sort_values(by='Pages/Articles',
                       ascending=False).style.format("{0:,}")

Unnamed: 0_level_0,Pages/Articles
Source,Unnamed: 1_level_1
clueweb12_sentence,12111758
wikipedia_sentence,427893
wikipedia_infobox,2725
wikipedia_list,1194


In [20]:
def is_valid_article(title):
    """Return True unless ``title`` contains one of the excluded substrings.

    The check is a case-sensitive substring match anywhere in the title.
    """
    forbidden_title_parts = ('Wikipedia:', 'Template:', 'File:',
                             'Portal:', 'Category:', 'Draft:',
                             'List of', 'disambiguation')
    return not any(part in title for part in forbidden_title_parts)

In [21]:
# Count the 'wikipedia_page' records of the extraction file whose title passes
# is_valid_article(). Only lines with exactly two tab-separated fields count.
wikipedia_number_articles = 0

# The context manager makes sure the (large) extraction file is closed again.
with open(PATH_WIKIPEDIA_EXTRACTION, encoding="utf-8") as extraction_file:
    for line in extraction_file:
        parts = line.strip().split('\t')
        if len(parts) != 2 or parts[0] != 'wikipedia_page':
            continue
        if is_valid_article(parts[1]):
            wikipedia_number_articles += 1

In [22]:
# Report how many Wikipedia pages passed the title filter.
print("Wikipedia total number of articles:")
print("{:,}".format(wikipedia_number_articles))

Wikipedia total number of articles:
5,208,098


# Table 4: Source analysis

## ClueWeb

In [23]:
# Group the ClueWeb12-backed relations by hostname, registered domain and
# public suffix of the page they were extracted from.
hostname_to_relations = {}
domains_to_relations = {}
tld_to_relations = {}
domains_to_subdomains = {}

for relation in causality_graphs['clueweb12_sentence']:
    for source in relation['sources']:
        if source['type'] != 'clueweb12_sentence':
            continue

        url = source['payload']['clueweb12_page_reference']
        hostname = urlparse(url).netloc
        extracted = tldextract.extract(url)
        domain = extracted.registered_domain
        relation_key = str(relation['causal_relation'])

        hostname_to_relations.setdefault(hostname, []).append(relation_key)
        domains_to_subdomains.setdefault(domain, []).append(
            extracted.subdomain)
        domains_to_relations.setdefault(domain, []).append(relation_key)
        tld_to_relations.setdefault(extracted.suffix, []).append(relation_key)

In [24]:
sources = [hostname_to_relations, domains_to_subdomains,
           domains_to_relations, tld_to_relations]
# Collapse every collected list to its number of distinct entries. The dicts
# are updated in place because later cells refer to them by name.
for source_dict in sources:
    source_dict.update({key: len(set(values))
                        for key, values in source_dict.items()})

In [25]:
# Rank hostnames, domains and TLDs by their relation count, largest first.
by_count = operator.itemgetter(1)
sorted_hostname_to_relations = sorted(
    hostname_to_relations.items(), key=by_count, reverse=True)
sorted_domains_to_relations = sorted(
    domains_to_relations.items(), key=by_count, reverse=True)
sorted_tld_to_relations = sorted(
    tld_to_relations.items(), key=by_count, reverse=True)

In [26]:
# manually defined categories
columns = ['Hostname', 'Category', '|E|']
df_sources = pd.DataFrame(columns=columns).set_index(['Hostname'])
# One category per rank: entry i belongs to the i-th hostname of the ranking.
categories = ["Science", "Science", "Science", "Regional",
              "Science", "Science", "Reference", "Reference",
              "Reference", "Science", "Business", "News",
              "News", "Computers", "Reference"]
for i, category in enumerate(categories):
    hostname, relation_count = sorted_hostname_to_relations[i]
    # Every occurrence of 'www.' is removed, not only a leading one.
    df_sources.loc[str(hostname).replace('www.', '')] = [
        category, relation_count]

In [27]:
# Render the hostname table with thousands separators in the |E| column.
df_sources.style.format({'|E|': "{0:,}"})

Unnamed: 0_level_0,Category,|E|
Hostname,Unnamed: 1_level_1,Unnamed: 2_level_1
sdbonline.org,Science,26517
bionewsonline.com,Science,25212
jci.org,Science,16081
sec.gov,Regional,13907
plosone.org,Science,12722
molvis.org,Science,9544
neurotransmitter.net,Reference,8842
diseaseinformation.info,Reference,8829
leninist.biz,Reference,8033
lansbury.bwh.harvard.edu,Science,7828


In [28]:
# Number of distinct hostnames and rounded mean of their relation counts.
hostname_count = len(hostname_to_relations)
print("Total hostnames: " + f"{hostname_count:,}")
mean_relations = str(
    round(sum(hostname_to_relations.values()) / hostname_count))
print("Average relations: " + mean_relations)

Total hostnames: 842,698
Average relations: 18


In [29]:
columns = ['Hostname', 'Category', '|Subdomains|', '|E|']
df_sources = pd.DataFrame(columns=columns).set_index(['Hostname'])
# manually defined categories
# (entry i belongs to the i-th registered domain of the ranking)
categories = ["Science", "Society", "Society", "Society",
              "Regional", "Arts", "Reference", "Society",
              "Science", "Science", "Reference", "Regional",
              "Science", "Regional", "Reference"]
for i, category in enumerate(categories):
    domain, relation_count = sorted_domains_to_relations[i]
    df_sources.loc[domain] = [
        category,
        domains_to_subdomains[domain],
        relation_count,
    ]

In [30]:
# Render the domain table with thousands separators in both count columns.
df_sources.style.format({'|E|': "{0:,}", "|Subdomains|": "{0:,}"})

Unnamed: 0_level_0,Category,|Subdomains|,|E|
Hostname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
researchtoday.net,Science,302,125728
wordpress.com,Society,6835,91230
typepad.com,Society,5687,72357
hubpages.com,Society,4370,40473
nih.gov,Regional,368,40280
deviantart.com,Arts,20365,40064
about.com,Reference,828,36363
tripod.com,Society,1877,31131
sdbonline.org,Science,1,26517
bionewsonline.com,Science,1,25212


In [31]:
# Number of distinct registered domains and rounded mean of their relation
# counts. The label says "domains" because `domains_to_relations` is keyed by
# registered domain; the hostname totals are reported by an earlier cell.
domain_count = len(domains_to_relations)
print(f"Total domains: {domain_count:,}")
mean_relations = str(
    round(sum(domains_to_relations.values()) / domain_count))
print("Average relations: " + mean_relations)

Total hostnames: 635,861
Average relations: 22


In [32]:
# Table of the 15 public suffixes with the most relations.
df_sources = pd.DataFrame(columns=['TLD', '|E|']).set_index(['TLD'])
for suffix, relation_count in sorted_tld_to_relations[:15]:
    df_sources.loc[suffix] = relation_count

In [33]:
# Render the TLD table with thousands separators in the |E| column.
df_sources.style.format({'|E|': "{0:,}"})

Unnamed: 0_level_0,|E|
TLD,Unnamed: 1_level_1
com,5597297
org,2590683
net,793937
edu,766731
gov,320263
co.uk,229834
ca,185661
info,138519
org.uk,111697
ac.uk,85304


In [34]:
# Number of distinct public suffixes and rounded mean of their relation counts.
tld_count = len(tld_to_relations)
print(f"Total TLDs: {tld_count:,}")
mean_relations = round(sum(tld_to_relations.values()) / tld_count)
print(f"Average relations: {mean_relations:,}")

Total TLDs: 1,181
Average relations: 10,206


### Manual Categorization

In [35]:
# Load the manual hostname categorization. The CSV is read without a header
# row; its three columns are named here and the first one ("Hostname") becomes
# the index.
categorization = pd.read_csv(PATH_CATEGORIZATION,
                             header=None, index_col=0,
                             names=["Hostname", "|E|", "Category"])
# Preview the first rows with thousands separators in the |E| column.
categorization.head().style.format({'|E|': "{0:,}"})

Unnamed: 0_level_0,|E|,Category
Hostname,Unnamed: 1_level_1,Unnamed: 2_level_1
sdbonline.org,26517,Science
bionewsonline.com,25212,Science
jci.org,16081,Science
sec.gov,13907,Regional
plosone.org,12722,Science


In [36]:
def get_domains_of_category(category):
    """Return the index labels (hostnames) of all rows annotated ``category``.

    Reads the module-level ``categorization`` frame.
    """
    matching_rows = categorization.loc[categorization["Category"] == category]
    return matching_rows.index.values.tolist()

In [37]:
# Distinct annotated categories in first-seen order. dict.fromkeys is used
# instead of set() because set iteration order for strings changes between
# kernel sessions (hash randomisation), which made downstream row order
# irreproducible.
categories = list(dict.fromkeys(categorization["Category"].values.tolist()))
# Additional categories to include in the table; skipped when already
# annotated so that no category is listed twice.
for extra_category in ['Games', 'Home', 'Recreation', 'Sports']:
    if extra_category not in categories:
        categories.append(extra_category)

In [38]:
# Map every manually categorized hostname to the relations extracted from it.
url_to_graph = {}

# Build the lookup once; previously the index was materialised and scanned
# linearly for every single source. Downstream cells only use the set of
# relations per hostname, so duplicate index labels need no special handling.
annotated_hostnames = set(categorization.index.values.tolist())

for relation in causality_graphs['clueweb12_sentence']:
    for source in relation['sources']:
        if source['type'] != 'clueweb12_sentence':
            continue

        url = source['payload']['clueweb12_page_reference']
        # Every occurrence of 'www.' is removed, not only a leading one.
        hostname = urlparse(url).netloc.replace('www.', '')

        if hostname in annotated_hostnames:
            relation_idx = str(relation['causal_relation'])
            url_to_graph.setdefault(hostname, []).append(relation_idx)

In [39]:
# Gather, per category, the relations of all hostnames annotated with it.
# Categories without any matching hostname keep an empty list.
category_to_relations = {}
for category in categories:
    collected = category_to_relations.setdefault(category, [])
    for domain in get_domains_of_category(category):
        collected.extend(url_to_graph.get(domain, []))

In [40]:
# Replace each list of relations by its number of distinct entries.
for category, relation_keys in list(category_to_relations.items()):
    category_to_relations[category] = len(set(relation_keys))

In [41]:
columns = ['Category', '|Domains|', '|E|']
df_sources = pd.DataFrame(columns=columns).set_index(['Category'])

# Per category: number of annotated hostnames and number of relations.
for category, relation_count in category_to_relations.items():
    domain_count = len(get_domains_of_category(category))
    df_sources.loc[category] = [domain_count, relation_count]

In [42]:
# Categories with the most relations first; |E| with thousands separators.
df_sources.sort_values('|E|', ascending=False).style.format({'|E|': "{0:,}"})

Unnamed: 0_level_0,|Domains|,|E|
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Science,121,296330
Reference,118,240033
Health,84,147851
Society,80,129058
Regional,34,76754
Buisness,21,43900
News,11,33906
Computers,18,27319
Shopping,9,14078
Arts,3,8017


## Wikipedia

In [43]:
# Per (lower-cased) infobox template: the page ids, relations and infobox
# values it contributed.
template_sources = {}
template_relations = {}
template_values = {}

for relation in causality_graphs["wikipedia_infobox"]:
    for source in relation['sources']:
        if source['type'] != "wikipedia_infobox":
            continue
        payload = source['payload']
        page_id = payload['wikipedia_page_id']
        infobox_template = payload['infobox_template'].lower()

        template_sources.setdefault(infobox_template, []).append(page_id)
        relation_id = str(relation['causal_relation'])
        template_relations.setdefault(infobox_template, []).append(relation_id)

        # Identify an infobox value by the (page, argument, title) triple.
        # A tuple is used instead of concatenating the three strings, because
        # concatenation without a separator can map different triples to the
        # same key and so under-count the distinct values.
        ifbx_value_id = (page_id,
                         payload['infobox_argument'],
                         payload['infobox_title'])
        template_values.setdefault(infobox_template, []).append(ifbx_value_id)

In [44]:
columns = ['Infobox template', 'Articles', 'Values', '|E|']
df_sources = pd.DataFrame(columns=columns).set_index(['Infobox template'])

# Distinct articles, infobox values and relations per template.
for template, page_ids in template_sources.items():
    article_count = len(set(page_ids))
    value_count = len(set(template_values[template]))
    relation_count = len(set(template_relations[template]))
    df_sources.loc[template] = [article_count, value_count, relation_count]

In [45]:
# Templates with the most relations first; counts with thousands separators.
df_sources.sort_values('|E|', ascending=False).style.format({'|E|': "{0:,}", 'Values': "{0:,}"})

Unnamed: 0_level_0,Articles,Values,|E|
Infobox template,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
infobox medical condition (new),820,1695,4923
infobox civil conflict,579,581,1339
infobox rail accident,452,461,530
infobox event,380,384,495
infobox wildfire,257,257,306
infobox news event,146,146,170
infobox oil spill,35,35,36
infobox military conflict,23,23,32
infobox birth control,13,13,26
infobox bus accident,20,20,23


In [46]:
def articles_to_relations(graph_type):
    """Group the relations of a Wikipedia subgraph by source article.

    Reads ``causality_graphs[graph_type]`` and only considers sources whose
    type equals ``graph_type``. Prints the number of distinct articles.

    Args:
        graph_type: Key into ``causality_graphs``; also the source type used
            for filtering.

    Returns:
        A pair ``(relations_by_article, article_name_mapping)``: the first
        maps an integer page id to the set of stringified relations found on
        that page, the second maps the page id to the page title.
    """
    relations_by_article = {}
    article_name_mapping = {}

    for relation in causality_graphs[graph_type]:
        for source in relation['sources']:
            if source['type'] != graph_type:
                continue
            payload = source['payload']
            article_id = int(payload['wikipedia_page_id'])
            relations_by_article.setdefault(article_id, set()).add(
                str(relation['causal_relation']))
            article_name_mapping[article_id] = payload['wikipedia_page_title']

    print(f"Number Wikipedia Articles: {len(relations_by_article):,}")
    return relations_by_article, article_name_mapping

In [47]:
# Relations per article for the infobox subgraph (rebinds the shared globals).
relations_by_article, name_mapping = articles_to_relations('wikipedia_infobox')

Number Wikipedia Articles: 2,725


In [48]:
def sort_key(key):
    """Order articles by descending relation count, then by title."""
    return (-len(relations_by_article[key]), name_mapping[key])


index = ['Wikipedia article title (ibxs.)']
columns = index + ['|E|']
df_sources = pd.DataFrame(columns=columns).set_index(index)

# Keep the ten articles that contributed the most relations.
sorted_articles = sorted(relations_by_article, key=sort_key)
for article_id in sorted_articles[:10]:
    relation_count = len(relations_by_article[article_id])
    df_sources.loc[name_mapping[article_id]] = relation_count

In [49]:
# Show the top-10 infobox articles assembled in the previous cell.
df_sources

Unnamed: 0_level_0,|E|
Wikipedia article title (ibxs.),Unnamed: 1_level_1
2013 Romanian protests against the Roșia Montană Project,23
Shock (circulatory),19
Breast cancer,18
Constipation,17
Intracerebral hemorrhage,17
Protests against Donald Trump,17
Heat stroke,16
Scombroid food poisoning,16
Acute lymphoblastic leukemia,15
Bowel obstruction,15


In [50]:
# Relations per article for the list subgraph (rebinds the shared globals).
relations_by_article, name_mapping = articles_to_relations('wikipedia_list')

Number Wikipedia Articles: 1,194


In [51]:
def sort_key(key):
    """Order articles by descending relation count, then by title."""
    return (-len(relations_by_article[key]), name_mapping[key])


index = ['Wikipedia article title (lists)']
columns = index + ['|E|']
df_sources = pd.DataFrame(columns=columns).set_index(index)

# Keep the ten articles that contributed the most relations.
sorted_articles = sorted(relations_by_article, key=sort_key)
for article_id in sorted_articles[:10]:
    relation_count = len(relations_by_article[article_id])
    df_sources.loc[name_mapping[article_id]] = relation_count

In [52]:
# Show the top-10 list articles assembled in the previous cell.
df_sources

Unnamed: 0_level_0,|E|
Wikipedia article title (lists),Unnamed: 1_level_1
Flushing (physiology),58
Mast cell activation syndrome,56
Coarse facial features,50
Hypotonia,47
Autistic catatonia,46
Livedo reticularis,46
Pallor,43
Delayed puberty,42
Eosinophilic myocarditis,42
Intraparenchymal hemorrhage,42


In [53]:
# Relations per article for the sentence subgraph (rebinds the shared globals).
relations_by_article, name_mapping = articles_to_relations('wikipedia_sentence')

Number Wikipedia Articles: 427,893


In [54]:
def sort_key(key):
    """Order articles by descending relation count, then by title."""
    return (-len(relations_by_article[key]), name_mapping[key])


index = ['Wikipedia article title (texts)']
columns = index + ['|E|']
df_sources = pd.DataFrame(columns=columns).set_index(index)

# Keep the ten articles that contributed the most relations.
sorted_articles = sorted(relations_by_article, key=sort_key)
for article_id in sorted_articles[:10]:
    relation_count = len(relations_by_article[article_id])
    df_sources.loc[name_mapping[article_id]] = relation_count

In [55]:
# Show the top-10 sentence articles assembled in the previous cell.
df_sources

Unnamed: 0_level_0,|E|
Wikipedia article title (texts),Unnamed: 1_level_1
Effects of global warming on human health,98
Hepatitis,79
Horse colic,77
Safety of electronic cigarettes,72
Nutritional neuroscience,71
Causes of cancer pain,70
Dog health,69
Long-term effects of alcohol consumption,69
Famine,67
Progeroid syndromes,60


In [56]:
# Rounded mean number of distinct relations per article for the subgraph
# currently held in `relations_by_article`.
total_relations = sum(
    len(relations) for relations in relations_by_article.values())
mean_relations = round(total_relations / len(relations_by_article))
print(mean_relations)

2


# Table 5: Concepts and paths of CauseNet

In [57]:
class Node():
    """A concept together with its outgoing and incoming causal edges."""

    def __init__(self, concept):
        self.concept = concept
        # neighbour concept -> support of the edge
        self.outgoing = {}
        self.incoming = {}

    def add_incoming(self, concept, support):
        """Record (or overwrite) an edge ``concept -> self``."""
        self.incoming[concept] = support

    def add_outgoing(self, concept, support):
        """Record (or overwrite) an edge ``self -> concept``."""
        self.outgoing[concept] = support

    def degree_centrality(self, graph_size):
        """Return (out-degree + in-degree) / (graph_size - 1).

        A neighbour that is both a cause and an effect is counted twice.
        """
        degree = len(self.outgoing) + len(self.incoming)
        return degree/(graph_size-1)

In [58]:
class Graph():
    """Directed causal graph: concept name -> Node."""

    def __init__(self):
        self.concepts = {}
        # Result of the last most_central_nodes() call, or None when no valid
        # cached result exists.
        self.sorted = None
        # The `top_k` value the cached result was computed for.
        self._sorted_top_k = None

    def add_relation(self, causal_relation, support):
        """Add the edge cause -> effect with the given support.

        Args:
            causal_relation: Dict with ``['cause']['concept']`` and
                ``['effect']['concept']`` entries.
            support: Value stored on the edge in both endpoint nodes.
        """
        cause_concept = causal_relation['cause']['concept']
        effect_concept = causal_relation['effect']['concept']
        self.add_concepts(cause_concept, effect_concept)

        self.concepts[cause_concept].add_outgoing(effect_concept, support)
        self.concepts[effect_concept].add_incoming(cause_concept, support)
        # A new edge changes degrees, so a cached ranking is stale.
        self.sorted = None

    def add_concepts(self, cause_concept, effect_concept):
        """Create nodes for the two concepts if they do not exist yet."""
        for concept in (cause_concept, effect_concept):
            if concept not in self.concepts:
                self.concepts[concept] = Node(concept)
                # A new node changes the graph size used for centrality.
                self.sorted = None

    def _sort_by_centrality(self, node):
        # Sort key for (concept, Node) items: highest centrality first,
        # ties broken alphabetically by concept.
        return (-node[1].degree_centrality(len(self.concepts)),
                node[1].concept)

    def most_central_nodes(self, top_k=10):
        """Return the ``top_k`` (concept, Node) items with highest centrality.

        The result is cached. The cache is reused only for the same ``top_k``
        and is dropped whenever the graph is modified through add_relation()
        or add_concepts(); previously the first result was returned forever.
        """
        if self.sorted is not None and self._sorted_top_k == top_k:
            return self.sorted

        nodes = self.concepts.items()
        sorted_nodes = sorted(nodes, key=self._sort_by_centrality)
        self.sorted = sorted_nodes[:top_k]
        self._sorted_top_k = top_k
        return self.sorted

## Central Nodes

In [59]:
# Build one Graph per source type from the corresponding subgraph.
source_type_to_graphs = {}

for source_type in source_types:
    subgraph = Graph()
    for relation in causality_graphs[source_type]:
        subgraph.add_relation(relation['causal_relation'],
                              relation['support'])
    source_type_to_graphs[source_type] = subgraph

In [60]:
# For every source type, show the most central concepts with their out-degree,
# in-degree and degree centrality.
for source_type in source_types:
    print(source_type)
    columns = ['Concept', 'Out', 'In', 'Cent.']
    df_centrality = pd.DataFrame(columns=columns).set_index(['Concept'])
    graph = source_type_to_graphs[source_type]
    graph_size = len(graph.concepts)

    for concept, node in graph.most_central_nodes():
        df_centrality.loc[concept] = [len(node.outgoing),
                                      len(node.incoming),
                                      node.degree_centrality(graph_size)]

    df_centrality = df_centrality.astype({'Out': int, 'In': int})
    dataframe_format = {'Out': "{:,}", 'In': "{:,}", 'Cent.': "{:,.3f}"}
    styler = df_centrality.style.format(dataframe_format)
    # Styler.render() was deprecated in pandas 1.4 and removed in 2.0;
    # prefer to_html() and fall back to render() only on old versions.
    render_html = getattr(styler, 'to_html', None) or styler.render
    display(HTML(render_html()))

    print()

clueweb12_sentence


Unnamed: 0_level_0,Out,In,Cent.
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
problems,8077,64355,0.006
death,4485,44144,0.004
damage,3890,28301,0.003
pain,3668,23046,0.002
disease,11198,15175,0.002
injury,6681,14733,0.002
stress,10114,9077,0.002
changes,9155,9459,0.002
problem,2608,15975,0.002
symptoms,2720,14415,0.002



wikipedia_sentence


Unnamed: 0_level_0,Out,In,Cent.
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
death,689,4054,0.004
problems,412,2519,0.003
damage,340,1953,0.002
controversy,427,1729,0.002
disease,831,1019,0.002
events,1625,220,0.002
accident,778,1064,0.002
incident,1465,309,0.002
deaths,113,1361,0.001
success,811,632,0.001



wikipedia_list


Unnamed: 0_level_0,Out,In,Cent.
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fatigue,3,66,0.008
nausea,0,68,0.008
vomiting,1,59,0.007
flushing_(_physiology_),0,58,0.007
mast_cell_activation_syndrome,56,0,0.007
fever,5,50,0.007
hypotonia,1,53,0.007
tachycardia,4,50,0.007
coarse_facial_features,0,50,0.006
anxiety,35,14,0.006



wikipedia_infobox


Unnamed: 0_level_0,Out,In,Cent.
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
unknown,117,0,0.016
fever,4,103,0.015
lightning,80,0,0.011
family_history,73,0,0.01
under_investigation,68,0,0.009
vomiting,1,56,0.008
obesity,49,3,0.007
shortness_of_breath,0,52,0.007
arson,47,0,0.007
smoking,46,0,0.006





## Paths

In [61]:
def dfs(graph, node, support, length):
    """Enumerate all walks of exactly ``length`` edges starting at ``node``.

    Args:
        graph: Object whose ``concepts[name].outgoing`` maps target concept
            to edge support.
        node: Concept the walk starts at.
        support: Support attached to ``node`` in the resulting paths (the
            support of the edge that led to it).
        length: Number of edges still to follow.

    Returns:
        A list of paths; every path is a list of ``(concept, support)``
        tuples beginning with ``(node, support)``. Nodes may repeat.
    """
    here = (node, support)
    if length == 0:
        return [[here]]

    resulting_paths = []
    for target, target_support in graph.concepts[node].outgoing.items():
        if target_support <= 10:
            # prune for efficiency
            continue
        resulting_paths.extend(
            [here] + tail
            for tail in dfs(graph, target, target_support, length - 1)
        )
    return resulting_paths

In [62]:
def geometric_mean(numbers):
    """Return the geometric mean: the n-th root of the product of n numbers."""
    product = np.prod(numbers)
    exponent = 1.0/len(numbers)
    return np.power(product, exponent)

In [63]:
def paths_with_highest_support(graph, path_lengths, min_path_support):
    """Rank all paths of a fixed length by the geometric mean of edge support.

    Args:
        graph: Graph whose ``concepts`` are used as start nodes for dfs().
        path_lengths: Number of edges per path.
        min_path_support: Only paths whose support is strictly greater than
            this value are kept.

    Returns:
        A list of ``(path_string, path_support)`` pairs sorted by descending
        support, ties broken by the path string. ``path_string`` joins the
        concepts with ``' -> '``.
    """
    path_dict = {}

    for start_concept in graph.concepts:
        # The start node gets support 0, which the `> 0` filter below removes.
        for path in dfs(graph, start_concept, 0, path_lengths):
            key = ' -> '.join(concept for concept, _ in path)
            edge_support_values = [edge_support for _, edge_support in path
                                   if edge_support > 0]
            path_support = geometric_mean(edge_support_values)

            if path_support > min_path_support:
                # store only relevant paths
                path_dict[key] = path_support

    return sorted(path_dict.items(), key=lambda item: (-item[1], item[0]))

In [64]:
# Graph over the complete CauseNet; edge weights are the relation supports.
graph_causenet = Graph()
for relation in causenet:
    causal_relation = relation['causal_relation']
    graph_causenet.add_relation(causal_relation, relation['support'])

In [65]:
# Ranked paths with 1, 2 and 3 edges. The last argument is the minimum path
# support (geometric mean of edge supports) a path must exceed to be kept.
paths_length_1 = paths_with_highest_support(graph_causenet, 1, 25)
paths_length_2 = paths_with_highest_support(graph_causenet, 2, 20)
paths_length_3 = paths_with_highest_support(graph_causenet, 3, 18)

In [66]:
def print_paths():
    """Display up to ten node-disjoint, loop-free paths per path length.

    Reads the ranked lists ``paths_length_1``, ``paths_length_2`` and
    ``paths_length_3`` and shows, for each, a table of the best paths in
    ranking order. A path is skipped when it visits a concept twice or shares
    a concept with a path already shown for the same length. If fewer than
    ten paths qualify, the table simply contains fewer rows.
    """
    paths = [paths_length_1, paths_length_2, paths_length_3]

    for path_length, ranked_paths in enumerate(paths):
        nodes = set()

        print("Path-length: " + str(path_length+1))

        columns = ['Cause']
        columns += [f'Mediator {i+1}' for i in range(path_length)]
        columns += ['Effect', 'Support']

        df_paths = pd.DataFrame(columns=columns).set_index(columns[:-1])

        printed_results = 0
        # Iterating over the list (instead of indexing with an unbounded
        # counter) avoids an IndexError when the candidates run out.
        for path, path_support in ranked_paths:
            if printed_results >= 10:
                break

            nodes_in_path = [x.strip() for x in path.split("->")]

            if len(set(nodes_in_path)) < path_length + 2:
                # skip loops
                continue

            if any(node in nodes for node in nodes_in_path):
                # only node-disjoint paths for better overview
                continue

            printed_results += 1

            support = str(int(round(path_support)))
            df_paths.loc[tuple(nodes_in_path), 'Support'] = support
            nodes.update(nodes_in_path)

        styler = df_paths.style.format({'Support': '{:}'})
        # Styler.render() was deprecated in pandas 1.4 and removed in 2.0;
        # prefer to_html() and fall back to render() only on old versions.
        render_html = getattr(styler, 'to_html', None) or styler.render
        display(HTML(render_html()))
        print()
In [67]:
# Render the path tables for path lengths 1 to 3.
print_paths()

Path-length: 1


Unnamed: 0_level_0,Unnamed: 1_level_0,Support
Cause,Effect,Unnamed: 2_level_1
accident,death,38
drought,famine,31
injury,pain,31
disease,deaths,30
smoking,lung_cancer,30
stress,illness,30
depression,suicide,28
anxiety,insomnia,27
bacteria,infection,27
diarrhea,dehydration,27



Path-length: 2


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Support
Cause,Mediator 1,Effect,Unnamed: 3_level_1
stress,illness,death,33
accident,injury,pain,31
exposure,disease,deaths,28
bacteria,infection,inflammation,26
obesity,diabetes,blindness,24
anxiety,depression,suicide,24
global_warming,drought,famine,24
diarrhea,dehydration,headaches,23
lightning,fire,damage,22
negligence,injuries,disability,21



Path-length: 3


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Support
Cause,Mediator 1,Mediator 2,Effect,Unnamed: 4_level_1
negligence,accident,injury,death,29
bacteria,infection,disease,deaths,27
inflammation,pain,depression,suicide,26
fear,stress,illness,disability,23
greenhouse_gases,global_warming,drought,famine,23
lack_of_exercise,obesity,diabetes,blindness,23
lightning,fire,damage,cancer,20
virus,diarrhea,dehydration,headaches,20
anemia,fatigue,accidents,injuries,19
alcohol,problems,anxiety,insomnia,19



