# Preamble

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
from datetime import datetime
from json import JSONDecodeError
import json
import bz2
import gzip
import io

In [2]:
PATH_CAUSENET = "../../data/causality-graphs/causenet-full.jsonl.bz2"
PATH_FREEBASE = "../../data/external/knowledge-bases/freebase-rdf-latest.gz"
PATH_CONCEPTNET = "../../data/external/knowledge-bases/conceptnet-assertions-5.6.0.tsv"
PATH_WIKIDATA = "../../data/external/knowledge-bases/wikidata-20181001-all.json.bz2"

# Table 1: Overview of causal relations in knowledge bases

## CauseNet

In [3]:
def load_jsonl(path):
    print("Loading... " + path)
    lines = []
    document = bz2.open(path, mode='rt')
    for line in document:
        lines.append(json.loads(line))
    return lines

In [4]:
def belongs_to_high_precision_causenet(sample):
    if sample['support'] > 1:
        return True

    for source in sample['sources']:
        if source['type'] == 'wikipedia_infobox':
            return True

        if source['type'] == 'wikipedia_list':
            return True

    return False

In [5]:
def print_statistics(causality_graph):
    nodes = []
    for sample in causality_graph:
        nodes.append(sample['causal_relation']['cause']['concept'])
        nodes.append(sample['causal_relation']['effect']['concept'])

    print(f'Relations: {len(causality_graph):,}')
    print(f'Concepts: {len(set(nodes)):,}')

In [None]:
causenet = load_jsonl(PATH_CAUSENET)

Loading... ../../data/causality-graphs/causenet-full.jsonl.bz2


In [7]:
for relation in causenet:
    patterns = []
    for source in relation['sources']:
        if 'path_pattern' in source['payload']:
            patterns.append(source['payload']['path_pattern'])
    relation['support'] = len(set(patterns))

In [8]:
causenet_precision = []

for sample in causenet:
    if belongs_to_high_precision_causenet(sample):
        causenet_precision.append(sample)

In [9]:
print("CauseNet:")
print_statistics(causenet)
print()
print("CauseNet-Precision:")
print_statistics(causenet_precision)

CauseNet:
Relations: 11,609,890
Concepts: 12,186,310

CauseNet-Precision:
Relations: 197,806
Concepts: 80,223


## Freebase

In [3]:
freebase_causal_properties = [
    'medicine.disease.symptoms>',
    'medicine.symptom.symptom_of>',
    'medicine.disease.risk_factors>',
    'medicine.risk_factor.diseases>',
    'medicine.disease.causes>',
    'medicine.disease_cause.diseases>',
    'medicine.drug.physiologic_effect>',
    'medicine.drug_physiologic_effect.drugs_with_this_physiologic_effect>',
    'base.pethealth.symptom.symptom_of>',
    'base.pethealth.pet_disease_or_medical_condition.symptoms>',
    'medicine.symptom.side_effect_of>',
    'medicine.medical_treatment.side_effects>',
    'base.wordnet.synset.causes>',
    'base.wordnet.synset.caused_by>',
    'base.pethealth.pet_disease_risk_factor.' +
    'pet_diseases_with_this_risk_factor>',
    'base.pethealth.pet_disease_or_medical_condition.risk_factors>',
    'base.pethealth.cause.pet_diseases_or_conditions_caused>',
    'base.horsefacts.coat_locus_effect.coat_colors>',
    'base.horsefacts.coat_color.causative_locus>',
    'base.pethealth.pet_disease_or_medical_condition.causes>',
    'base.disaster2.rail_accident.cause>',
    'base.disaster2.train_accident_cause.train_accidents_caused_this_way>',
    'biology.plant_disease_cause.plant_disease_triangle>',
    'biology.plant_disease_triangle.plant_disease_cause>',
    'base.disaster2.injury_causing_event.injury>',
    'base.disaster2.injury.caused_by_event>',
    'base.animalpathology.animal_disease_cause.animal_disease_triangle>',
    'base.animalpathology.animal_disease_triangle.animal_disease_cause>',
    'base.fires.explosion.cause>',
    'base.fires.explosion_cause.explosion>',
    'base.horsefacts.coat_locus.effect>',
    'base.horsefacts.coat_locus_effect.locus>',
    'base.fires.fires.firecause>',
    'user.skud.fictional_diseases.fictional_disease.symptoms>',
    'base.fires.fire_cause.fires_caused_this_way>',
    'user.skud.fictional_diseases.fictional_symptom.symptom_of>',
    'user.lindajohnson.default_domain.side_effects.side_effect>',
    'base.qualia.disability.disability_causing_medical_condition>',
    'user.robert.earthquakes.earthquake_effect.earthquake>',

    'people.deceased_person.cause_of_death>',
    'people.cause_of_death.people>',
    'people.cause_of_death.includes_causes_of_death>',
    'base.disaster2.death_causing_event.person_killed>',
    'base.fictionaluniverse.deceased_fictional_character.cause_of_death>',
    'base.disaster2.type_of_injury_causing_event.injuries_caused_this_way>',
    'base.disaster2.shipwreck_event.cause>',
    'base.disaster2.shipwreck_cause.ships_wrecked_this_way>',
    'media_common.cause_of_loss.works_lost_this_way>',
    'base.damsbase.dam_failure.cause_of_failure>',
    'user.teeler.default_domain.death_euphemism.related_causes>'
]

prefix = "<http://rdf.freebase.com/ns/"

freebase_causal_properties = [prefix + p for p in freebase_causal_properties]

In [4]:
def load_freebase(causal_properties):
    gz = gzip.open(PATH_FREEBASE, 'rb')
    causal_relations = {}

    for line in io.BufferedReader(gz):
        line = line.decode("utf-8").strip()
        s, p, o, _ = line.split("\t")

        if p in causal_properties:
            causal_relations.setdefault(p, []).append(line)
    return causal_relations

In [5]:
def get_freebase_statistics(freebase_causality, causal_properties):
    causality_graph = []

    for causal_property in causal_properties:

        for relation in freebase_causality[causal_property]:
            relation = relation.split("\t")
            relation = (relation[0], causal_property, relation[2])

            causality_graph.append(relation)

    nodes = []

    for relation in causality_graph:
        nodes.append(relation[0])
        nodes.append(relation[1])

    print(f'Relations: {len(set([str(x) for x in causality_graph])):,}')
    print(f'Concepts: {len(set(nodes)):,}')

In [6]:
freebase_causality = load_freebase(freebase_causal_properties)

In [7]:
print("Freebase:")
get_freebase_statistics(freebase_causality, freebase_causal_properties)

Freebase:
Relations: 128,766
Concepts: 52,487


## ConceptNet

In [3]:
def load_conceptnet():
    conceptnet = open(PATH_CONCEPTNET).readlines()
    conceptnet_triples = []

    for row in conceptnet:
        elements = row.split("\t")
        triple = (elements[2], elements[1], elements[3])
        conceptnet_triples.append(triple)

    return conceptnet_triples

In [4]:
def count_nodes(relation_list):
    nodes = []
    for relation in relation_list:
        nodes.append(relation[0])
        nodes.append(relation[2])
    return len(set(nodes))

In [5]:
conceptnet = load_conceptnet()

In [6]:
en_conceptnet = [t for t in conceptnet if '/en/' in t[0] and '/en/' in t[2]]
en_conceptnet = set([str(t) for t in en_conceptnet])

In [7]:
causal_properties = ['/r/CausesDesire', '/r/Causes']
causal_triples = set([t for t in conceptnet if t[1] in causal_properties])
en_causal_triples = set([t for t in causal_triples if str(t) in en_conceptnet])

In [8]:
print("ConceptNet Multilingual:")
print("Relations: " + f'{len(causal_triples):,}')
print("Concepts: " + f'{count_nodes(causal_triples):,}')
print()
print("ConceptNet English:")
print("Relations: " + f'{len(en_causal_triples):,}')
print("Concepts: " + f'{count_nodes(en_causal_triples):,}')

ConceptNet Multilingual:
Relations: 114,308
Concepts: 57,561

ConceptNet English:
Relations: 21,485
Concepts: 16,432


## Wikidata

In [3]:
wikidata_causal_predicates = [
    'P509',  # cause of death
    'P780',  # symptoms
    'P828',  # has cause
    'P1542',  # has effect
    'P770',  # cause of destruction
    'P1478',  # has immediate cause
    'P1479',  # has contributing factor
    'P1534',  # end cause
]

In [4]:
def load_wikidata_causality(wikidata_causal_predicates):
    causal_wikidata = []

    for line in bz2.open(PATH_WIKIDATA, mode='rt'):
        try:
            item = json.loads(line.strip()[:-1])
        except JSONDecodeError:
            continue

        for wikidata_property in item['claims'].keys():
            if wikidata_property in wikidata_causal_predicates:
                for snack in item['claims'][wikidata_property]:
                    if 'datavalue' not in snack['mainsnak']:
                        continue
                    value = snack['mainsnak']['datavalue']['value']
                    if 'id' not in value:
                        continue
                    wikidata_object = value['id']
                    relation = (item['id'], wikidata_property, wikidata_object)
                    causal_wikidata.append(relation)
    return causal_wikidata

In [None]:
wikidata_causality = load_wikidata_causality(wikidata_causal_predicates)

In [8]:
wikidata_cause_of_death = [relation 
                           for relation in wikidata_causality
                           if relation[1] == 'P509']

In [7]:
nodes = []
for relation in wikidata_causality:
    nodes.append(relation[0])
    nodes.append(relation[2])

In [8]:
print("Wikidata:")
print(f'Relations: {len(set(wikidata_causality)):,}')
print(f'Concepts: {len(set(nodes)):,}')

Wikidata:
Relations: 95,335
Concepts: 88,233


In [16]:
cause_of_death = len(set(wikidata_cause_of_death))
cause_of_death /= len(set(wikidata_causality))
cause_of_death = round(cause_of_death,3)
print("Percentage of cause of death relations:")
print(f"{cause_of_death}")

Percentage of cause of death relations:
0.847


## DBpedia Live

In [31]:
dbpedia_live = SPARQLWrapper("http://live.dbpedia.org/sparql")

In [32]:
def send_query(endpoint, query):
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    results = endpoint.query().convert()
    return results

In [33]:
def full_graph_query(endpoint, predicates):
    all_relations = []
    all_nodes = []

    for predicate in predicates:
        query = """
        SELECT ?s ?o WHERE { ?s <""" + predicate + """> ?o}
        """
        results = send_query(endpoint, query)

        for result in results['results']['bindings']:
            relation_subject = result['s']['value']
            relation_object = result['o']['value']
            all_relations.append((relation_subject, predicate, relation_object))
            all_nodes.append(relation_subject)
            all_nodes.append(relation_object)

    return all_relations, all_nodes

In [34]:
# defined by systematically searching DBpedia properties
causal_predicates = [
    "http://dbpedia.org/property/cause",
    "http://dbpedia.org/property/causes",
    "http://dbpedia.org/ontology/deathCause",
    "http://dbpedia.org/ontology/medicalCause",
    "http://dbpedia.org/property/causeOfDeath",
    "http://dbpedia.org/property/causalAgents",
    "http://dbpedia.org/property/causeDeath",
    "http://dbpedia.org/property/causeofdeath",
    "http://dbpedia.org/property/effects",
    "http://dbpedia.org/ontology/symptom",
]

relations, nodes = full_graph_query(dbpedia_live, causal_predicates)

In [35]:
cause_of_death_predicates = [causal_predicates[i] 
                             for i in [2,4,6,7]]
cause_of_death_relations = [relation
                            for relation in relations
                            if relation[1] in cause_of_death_predicates]

In [36]:
print(f"DBpedia Live ({datetime.now()}):")
print("Relations: " + f'{len(set(relations)):,}')
print("Concepts: " + f'{len(set(nodes)):,}')

DBpedia Live (2020-08-15 07:48:19.913180):
Relations: 8,025
Concepts: 7,691


In [37]:
len(set(cause_of_death_relations))
cause_of_death = len(set(cause_of_death_relations))
cause_of_death /= len(set(relations))
cause_of_death = round(cause_of_death,3)
print("Percentage of cause of death relations:")
print(f"{cause_of_death}")

Percentage of cause of death relations:
0.524
