In [1]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef
from collections import defaultdict 

In [2]:
def parse_graph(file):
    g = Graph()
    g.parse(file, format='turtle')
    
    return g

def create_namespace(graph, namespace, prefix):

    ns = Namespace(namespace)
    graph.namespace_manager.bind(prefix, namespace)
    
    return ns

### Complete Graph Stats

In [3]:
complete_g = parse_graph("./Parsing/outputs/complete_graph.ttl")
mesh_g = parse_graph("./mesh/mesh_graph.ttl")
hyp_g = parse_graph("./Parsing/outputs/hypothesis_main_graph.ttl")
bnode_g = parse_graph("./Parsing/outputs/bnode_graph.ttl")

# number of relations
complete_rels = len(list(complete_g.predicates())) 
print("number relations:", complete_rels)

# number mesh concepts
total_mesh = len(list(mesh_g.subjects())) 
print("total mesh concepts:", total_mesh)

#number paper/hypothesis/provenance nodes
hyp_namespace = create_namespace(complete_g, "http://example.org/hypothesis_ontology/", 'hyp')
total_paper = []
total_provenance = []
total_hypothesis = []

for s,p,o in complete_g.triples( (None, None, hyp_namespace.Paper) ):
    total_paper.append(s)
print("total paper concepts:", len(total_paper))

for s,p,o in complete_g.triples( (None, None, hyp_namespace.Provenance) ):
    total_provenance.append(s)
print("total provenance concepts:", len(total_provenance))

for s,p,o in complete_g.triples( (None, None, hyp_namespace.Hypothesis) ):
    total_hypothesis.append(s)
print("total hypothesis concepts:", len(total_hypothesis))

#number named entity nodes
prov_namespace = create_namespace(complete_g, 'http://www.w3.org/TR/prov-o/', 'prov')
total_ne = []
for s,p,o in complete_g.triples( (None, None, prov_namespace.Entity) ):
    total_ne.append(s)
print("total Named Entity concepts:", len(total_ne))

number relations: 127320
total mesh concepts: 12945
total paper concepts: 1222
total provenance concepts: 1073
total hypothesis concepts: 1221
total Named Entity concepts: 10000


In [4]:
complete_g = parse_graph("./Parsing/outputs/complete_graph.ttl")
mesh_g = parse_graph("./mesh/mesh_graph.ttl")
hyp_g = parse_graph("./Parsing/outputs/hypothesis_main_graph.ttl")
bnode_g = parse_graph("./Parsing/outputs/bnode_graph.ttl")

# number of relations
complete_rels = len(list(complete_g.predicates())) 
print("number relations:", complete_rels)

# number mesh concepts
total_mesh = len(list(mesh_g.subjects())) 
print("total mesh concepts:", total_mesh)

#number paper/hypothesis/provenance nodes
hyp_namespace = create_namespace(hyp_g, "http://example.org/hypothesis_ontology/", 'hyp')
total_paper = []
total_provenance = []
total_hypothesis = []

for s,p,o in hyp_g.triples( (None, None, hyp_namespace.Paper) ):
    total_paper.append(s)
print("total paper concepts:", len(total_paper))

for s,p,o in hyp_g.triples( (None, None, hyp_namespace.Provenance) ):
    total_provenance.append(s)
print("total provenance concepts:", len(total_provenance))

for s,p,o in hyp_g.triples( (None, None, hyp_namespace.Hypothesis) ):
    total_hypothesis.append(s)
print("total hypothesis concepts:", len(total_hypothesis))

#number named entity nodes
prov_namespace = create_namespace(bnode_g, 'http://www.w3.org/TR/prov-o/', 'prov')
total_ne = []
for s,p,o in bnode_g.triples( (None, None, prov_namespace.Entity) ):
    total_ne.append(s)
print("total Named Entity concepts:", len(total_ne))

number relations: 127320
total mesh concepts: 12945
total paper concepts: 1218
total provenance concepts: 1071
total hypothesis concepts: 1218
total Named Entity concepts: 10000


### Evaluation Links

In [5]:
kw_file = "./Parsing/outputs/hypothesis-keywords-graph.ttl"
ne_links_file = "./Linking/skos_same_abstract_contain.ttl"
mesh_links_file = "./Linking/mesh_links_contain.ttl"

In [8]:
kw_g = parse_graph(kw_file)
ne_links_g = parse_graph(ne_links_file)
mesh_links_g = parse_graph(mesh_links_file)

In [9]:
oa_namespace = create_namespace(kw_g, "http://www.w3.org/ns/oa#", 'oa')
skos_namespace = create_namespace(mesh_links_g, "http://www.w3.org/2004/02/skos/core#", 'skos')
skos_namespace = create_namespace(ne_links_g, "http://www.w3.org/2004/02/skos/core#", 'skos')

In [10]:
keywords = []
for s,p,o in kw_g.triples( (None, oa_namespace.hasTarget, None) ):
    keywords.append(s)
    
print("There are", len(keywords), "keywords")

There are 7421 keywords


In [11]:
mesh_file_keywords = []
for s,p,o in mesh_links_g.triples( (None, skos_namespace.related, None) ):
    mesh_file_keywords.append(s)
print("There are", len(set(mesh_file_keywords)), "keywords linked with a mesh concept")

There are 1167 keywords linked with a mesh concept


In [12]:
ne_file_keywords = []
for s,p,o in ne_links_g.triples( (None, skos_namespace.related, None) ):
    ne_file_keywords.append(s)
print("There are", len(set(ne_file_keywords)), "keywords linked with a named entity fishing concept")

There are 1413 keywords linked with a named entity fishing concept


In [13]:
# len(list(ne_g.predicates()))

In [14]:
ne_link_count = defaultdict(int)
mesh_link_count = defaultdict(int)

In [15]:
for s,p,o in ne_links_g.triples( (None, skos_namespace.related, None) ):
    ne_link_count[s] += 1

for s,p,o in mesh_links_g.triples( (None, skos_namespace.related, None) ):
    mesh_link_count[s] += 1

In [20]:
mesh_keys = set(mesh_link_count.keys())
ne_keys = set(ne_link_count.keys())

In [21]:
kw_link_count = dict()

for kw in keywords:
    kw_link_count[kw] = []
    if kw in mesh_keys:
        kw_link_count[kw].append(mesh_link_count[kw])
    else:
        kw_link_count[kw].append(0)
    if kw in ne_keys:
        kw_link_count[kw].append(ne_link_count[kw])
    else:
        kw_link_count[kw].append(0)

Convert the dictionary above to data frame to find some statistics

In [23]:
# convert the dictionary above to data frame to find some statistics
df = pd.DataFrame(kw_link_count,index=['mesh', 'ne'])
new_df = df.T

In [25]:
print("Median of mesh links:", new_df['mesh'].value_counts().median())

Median of mesh links: 8.0


In [26]:
print("Median of named entity links:", new_df['ne'].value_counts().median())

Median of named entity links: 38.0


Check if the amount of keywords in graph equals the amount of keywords that have a link.

In [68]:
set(kw_link_count.keys())==set(keywords)

True