In [1]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef, RDFS
from collections import defaultdict 

In [2]:
def parse_graph(file):
    """Read a Turtle file into a fresh rdflib ``Graph``.

    Args:
        file: Source handed to ``Graph.parse`` (the notebook passes file paths).

    Returns:
        The newly created ``Graph`` after parsing ``file`` as Turtle.
    """
    turtle_graph = Graph()
    turtle_graph.parse(source=file, format='turtle')
    return turtle_graph

def create_namespace(graph, namespace, prefix):
    """Build a ``Namespace`` for ``namespace`` and bind ``prefix`` to it on ``graph``.

    Args:
        graph: Graph whose ``namespace_manager`` receives the prefix binding.
        namespace: Base URI string of the namespace.
        prefix: Short prefix to associate with ``namespace``.

    Returns:
        The ``Namespace`` object created from ``namespace``.
    """
    bound_ns = Namespace(namespace)
    # The binding is registered with the raw URI string, not the Namespace object.
    graph.namespace_manager.bind(prefix, namespace)
    return bound_ns

### Complete Graph Stats

In [3]:
# Load the four Turtle graphs (parsed in this order).
complete_g, mesh_g, hyp_g, bnode_g = (
    parse_graph(ttl_path)
    for ttl_path in (
        "./Parsing/outputs/complete_graph.ttl",
        "./mesh/mesh_graph.ttl",
        "./Parsing/outputs/hypothesis_main_graph.ttl",
        "./Parsing/outputs/bnode_graph.ttl",
    )
)


def _subjects_with_object(graph, target):
    """Collect the subject of every triple in ``graph`` whose object is ``target``.

    The predicate is left unconstrained, so any property pointing at ``target`` matches.
    """
    return [subj for subj, _, _ in graph.triples((None, None, target))]


# number of relations
# NOTE(review): every item yielded by predicates() is counted without de-duplication,
# so this is presumably a per-triple count rather than distinct relation types - confirm.
complete_rels = sum(1 for _ in complete_g.predicates())
print("number relations:", complete_rels)

# number mesh concepts
# NOTE(review): same caveat as above - subjects() items are counted without de-duplication.
total_mesh = sum(1 for _ in mesh_g.subjects())
print("total mesh concepts:", total_mesh)

# number paper/hypothesis/provenance nodes
hyp_namespace = create_namespace(complete_g, "http://example.org/hypothesis_ontology/", 'hyp')

total_paper = _subjects_with_object(complete_g, hyp_namespace.Paper)
print("total paper instances:", len(total_paper))

total_provenance = _subjects_with_object(complete_g, hyp_namespace.Provenance)
print("total provenance instances:", len(total_provenance))

total_hypothesis = _subjects_with_object(complete_g, hyp_namespace.Hypothesis)
print("total hypothesis instances:", len(total_hypothesis))

# number named entity nodes
prov_namespace = create_namespace(complete_g, 'http://www.w3.org/TR/prov-o/', 'prov')
total_ne = _subjects_with_object(complete_g, prov_namespace.Entity)
print("total Named Entity instances:", len(total_ne))

number relations: 140265
total mesh concepts: 12945
total paper instances: 1222
total provenance instances: 1073
total hypothesis instances: 1221
total Named Entity instances: 10000


In [4]:
# complete_g = parse_graph("./Parsing/outputs/complete_graph.ttl")
# mesh_g = parse_graph("./mesh/mesh_graph.ttl")
# hyp_g = parse_graph("./Parsing/outputs/hypothesis_main_graph.ttl")
# bnode_g = parse_graph("./Parsing/outputs/bnode_graph.ttl")

# # number of relations
# complete_rels = len(list(complete_g.predicates())) 
# print("number relations:", complete_rels)

# # number mesh concepts
# total_mesh = len(list(mesh_g.subjects())) 
# print("total mesh concepts:", total_mesh)

# #number paper/hypothesis/provenance nodes
# hyp_namespace = create_namespace(hyp_g, "http://example.org/hypothesis_ontology/", 'hyp')
# total_paper = []
# total_provenance = []
# total_hypothesis = []

# for s,p,o in hyp_g.triples( (None, None, hyp_namespace.Paper) ):
#     total_paper.append(s)
# print("total paper concepts:", len(total_paper))

# for s,p,o in hyp_g.triples( (None, None, hyp_namespace.Provenance) ):
#     total_provenance.append(s)
# print("total provenance concepts:", len(total_provenance))

# for s,p,o in hyp_g.triples( (None, None, hyp_namespace.Hypothesis) ):
#     total_hypothesis.append(s)
# print("total hypothesis concepts:", len(total_hypothesis))

# #number named entity nodes
# prov_namespace = create_namespace(bnode_g, 'http://www.w3.org/TR/prov-o/', 'prov')
# total_ne = []
# for s,p,o in bnode_g.triples( (None, None, prov_namespace.Entity) ):
#     total_ne.append(s)
# print("total Named Entity concepts:", len(total_ne))

### Evaluation Links

In [4]:
# Input Turtle files for the link evaluation:
# keyword annotations, named-entity links and MeSH links.
kw_file, ne_links_file, mesh_links_file = (
    "./Parsing/outputs/hypothesis-keywords-graph.ttl",
    "./Linking/skos_same_abstract_contain.ttl",
    "./Linking/mesh_links_contain.ttl",
)

In [5]:
# Parse the three evaluation files (same order as before: keywords, NE links, MeSH links).
kw_g, ne_links_g, mesh_links_g = (
    parse_graph(ttl_path) for ttl_path in (kw_file, ne_links_file, mesh_links_file)
)

In [6]:
# Bind the 'oa' prefix on the keyword graph.
oa_namespace = create_namespace(kw_g, "http://www.w3.org/ns/oa#", 'oa')

# Bind the 'skos' prefix on both link graphs; the namespace object is the same
# for each call, so the last assignment simply wins.
for _links_graph in (mesh_links_g, ne_links_g):
    skos_namespace = create_namespace(_links_graph, "http://www.w3.org/2004/02/skos/core#", 'skos')

In [7]:
# A keyword is the subject of an oa:hasTarget triple in the keyword graph
# (one list entry per matching triple).
keywords = [
    subj for subj, _, _ in kw_g.triples((None, oa_namespace.hasTarget, None))
]

print("There are", len(keywords), "keywords")

There are 7421 keywords


### Get the total number of links per keyword for both datasets and collect them in a dictionary

In [8]:
# Per-keyword link tallies; unseen keys start at 0.
ne_link_count, mesh_link_count = defaultdict(int), defaultdict(int)

In [9]:
# Tally, per subject, the number of skos:related triples in each link graph.
_related_pattern = (None, skos_namespace.related, None)

for _links_graph, _tally in ((ne_links_g, ne_link_count), (mesh_links_g, mesh_link_count)):
    for subj, pred, obj in _links_graph.triples(_related_pattern):
        _tally[subj] += 1

In [10]:
# Inspect the per-keyword MeSH link tallies (displays the whole dict).
mesh_link_count

defaultdict(int,
            {rdflib.term.URIRef('http://example.org/hypothesis_ontology/50d5965d-86a5-4dab-a2be-2ed66b70a56f#keyword'): 281,
             rdflib.term.URIRef('http://example.org/hypothesis_ontology/ce2a39ed-7d36-419f-9dd0-c1ee9251a537#keyword'): 88,
             rdflib.term.URIRef('http://example.org/hypothesis_ontology/f7673d6b-c9f0-4d7d-8eae-18a23758f1da#keyword'): 88,
             rdflib.term.URIRef('http://example.org/hypothesis_ontology/9af60d75-60e0-4b18-8177-e1c44c6e9f17#keyword'): 72,
             rdflib.term.URIRef('http://example.org/hypothesis_ontology/6f8fdf62-8433-4bab-ada9-0ba67b6efe13#keyword'): 88,
             rdflib.term.URIRef('http://example.org/hypothesis_ontology/c58ad29d-2a5b-4f2f-8fb0-b305bf832268#keyword'): 73,
             rdflib.term.URIRef('http://example.org/hypothesis_ontology/29077376-9338-4030-841f-73013233156a#keyword'): 73,
             rdflib.term.URIRef('http://example.org/hypothesis_ontology/aaff1455-767c-4a90-96cd-ad2416bbc79b#keywo

In [11]:
# Keywords that have at least one tallied link of each kind.
mesh_keys = set(mesh_link_count)
ne_keys = set(ne_link_count)

print("There are", len(mesh_keys), "keywords linked with a mesh concept")
print("There are", len(ne_keys), "keywords linked with a named entity fishing concept")

There are 1167 keywords linked with a mesh concept
There are 1413 keywords linked with a named entity fishing concept


In [13]:
# Map every keyword to a two-element list [mesh link count, named-entity link count];
# a keyword with no links of a given kind gets 0 in that position.
kw_link_count = {
    kw: [
        mesh_link_count[kw] if kw in mesh_keys else 0,
        ne_link_count[kw] if kw in ne_keys else 0,
    ]
    for kw in keywords
}

Convert the dictionary above to a DataFrame to compute some statistics.

In [14]:
# Turn the keyword -> [mesh, ne] mapping into a DataFrame:
# one column per keyword, with the two rows labelled 'mesh' and 'ne'.
df = pd.DataFrame(data=kw_link_count, index=['mesh', 'ne'])

In [15]:
# Transpose so that each row is a keyword and the columns are 'mesh' and 'ne'.
new_df = df.transpose()

In [16]:
# value_counts() gives how often each link count occurs, so this prints the median
# of those frequencies - not the median number of links per keyword
# (that would be new_df['mesh'].median()).
# NOTE(review): confirm which statistic is intended before reporting this number.
print("Median of mesh links:", new_df.loc[:, 'mesh'].value_counts().median())

Median of mesh links: 8.0


In [17]:
# Frequency table: number of keywords for each distinct MeSH link count.
new_df['mesh'].value_counts()

0      6254
1       600
2       161
3       134
72       26
6        26
4        22
23       19
83       19
88       19
30       13
9        13
7        12
19       11
14       10
5        10
24        9
13        8
8         8
15        7
281       6
33        6
84        5
31        4
73        3
89        3
10        2
34        2
74        2
87        2
17        1
22        1
26        1
32        1
12        1
Name: mesh, dtype: int64

In [18]:
# Share of MeSH concepts that appear as an object in the MeSH link graph.
mesh_concepts = set(mesh_links_g.objects())
unique_mesh_linked = len(mesh_concepts)
# total_mesh is the MeSH concept count computed from mesh_g earlier in this
# notebook; it replaces the hard-coded literal 12945 that duplicated that value.
percent_mesh_linked = unique_mesh_linked / total_mesh
percent_mesh_linked

0.10652761684047896

In [19]:
# Share of named-entity instances that appear as an object in the NE link graph.
ne_concepts = list(ne_links_g.objects())
unique_ne_linked = len(set(ne_concepts))
# total_ne is the list of named-entity instances collected from complete_g earlier
# in this notebook; its length replaces the hard-coded literal 10000.
percent_ne_linked = unique_ne_linked / len(total_ne)
percent_ne_linked

1558


0.1558

In [20]:
# Inspect specific cases: list the keywords whose MeSH link count is exactly 281.
print(new_df.loc[new_df['mesh'] == 281].index.values)

[rdflib.term.URIRef('http://example.org/hypothesis_ontology/ced093f3-164e-4a03-bbdd-5e43629bb78c#keyword')
 rdflib.term.URIRef('http://example.org/hypothesis_ontology/50d5965d-86a5-4dab-a2be-2ed66b70a56f#keyword')
 rdflib.term.URIRef('http://example.org/hypothesis_ontology/36e489fc-6558-4ddc-b37a-d8e8a260273d#keyword')
 rdflib.term.URIRef('http://example.org/hypothesis_ontology/6fbb6152-2d91-4583-a974-49b671af1c2a#keyword')
 rdflib.term.URIRef('http://example.org/hypothesis_ontology/05d4b17b-7a30-41be-82d1-1aeee5fd68e8#keyword')
 rdflib.term.URIRef('http://example.org/hypothesis_ontology/ae601a76-31ea-438c-a691-2f46d1634e3f#keyword')]


In [21]:
# value_counts() gives how often each link count occurs, so this prints the median
# of those frequencies - not the median number of links per keyword
# (that would be new_df['ne'].median()).
# NOTE(review): confirm which statistic is intended before reporting this number.
print("Median of named entity links:", new_df.loc[:, 'ne'].value_counts().median())

Median of named entity links: 38.0


In [22]:
# Frequency table: number of keywords for each distinct named-entity link count.
new_df['ne'].value_counts()

0    6008
1    1149
2     216
3      38
4       8
6       1
5       1
Name: ne, dtype: int64

In [23]:
# Share of MeSH concepts that appear as an object in the MeSH link graph
# (this cell repeats the coverage computation from earlier in the notebook).
mesh_concepts = set(mesh_links_g.objects())
unique_mesh_linked = len(mesh_concepts)
# total_mesh is the MeSH concept count computed from mesh_g earlier in this
# notebook; it replaces the hard-coded literal 12945 that duplicated that value.
percent_mesh_linked = unique_mesh_linked / total_mesh
percent_mesh_linked

0.10652761684047896

In [24]:
# Fraction of keywords that have at least one link (MeSH or named entity).
# `count` is the number of keywords with zero links of both kinds; the denominator
# is taken from kw_link_count itself instead of the hard-coded literal 7421.
count = sum(
    1 for mesh_n, ne_n in kw_link_count.values() if mesh_n == 0 and ne_n == 0
)
print(1 - count / len(kw_link_count))

0.2925481741005256


In [38]:
# get the top linked mesh concepts
# Tally how many skos:related triples point at each object in the MeSH link graph.
mesh_concepts_links = defaultdict(int)
for subj, pred, concept in mesh_links_g.triples((None, skos_namespace.related, None)):
    mesh_concepts_links[concept] += 1

# (concept, count) pairs ordered by descending count.
sorted_mesh_links = sorted(mesh_concepts_links.items(), key=lambda pair: pair[1], reverse=True)

# Print the rdfs:label of each of the ten most-linked concepts. Labels are printed in
# the order mesh_g yields its label triples, not in ranking order.
_top_mesh = {concept for concept, _ in sorted_mesh_links[:10]}
for subj, pred, label in mesh_g.triples((None, RDFS.label, None)):
    if subj in _top_mesh:
        print(label)

Cells, Immobilized
RNA, Transfer, Pro
Viruses, Unclassified
Proteins
Cells
RNA, Transfer, His
Disease
Cells, Cultured
RNA, Antisense
Viruses


In [44]:
# get the top linked named entity concepts
# Tally how many skos:related triples point at each object in the NE link graph.
ne_concepts_links = defaultdict(int)
for subj, pred, concept in ne_links_g.triples((None, skos_namespace.related, None)):
    ne_concepts_links[concept] += 1

# (concept, count) pairs ordered by descending count.
sorted_ne_links = sorted(ne_concepts_links.items(), key=lambda pair: pair[1], reverse=True)

# Print the oa:exact value of each of the ten most-linked entities. Values are printed
# in the order complete_g yields the oa:exact triples, not in ranking order.
_top_ne = {concept for concept, _ in sorted_ne_links[:10]}
for subj, pred, exact_text in complete_g.triples((None, oa_namespace.exact, None)):
    if subj in _top_ne:
        print(exact_text)

ACE
PMWS
gene
OAS
pDC
epithelial
CoV
hypothalamus-pituitary-adrenal (HPA) axis
HCV
ubiquitin
