In [83]:
from rdflib import Graph, Literal, Namespace, RDF, URIRef

In [84]:
# load the keyword instances graph and bnode graph because this is the data we want to link (decided to focus ont these two for now and then later attempt with mesh)
kw = Graph()
ne = Graph()
kw.parse("./ParsingSpaqrlIntoTTLFile/Output-Graphs/HypothesisKeywordInstances.ttl", format="turtle")
ne.parse("./ParsingSpaqrlIntoTTLFile/Output-Graphs/bnode_graph_new.ttl", format="turtle")

<Graph identifier=N7f021a97a6fc43d5b3883895ac1d3cc6 (<class 'rdflib.graph.Graph'>)>

In [85]:
# get the namespaces for running the sparql queries in python
def create_namespace(graph, namespace, prefix):

    ns = Namespace(namespace)
    graph.namespace_manager.bind(prefix, namespace)
    
    return ns


hyp_namespace = create_namespace(ne, "http://example.org/hypothesis_ontology/", 'hyp')
oa_namespace = create_namespace(ne, "http://www.w3.org/ns/oa#", 'oa')

hyp_namespace = create_namespace(kw, "http://example.org/hypothesis_ontology/", 'hyp')
oa_namespace = create_namespace(kw, "http://www.w3.org/ns/oa#", 'oa')

# covidpr_namespace = create_namespace(bnode_g, "http://ns.inria.fr/covid19/property/", 'covidpr')
# dct_namespace = create_namespace(bnode_g, "http://purl.org/dc/terms/", 'dct')
# covid_namespace = create_namespace(bnode_g, "http://ns.inria.fr/covid19/", 'covid')
# prov_namespace = create_namespace(bnode_g, 'http://www.w3.org/TR/prov-o/', 'prov')

Notes: From this point, I was trying to get the data from the graph and retrieve it in a way that makes it easier to compare the literals of the instances. First I though going through the triples is a possible way, but now I think performing sparql queries and organzing the retrieved data might be more promising... I decided to try storing the info in dictionaries for now and then my idea was to loop through them both and perform some comparing methods. Through this, I would store the  keyword uri and the named entity uri that shared similar literals in a tuple. Finally, I would create triples of these with rdflib through g.add((keywordURI, skos.related, namedentityURI)) and save these to a turtle file. 

In [86]:
def get_abstracts(graph):
    abstracts = []
    for s, p, o in graph.triples( (None, oa_namespace.hasSource, None) ):
        keyword = graph.value(s,p)
        abstracts.append(keyword)
    return abstracts

# get_abstracts(kw)

In [87]:
len(set(get_abstracts(kw)))

840

In [88]:
ne_q = ne.query(
"""
prefix dct: <http://purl.org/dc/terms/> 
prefix hyp: <http://example.org/hypothesis_ontology/> 
prefix oa: <http://www.w3.org/ns/oa#>
SELECT ?ne ?literal ?abstract
WHERE {
  ?ne oa:hasSource ?abstract; oa:exact ?literal.
}
"""
)

ne_data = []
for i in ne_q:
    ne_dict = dict()
    ne_dict['ne_id'] = i.ne
    ne_dict['literal'] = i.literal
    ne_dict['abstract'] = i.abstract
    ne_data.append(ne_dict)

In [89]:
kw_q = kw.query(
"""
prefix dct: <http://purl.org/dc/terms/> 
prefix hyp: <http://example.org/hypothesis_ontology/> 
prefix oa: <http://www.w3.org/ns/oa#>
SELECT ?kw ?literal ?abstract
WHERE {
  ?hyp oa:hasSource ?abstract; hyp:contains ?kw.
  ?kw oa:hasTarget ?literal.
}
"""
)

kw_data = []
for i in kw_q:
    kw_dict = dict()
    kw_dict['kw_id'] = i.kw
    kw_dict['literal'] = i.literal
    kw_dict['abstract'] = i.abstract
    kw_data.append(kw_dict)

In [90]:
ne_data[10]['literal']

rdflib.term.Literal('RT-PCR')

In [91]:
print(len(ne_data))

9996


In [92]:
print(len(kw_data))

5710


### Filtering duplicates

Here I delete duplicates both in ne_data and kw_data. Basically I check for every pair (ne, ne2) from ne_data if literals, abstracts are the same and if ids are different. In that case we delete the second one, because of information redundancy. Same thing is done for kw_data.

In [93]:
for ne in ne_data:
    for ne2 in ne_data:
        if ne['abstract'] == ne2['abstract'] and ne['literal'] == ne2['literal'] and ne['ne_id'] != ne2['ne_id']:
            ne_data.remove(ne2)
            
print(len(ne_data))

8058


In [95]:
for kw in kw_data:
    for kw2 in kw_data:
        if kw['abstract'] == kw2['abstract'] and kw['literal'] == kw2['literal'] and kw['kw_id'] != kw2['kw_id']:
            kw_data.remove(kw2)
            
print(len(kw_data))

5593


## Adding skos triples

In every case I add skos.related triples only for keywords and named entities that have lenght of literal >= 3. It filters out keywords or named entities that represent pointless nodes (like literals: " ", "-", "", "OC" etc.).
Algorithm works like:
1. For every keyword that has length >=3
2. For every named entity that has length >=3
3. Check if skos.related applies to a pair (keyword, named entity).

### When we consider skos.related if literals are exactly the same.

Here I assume that keyword and named entities HAVE TO BE from the same abstract. In case they are from the same abstract, skos.related appears only in case literals are exactly the same.

In [96]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if kw['abstract'] != ne['abstract']:
            continue
        if str(kw['literal']) == str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_same_abstract_equal.ttl', format="turtle")
print(len(g))

433


Here I assume that keyword and named entities DOES NOT HAVE TO BE from the same abstract. skos.related appears only in case literals are exactly the same.

In [97]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if str(kw['literal']) == str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_equal.ttl', format="turtle")
print(len(g))

6987


### When we consider skos.related if one of the literals is contained in another one.

Here I assume that keyword and named entities HAVE TO BE from the same abstract. In case they are from the same abstract, skos.related appears only in case literal of keyword is inside literal of named entity, or named entity literal is inside keyword literal.

In [98]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if kw['abstract'] != ne['abstract']:
            continue
        if str(kw['literal']) in str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
        elif str(ne['literal']) in str(kw['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_same_abstract_contain.ttl', format="turtle")
print(len(g))

1790


Here I assume that keyword and named entities DOES NOT HAVE TO BE from the same abstract. skos.related appears only in case literal of keyword is inside literal of named entity, or named entity literal is inside keyword literal.

In [99]:
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')
for kw in kw_data:
    if len(str(kw['literal'])) < 3:
        continue
    for ne in ne_data:
        if len(str(ne['literal'])) < 3:
            continue
        if str(kw['literal']) in str(ne['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
        elif str(ne['literal']) in str(kw['literal']):
            g.add((kw['kw_id'],skos_namespace.related, ne['ne_id']))
            
g.serialize('./Output-Graphs/skos_contain.ttl', format="turtle")
print(len(g))

92880
