In [1]:
from rdflib import Graph, Literal, Namespace, RDF, URIRef

In [17]:
def create_graph(graph_file):
    """Parse a Turtle document into a fresh rdflib ``Graph``.

    graph_file: source of the Turtle data, handed unchanged to ``Graph.parse``.

    Returns the newly populated graph.
    """
    graph = Graph()
    graph.parse(graph_file, format="turtle")
    return graph

# Locations of the two Turtle inputs.
kw_file = "./ParsingSpaqrlIntoTTLFile/Output-Graphs/HypothesisKeywordInstances.ttl"
mesh_file = "./mesh/mesh_graph.ttl"

# Parse each file into its own graph.
kw = create_graph(kw_file)
mesh = create_graph(mesh_file)

In [3]:
# get the namespaces for running the sparql queries in python
def create_namespace(graph, namespace, prefix):
    """Bind ``prefix`` to ``namespace`` on ``graph`` and return the namespace.

    graph: graph whose namespace manager receives the binding.
    namespace: namespace IRI, given as a string.
    prefix: short prefix to associate with that IRI.

    Returns an rdflib ``Namespace`` built from ``namespace``.
    """
    bound_ns = Namespace(namespace)
    graph.namespace_manager.bind(prefix, namespace)
    return bound_ns


# Bind the hyp: and oa: prefixes on the keyword graph.
hyp_namespace = create_namespace(
    graph=kw,
    namespace="http://example.org/hypothesis_ontology/",
    prefix='hyp',
)
oa_namespace = create_namespace(
    graph=kw,
    namespace="http://www.w3.org/ns/oa#",
    prefix='oa',
)

In [4]:
# Select each keyword node (?kw), its oa:hasTarget value (?literal) and the
# oa:hasSource (?abstract) of the hypothesis that contains the keyword.
kw_q = kw.query(
"""
prefix dct: <http://purl.org/dc/terms/> 
prefix hyp: <http://example.org/hypothesis_ontology/> 
prefix oa: <http://www.w3.org/ns/oa#>
SELECT ?kw ?literal ?abstract
WHERE {
  ?hyp oa:hasSource ?abstract; hyp:contains ?kw.
  ?kw oa:hasTarget ?literal.
}
"""
)

# One plain dict per result row.
kw_data = [
    {'kw_id': row.kw, 'literal': row.literal, 'abstract': row.abstract}
    for row in kw_q
]

In [5]:
def _dedupe_keywords(records):
    """Return ``records`` with one entry kept per (abstract, literal) pair.

    records: iterable of dicts with at least the keys 'abstract' and 'literal'.

    The first record seen for each pair is kept and the input order is
    preserved. The input is not modified. A set of already-seen pairs makes
    this a single pass instead of comparing every record with every other one.
    """
    seen = set()
    unique = []
    for record in records:
        key = (record['abstract'], record['literal'])
        if key in seen:
            continue
        seen.add(key)
        unique.append(record)
    return unique


# Building a new list avoids removing items from kw_data while looping over it,
# which skips elements. The helper's loop variable also no longer overwrites
# the `kw` graph.
kw_data = _dedupe_keywords(kw_data)

In [18]:
# get mesh data
mesh_q = mesh.query(
"""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 

SELECT ?mesh_id ?literal WHERE { 
  ?mesh_id rdfs:label ?literal.
} 

""")
# One dict per (subject, rdfs:label) row.
mesh_data = [
    {'mesh_id': row.mesh_id, 'literal': row.literal}
    for row in mesh_q
]

In [12]:
# Link a keyword to a MeSH record when any word of the keyword (3+ characters)
# occurs as a substring of the MeSH label.
g = Graph()
skos_namespace = create_namespace(g, "http://www.w3.org/2004/02/skos/core#", 'skos')

# Convert each label to str once, not once per keyword word, and drop labels
# shorter than 3 characters up front.
mesh_labels = [
    (mesh_record['mesh_id'], str(mesh_record['literal']))
    for mesh_record in mesh_data
]
mesh_labels = [(mesh_id, label) for mesh_id, label in mesh_labels if len(label) >= 3]

# The loop variables are deliberately not called `kw` / `mesh`: those names
# hold the parsed graphs, and rebinding them breaks re-running earlier cells.
for kw_record in kw_data:
    kw_text = str(kw_record['literal'])
    if len(kw_text) < 3:
        continue
    for word in kw_text.split(' '):
        if len(word) < 3:
            continue
        for mesh_id, label in mesh_labels:
            # Substring test, so a word also matches inside a longer label word.
            if word in label:
                g.add((kw_record['kw_id'], skos_namespace.related, mesh_id))

g.serialize('./Output-Graphs/mesh_links_contain.ttl', format="turtle")
print(len(g))

497006


NOTE:
    <http://example.org/hypothesis_ontology/05b3c5d3-2e57-44dc-bfb4-0067b362d177#keyword> skos:related <http://id.nlm.nih.gov/mesh/M000662497>,
        
    kw = "plasma"
    many of the mesh labels are 'Mycoplasma'
    
This keyword has over 100 related links, and I don't think they are all valid (see the example above, where "plasma" matches inside "Mycoplasma"). Instead, I will split each MeSH label into a list of terms and check whether any of those terms appears in the keyword.

In [19]:
g2 = Graph()
skos_namespace = create_namespace(g2, "http://www.w3.org/2004/02/skos/core#", 'skos')

# Replace each MeSH label with the list of its lower-cased, comma-separated parts.
# The loop variable is not called `mesh`, so the parsed `mesh` graph stays bound.
for mesh_record in mesh_data:
    label = mesh_record['literal']
    if isinstance(label, list):
        # Already converted by an earlier run of this cell; skipping keeps the
        # cell re-runnable (a list has no .split).
        continue
    if len(str(label)) < 3:
        # Labels shorter than 3 characters are left unchanged.
        continue
    # NOTE(review): parts after a comma keep their leading space (e.g. ' human'),
    # so they can never equal a single keyword word -- confirm whether they
    # should be stripped.
    mesh_record['literal'] = [part.lower() for part in label.split(',')]

In [20]:
# Preview the first ten labels. The loop variable is not called `mesh`, so the
# parsed `mesh` graph is not overwritten.
for mesh_record in mesh_data[:10]:
    print(mesh_record['literal'])

['dna repair-deficiency disorders']
['cyclosporine metabolite m21']
['mannose-1-phosphate guanylyltransferase (gdp)']
['alpha-guanidinoglutaric acid']
['novo-pyrexal']
['receptor', ' nerve growth factor']
['alpha-o-methyllanosol']
['alpha', 'alpha-trehalose-phosphate synthase(gdp-forming)']
['nsg1 protein', ' human']
['bacterial leucyl aminopeptidase']


In [24]:
# Link a keyword to a MeSH record when one of the record's terms (3+ characters)
# is exactly equal to one of the keyword's lower-cased, space-separated words.
g2 = Graph()
skos_namespace = create_namespace(g2, "http://www.w3.org/2004/02/skos/core#", 'skos')

# Index built once: term -> ids of the MeSH records listing that term.
# This replaces a scan of every MeSH record for every keyword.
mesh_ids_by_term = {}
for mesh_record in mesh_data:
    # Iterates the stored 'literal' value; if it is still a plain string this
    # yields single characters, which the length filter drops.
    for term in mesh_record['literal']:
        if len(term) < 3:
            continue
        mesh_ids_by_term.setdefault(term, []).append(mesh_record['mesh_id'])

# The loop variables are deliberately not called `kw` / `mesh`: those names
# hold the parsed graphs.
for kw_record in kw_data:
    kw_text = str(kw_record['literal'])
    if len(kw_text) < 3:
        continue
    # set() drops repeated words; the graph would ignore duplicate triples anyway.
    for token in set(kw_text.lower().split(' ')):
        for mesh_id in mesh_ids_by_term.get(token, ()):
            g2.add((kw_record['kw_id'], skos_namespace.related, mesh_id))

g2.serialize('./Output-Graphs/mesh_links_contain3.ttl', format="turtle")
print(len(g2))

8988


<http://id.nlm.nih.gov/mesh/M0388427> rdfs:label "acetato(N,N,N',N'-tetrakis(2-pyridylmethyl)-1,3-diamino-2-propanol)dizinc"@en .
        
        - any keyword containing "N" will be linked to this label
        

or this: 

<http://id.nlm.nih.gov/mesh/M000636134> rdfs:label "benzyl 2-((6,9alpha-dimethyl-3-methylene-2-oxo-2,3,3alpha,4,5,8,9,9a,10alpha,10beta-decahydrooxireno (2',3':9,10)cyclodeca(1,2-b)furan-5-yl)oxy)-2-phenylacetate"@en .

It is catching the "5" in this keyword:
        <http://example.org/hypothesis_ontology/035ff86c-f268-45cf-82f4-59830e035aa2#keyword> oa:hasTarget "the actual European CCR5 allelic frequencies" .
