# Analyse Medical Subject Headings (MeSH)

### Importing Required Libraries

In this cell, we import the necessary libraries for our code. The libraries we import are:

- `rdflib`: A library for working with RDF (Resource Description Framework) data.
- `Graph`: A class from `rdflib` that represents an RDF graph.
- `RDFS`: A namespace from `rdflib` that provides access to RDF Schema vocabulary.
- `URIRef`: A class from `rdflib` that represents a URI reference.
- `numpy`: A library for working with arrays and matrices.
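- `urllib3`: A library for making HTTP requests.
- `json`: The standard-library module used to write the intermediate and final dictionaries to JSON files.
- `deque`: A double-ended queue from `collections`, used for the breadth-first traversal of broader topics.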

These libraries are required for the code to function properly.

In [1]:
from rdflib import Graph
from rdflib.namespace import RDFS
from rdflib import URIRef
import rdflib
import json
from collections import deque
import numpy as np
import urllib3

In [2]:
input_file = "mesh2024.nt"  # path to the MeSH RDF file; it is parsed below with format="nt" (N-Triples)

### Creating RDF Graph

In the following, we create an RDF graph using the `rdflib` library. The graph is initialised as an empty graph by calling the `Graph()` constructor; the MeSH triples are loaded into it in the next cells.

In [3]:
# Empty rdflib graph; the MeSH triples are parsed into it below.
g = Graph()

In [4]:
# Load the N-Triples file into the graph.
g.parse(input_file, format="nt")

# Write a Turtle copy of the parsed graph to disk.
with open('mesh2024graph.ttl', 'wb') as turtle_out:
    g.serialize(turtle_out, format='turtle')

In [5]:
# NOTE(review): this parses 'mesh2023graph.ttl' into the same graph `g`, while
# the previous cell writes 'mesh2024graph.ttl' -- confirm that loading the 2023
# file on top of the 2024 data is intended.
g.parse('mesh2023graph.ttl', format='turtle')

In [6]:
# Topical-descriptor ID -> preferred-concept ID (filled from the query below).
topicalDescriptors = {}

### Querying Topical Descriptors

In this cell, we execute a SPARQL query to retrieve the distinct pairs of Topical Descriptors and their corresponding preferred Concepts from the RDF graph.

The query selects the Topical Descriptors (`?td`) that are of type `<http://id.nlm.nih.gov/mesh/vocab#TopicalDescriptor>` and have a preferred Concept (`?concept`).

The result of the query is stored in the variable `qres`.

Here's the code:

In [7]:
# SPARQL text: each topical descriptor together with its preferred concept.
topical_descriptor_query = """SELECT DISTINCT ?td ?concept
       WHERE {
          ?td a <http://id.nlm.nih.gov/mesh/vocab#TopicalDescriptor> .
          ?td <http://id.nlm.nih.gov/mesh/vocab#preferredConcept> ?concept .
       }"""

qres = g.query(topical_descriptor_query)

In [8]:
def gfp(string):
    """Return the text after the last '/' in *string*.

    When *string* contains no '/', the whole value is returned; when it ends
    with '/', the result is the empty string.
    """
    pieces = string.rsplit('/', 1)
    return pieces[-1]

In [9]:
def tp(string):
    """Return *string* converted to lower case."""
    lowered = string.lower()
    return lowered

In [10]:
# Map each topical-descriptor ID to the ID of its preferred concept; both IDs
# are the last path segment of the corresponding URI.
for row in qres:
    topicalDescriptors[gfp(row[0])] = gfp(row[1])

# Persist the mapping next to the other intermediate dictionaries: the RELOAD
# cell reads "mesh-topicalDescriptors.json", so it has to be written here.
with open("mesh-topicalDescriptors.json", "w") as ff:
    json.dump(topicalDescriptors, ff, indent=4)

In [11]:
# Number of topical descriptors collected.
print(len(topicalDescriptors))

In [14]:
# Concept ID -> dict of the concept properties collected below.
concepts = {}

In [15]:
# GET CONCEPTS
#
# For every MeSH concept, collect:
#   "relatedConcept": list of related concept IDs
#   "term":           list of term IDs
#   "preferredTerm":  ID of the preferred term
#   "label":          the rdfs:label text

qres = g.query(
"""SELECT ?sub ?pred ?obj
   WHERE {
      ?sub a <http://id.nlm.nih.gov/mesh/vocab#Concept> .
      ?sub ?pred ?obj .
   }""")


for row in qres:
    concept = gfp(row[0])
    predicate = str(row[1])
    entry = concepts.setdefault(concept, dict())

    # The first three predicates point at other resources, so only the last
    # URI segment (the ID) is kept.
    if predicate == "http://id.nlm.nih.gov/mesh/vocab#relatedConcept":
        entry.setdefault("relatedConcept", list()).append(gfp(row[2]))

    elif predicate == "http://id.nlm.nih.gov/mesh/vocab#term":
        entry.setdefault("term", list()).append(gfp(row[2]))

    elif predicate == "http://id.nlm.nih.gov/mesh/vocab#preferredTerm":
        entry["preferredTerm"] = gfp(row[2])

    elif predicate == "http://www.w3.org/2000/01/rdf-schema#label":
        # The label is plain text, not a URI: keep it whole. Splitting it on
        # '/' would truncate any label that contains a slash.
        entry["label"] = str(row[2])

print(len(concepts))

with open("mesh-concepts.json", "w") as ff:
    json.dump(concepts, ff, indent=4)

#Print samples
print(concepts["M000616231"])

In [16]:
# GET MESH TERMS

qres = g.query(
    """SELECT DISTINCT ?term ?label
       WHERE {
          ?term a <http://id.nlm.nih.gov/mesh/vocab#Term> .
          ?term <http://id.nlm.nih.gov/mesh/vocab#prefLabel> ?label .
          
          FILTER (lang(?label) = 'en')
       }""")

# Term ID -> preferred label (the query keeps only labels tagged 'en').
meshTerms = {gfp(row[0]): row[1] for row in qres}

len(meshTerms)

with open("mesh-terms.json", "w") as terms_out:
    json.dump(meshTerms, terms_out, indent=4)

#sample print
print(meshTerms["T007269"])

In [17]:
# GET ALTERNATIVE TERMS
# Term ID -> list of alternative labels (the query keeps only labels tagged 'en').
altMeshTerms = dict()
qres = g.query(
    """SELECT DISTINCT ?term ?label
       WHERE {
          ?term a <http://id.nlm.nih.gov/mesh/vocab#Term> .
          ?term <http://id.nlm.nih.gov/mesh/vocab#altLabel> ?label .

          FILTER (lang(?label) = 'en')
       }""")

for row in qres:
    altMeshTerms.setdefault(gfp(row[0]), list()).append(row[1])

# Write the alternative labels themselves; dumping meshTerms here would make
# "mesh-alt-terms.json" a copy of "mesh-terms.json".
with open("mesh-alt-terms.json", "w") as ff:
    json.dump(altMeshTerms, ff, indent=4)

print(altMeshTerms["T000013"])

In [18]:
# OPTIONAL: RESTORE THE INTERMEDIATE DICTIONARIES FROM THEIR JSON FILES

RELOAD = False


def _read_json(path):
    """Return the decoded content of the JSON file at *path*."""
    with open(path, 'r') as handle:
        return json.load(handle)


if RELOAD:
    # NOTE(review): "mesh-topicalDescriptors.json" must already exist on disk;
    # confirm that an earlier step writes it before enabling RELOAD.
    topicalDescriptors = _read_json("mesh-topicalDescriptors.json")
    concepts = _read_json("mesh-concepts.json")
    meshTerms = _read_json("mesh-terms.json")
    altMeshTerms = _read_json("mesh-alt-terms.json")

In [19]:
# Output containers, all filled in later cells and assembled into the final
# `mesh` dictionary. The "_wu" variants hold labels with underscores in place
# of spaces.
mesh_topics = {}
mesh_topics_wu = {}
mesh_broaders = {}
mesh_narrowers = {}
mesh_same_as = {}
mesh_primary_labels = {}
mesh_primary_labels_wu = {}
mesh_topic_stems = {}

In [20]:
# BUILD THE BROADER / NARROWER MAPS BETWEEN DESCRIPTOR IDS

narrowers = {}
broaders = {}

bd = rdflib.term.URIRef("http://id.nlm.nih.gov/mesh/vocab#broaderDescriptor")

# Each matching triple reads: <descriptor> broaderDescriptor <its broader descriptor>.
for child_uri, predicate, parent_uri in g.triples((None, bd, None)):
    parent_id = gfp(parent_uri)
    child_id = gfp(child_uri)
    narrowers.setdefault(parent_id, []).append(child_id)
    broaders.setdefault(child_id, []).append(parent_id)

# Drop duplicate IDs (np.unique also returns them sorted).
for mapping in (broaders, narrowers):
    for descriptor_id in mapping:
        mapping[descriptor_id] = list(np.unique(mapping[descriptor_id]))

print(len(broaders))
print(len(narrowers))

for path, mapping in (("mesh-broaders.json", broaders), ("mesh-narrowers.json", narrowers)):
    with open(path, "w") as out_file:
        json.dump(mapping, out_file, indent=4)

In [21]:
def glotd(key, tp, cs, mt): #get_label_of_topicalDescriptor
    """Return the lower-cased label of the topical descriptor *key*.

    Args:
        key: ID of the topical descriptor.
        tp: topicalDescriptors, mapping descriptor ID -> preferred concept ID.
        cs: concepts, mapping concept ID -> dict that may contain
            "preferredTerm" (a term ID) and/or "label".
        mt: meshTerms, mapping term ID -> term label.

    Returns:
        The label of the concept's preferred term when the concept has one,
        otherwise the concept's own "label"; lower-cased in both cases.

    Raises:
        KeyError: if *key*, its concept, the preferred term, or the fallback
            "label" is missing from the corresponding mapping.
    """
    # Use the mapping that was passed in rather than a module-level global, so
    # the function works with whatever dictionaries the caller provides.
    concept_entry = cs[tp[key]]
    if "preferredTerm" in concept_entry:
        return str(mt[concept_entry["preferredTerm"]]).lower()
    return str(concept_entry["label"]).lower()
# Sample: print the lower-cased label resolved for descriptor "D002493".
print(glotd("D002493", topicalDescriptors, concepts, meshTerms))

In [22]:
# SAME-AS and PRIMARY LABELS CONSTRUCTION


def _term_labels(term_ids):
    """Lower-cased labels of *term_ids*, followed by their alternative labels."""
    collected = [str(meshTerms[term_id]).lower() for term_id in term_ids]
    for term_id in term_ids:
        if term_id in altMeshTerms:
            collected += [str(alt).lower() for alt in altMeshTerms[term_id]]
    return collected


# Descriptor label -> every other label found for the same descriptor.
sim_labels = dict()
for key, value in topicalDescriptors.items():
    t_label = glotd(key, topicalDescriptors, concepts, meshTerms)
    descriptor_concept = concepts[value]
    synonyms = list()

    # Alternative labels of the preferred term.
    preferred_term = descriptor_concept["preferredTerm"]
    if preferred_term in altMeshTerms:
        synonyms += [str(alt).lower() for alt in altMeshTerms[preferred_term]]

    # Labels of the concept's additional terms.
    if "term" in descriptor_concept:
        synonyms += _term_labels(descriptor_concept["term"])

    # Labels of related concepts and of their terms.
    for related_id in descriptor_concept.get("relatedConcept", ()):
        related_concept = concepts[related_id]
        synonyms.append(related_concept["label"].lower())
        if "term" in related_concept:
            synonyms += _term_labels(related_concept["term"])

    # Only labels that have at least one synonym are kept.
    if synonyms:
        sim_labels[t_label] = synonyms
    else:
        sim_labels.pop(t_label, None)

# Drop duplicate synonyms (np.unique also returns them sorted).
for key in sim_labels:
    sim_labels[key] = list(np.unique(sim_labels[key]))

#sample print
print(sim_labels["prion proteins"])
with open("mesh-sim-labels.json", "w") as labels_out:
    json.dump(sim_labels, labels_out, indent=4)

In [23]:
# same_as:        label -> the other labels of its group
# primary_labels: label -> the group's primary (descriptor) label
same_as = dict()
primary_labels = dict()
for key, value in sim_labels.items():
    group = [*value, key]
    for item in group:
        primary_labels[item] = key
        same_as[item] = [other for other in group if other != item]

print(primary_labels["prp proteins"])
print(same_as["prp proteins"])

In [24]:
# Sample: labels recorded as equivalent to "abdomen, acute".
print(same_as["abdomen, acute"])

In [25]:
# Re-key the ID-based narrower/broader maps by descriptor label.
# Best effort: an entry is skipped when the label of the descriptor, or of any
# descriptor in its list, cannot be resolved. The lookups raise KeyError in
# that case, so only KeyError is caught and any other error stays visible.
for key, value in narrowers.items():
    try:
        mesh_narrowers[glotd(key, topicalDescriptors, concepts, meshTerms)] = [glotd(k, topicalDescriptors, concepts, meshTerms) for k in value]
    except KeyError:
        continue

for key, value in broaders.items():
    try:
        mesh_broaders[glotd(key, topicalDescriptors, concepts, meshTerms)] = [glotd(k, topicalDescriptors, concepts, meshTerms) for k in value]
    except KeyError:
        continue

# Drop duplicate labels (np.unique also returns them sorted).
for key, value in mesh_narrowers.items():
    mesh_narrowers[key] = list(np.unique(value))
for key, value in mesh_broaders.items():
    mesh_broaders[key] = list(np.unique(value))

mesh_same_as = same_as
mesh_primary_labels = primary_labels
# Underscore variant: spaces replaced by "_" in both the label and its primary label.
for key, value in mesh_primary_labels.items():
    mesh_primary_labels_wu[key.replace(" ", "_")] = value.replace(" ", "_")

In [26]:
#GETTING LABELS

# Every label that appears as a key in the same-as, narrower or broader maps.
label_pool = set(mesh_same_as).union(set(mesh_narrowers), set(mesh_broaders))
labels = list(label_pool)
for label in labels:
    mesh_topics_wu[label.replace(" ", "_")] = label
    mesh_topics[label] = True

print(len(labels))

In [27]:
#GETTING TOPIC STEMS

# Group the topics by their first four characters.
for topic in mesh_topics:
    mesh_topic_stems.setdefault(topic[:4], list()).append(topic)

In [28]:
#GETTING ALL BROADERS

def get_all_brach(bb, topic):
    """Return every topic reachable from *topic* by following broader links.

    Args:
        bb: mapping topic -> iterable of its direct broader topics.
        topic: the topic to start from.

    Returns:
        A list (in no particular order, without duplicates) of all direct and
        indirect broaders of *topic*. It is empty when *topic* has no entry in
        *bb*. *topic* itself is included only if it is reachable again through
        a cycle.
    """
    seen = set()
    queue = deque([topic])
    while queue:
        current = queue.popleft()
        for broader in bb.get(current, ()):
            # Expand each topic once: this keeps the traversal finite when the
            # links contain a cycle and avoids re-walking shared ancestors.
            if broader not in seen:
                seen.add(broader)
                queue.append(broader)

    return list(seen)

# Label -> all direct and indirect broader labels.
# No try/except here: get_all_brach only reads the mapping, and a blanket
# `except: pass` would hide real errors (and swallow KeyboardInterrupt).
mesh_all_broaders = dict()
for key in mesh_broaders:
    mesh_all_broaders[key] = get_all_brach(mesh_broaders, key)
    if key == "hiv infections":
        print(mesh_all_broaders[key])


#print sample
if mesh_all_broaders:
    print(mesh_all_broaders[next(iter(mesh_all_broaders))])

In [29]:
# Inspect the relations recorded for one topic.
topic = "hyperhidrosis"#"hiv infections"
for template, relation in (
    ("broaders: {}", mesh_broaders),
    ("narrowers: {}", mesh_narrowers),
    ("ALL BROADERS: {}", mesh_all_broaders),
):
    print(template.format(relation[topic]))

In [30]:
# Bundle every computed structure into the single dictionary that is exported.
mesh = {
    "topics": mesh_topics,
    "_topics": mesh_topics_wu,
    "broaders": mesh_broaders,
    "narrowers": mesh_narrowers,
    "same_as": mesh_same_as,
    "primary_labels": mesh_primary_labels,
    "_primary_labels": mesh_primary_labels_wu,
    "topic_stems": mesh_topic_stems,
    "all_broaders": mesh_all_broaders,
}

In [33]:
# Write the assembled dictionary to disk as JSON.
with open('mesh22023-11.json','w') as export_file:
    json.dump(mesh, export_file)