In [2]:
!pip install neo4j



In [1]:
from neo4j import GraphDatabase
import pandas as pd

In [17]:
uri = "neo4j://localhost:7687"  # Replace with your URI 
user = "neo4j"                 # Replace with your username
password = "19091998"          # Replace with your password

driver = GraphDatabase.driver(uri, auth=(user, password))

In [18]:
class Neo4jConnection:
    
    def __init__(self, uri, user, password):
        self.__uri = uri
        self.__user = user
        self.__password = password
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__password))
        except Exception as e:
            print("Failed to create the driver:", e)
        
    def close(self):
        if self.__driver is not None:
            self.__driver.close()
        
    def execute_query(self, query):
        if self.__driver is not None:
            with self.__driver.session() as session:
                return [record for record in session.run(query)]


In [19]:
# label fields in the snomed ct graph

q = """ MATCH (n:ObjectConcept)WHERE (n.FSN ENDS WITH '(finding)' or n.FSN ENDS WITH '(disorder)' )
        SET n:SP
        RETURN n LIMIT 10 """

In [27]:
conn = Neo4jConnection(uri, user, password)

query = """ MATCH (c:SP_ALL)-[:HAS_DESCRIPTION]->(d:Description)
            WHERE d.descriptionType = 'Synonym'
            RETURN c.id AS ConceptId, c.FSN AS FSN_concept, d.term AS Synonym
"""

results = conn.execute_query(query)

results[0]['Synonym']

'Local excision of lesion or tissue of patella'

In [28]:
r_df = pd.DataFrame(results, columns=['ConceptId', 'FSN_concept', 'Synonym'])
r_df.to_csv('SP_ALL_EXT.csv')

In [None]:
# Run faiss indexing on all the synonyms, find most similar word

In [None]:
# find node belonging to the synonym

In [18]:
# Find shortest path between concepts 
query = """ MATCH p = shortestPath((start:SP {id: "249473004"})-[*..10]-(end:SP {id: "102608004"}))
            RETURN length(p) as distance """

results = conn.execute_query(query)

for record in results:
    print(record)

  with self.__driver.session() as session:


<Record distance=3>


In [20]:
query = """ WITH ["249473004","289161009", "223123009", "289163007", "64379006"] AS endNodeIds
            MATCH (start:SP {id: "102608004"}) 
            UNWIND endNodeIds AS endNodeId
            MATCH (end:SP {id: endNodeId})
            CALL {
                WITH start, end
                MATCH p = shortestPath((start)-[*..10]-(end))
                RETURN p
                ORDER BY length(p) ASC LIMIT 1
            }
            RETURN p, length(p) as distance, end.id as endNodeId, start.FSN, end.FSN as NameEndNode """

results = conn.execute_query(query)

results

  with self.__driver.session() as session:


[<Record p=<Path start=<Node element_id='4:0c414c0f-b769-4d23-ad25-650efdf1893b:99185' labels=frozenset({'Finding', 'SP', 'SPFindings', 'ObjectConcept'}) properties={'FSN': 'Excessive appetite (finding)', 'nodetype': 'concept', 'effectiveTime': '20080731', 'definitionStatusId': '900000000000073002', 'active': '1', 'history': '[{"id": "102608004", "effectiveTime": "20020131", "active": "1", "moduleId": "900000000000207008", "definitionStatusId": "900000000000074008"}]', 'id': '102608004', 'moduleId': '900000000000207008', 'sctid': '102608004'}> end=<Node element_id='4:0c414c0f-b769-4d23-ad25-650efdf1893b:240538' labels=frozenset({'Finding', 'SP', 'SPFindings', 'ObjectConcept'}) properties={'FSN': 'Altered appetite (finding)', 'nodetype': 'concept', 'effectiveTime': '20080731', 'definitionStatusId': '900000000000073002', 'active': '1', 'history': '[{"id": "249473004", "effectiveTime": "20020131", "active": "1", "moduleId": "900000000000207008", "definitionStatusId": "900000000000074008"}

In [None]:
# extract the shortest path and handle codes with the exact same length 

In [None]:
# examples of failed matchings in the app 

In [None]:
conn.close()

In [38]:
# extract the number of descendants and asc for each node 

conn = Neo4jConnection(uri, user, password)
query = """ MATCH (c:SP)
OPTIONAL MATCH (c)-[:ISA*]->(descendant:SP)
WITH c, COLLECT(DISTINCT descendant.id) AS DescendantIDs, COLLECT(DISTINCT descendant.FSN) AS DescendantFSN
RETURN c.id AS ConceptId, c.FSN AS FSN_concept,  SIZE(DescendantIDs) as num_desc, DescendantIDs, DescendantFSN
"""

results = conn.execute_query(query)
pd.DataFrame(results).to_csv('descendants.csv', index=False)
results[0]

<Record ConceptId='105000' FSN_concept='Poisoning by pharmaceutical excipient (disorder)' num_desc=8 DescendantIDs=['7895008', '55680006', '441952005', '75478009', '87858002', '25508008', '64572001', '404684003'] DescendantFSN=['Poisoning caused by drug AND/OR medicinal substance (disorder)', 'Drug overdose (disorder)', 'Poisoning caused by chemical substance (disorder)', 'Poisoning (disorder)', 'Drug-related disorder (disorder)', 'Pathological drug intoxication (disorder)', 'Disease (disorder)', 'Clinical finding (finding)']>

In [35]:
conn = Neo4jConnection(uri, user, password)
query = """ MATCH (c:SP)
OPTIONAL MATCH (ancestor:SP)-[:ISA*]->(c)
WITH c, COLLECT(DISTINCT ancestor.id) AS AncestorIDs, COLLECT(DISTINCT ancestor.FSN) AS AncestorFSN
RETURN c.id AS ConceptId, c.FSN AS FSN_concept,  SIZE(AncestorIDs) as num_anc, AncestorIDs, AncestorFSN
"""

results = conn.execute_query(query)
pd.DataFrame(results, columns=['ConceptId', 'FSN_concept', 'num_anc', 'AncestorIDs', 'AncestorFSN']).to_csv('ascendants.csv', index=False)
results[0]

<Record ConceptId='105000' FSN_concept='Poisoning by pharmaceutical excipient (disorder)' num_anc=0 AncestorIDs=[] AncestorFSN=[]>

In [40]:
ant_df = pd.read_csv('ascendants.csv')
des_df = pd.read_csv('descendants.csv')
ant_df

Unnamed: 0,0,1,2,3,4
0,105000,Poisoning by pharmaceutical excipient (disorder),0,[],[]
1,109006,Anxiety disorder of childhood OR adolescence (...,11,"['90790003', '83253003', '64165008', '53467004...",['Avoidant disorder of adolescence (disorder)'...
2,122003,Choroidal hemorrhage (disorder),13,"['193474002', '336061000119102', '341671000119...",['Choroidal hemorrhage and rupture (disorder)'...
3,123008,Channel catfish virus disease (disorder),0,[],[]
4,127009,Miscarriage with laceration of cervix (disorder),0,[],[]
...,...,...,...,...,...
179062,972604701000119104,Acquired arteriovenous malformation of vascula...,0,[],[]
179063,978253001000132109,Small bowel enteroscopy normal (finding),0,[],[]
179064,985355341000119101,Malignant melanoma of skin of left wrist (diso...,0,[],[]
179065,987840791000119102,Adenosine deaminase 2 deficiency (disorder),0,[],[]


In [42]:
ant_df.columns = ['ConceptId', 'FSN_concept', 'num_anc', 'AncestorIDs', 'AncestorFSN']
des_df.columns = ['ConceptId', 'FSN_concept', 'num_des', 'DescendantIDs', 'DescendantFSN']

In [36]:
import numpy as np

def compute_similar_names(name, l):
    sim = []
    for i in l:
        s = False
        c=0
        x = True if name in i else False
        if x: 
            s = True
        
        tk_name = name.lower().split()
        for k in tk_name:
            c += 1 if k in i else 0
        
        if c/len(tk_name)>=0.5:
            s = True
        
        sim.append(s)
    np.mean(sim)
    return sim, np.mean(sim)

compute_similar_names('aina tersol', ['aina', 'aina goes to schol'])

([True, True], 1.0)

In [44]:
ant_df[['sim', 'mean']] = ant_df.apply(lambda x: compute_similar_names(x['FSN_concept'], x['AncestorFSN']), axis=1)
ant_df

In [None]:
des_df[['sim', 'mean']] = des_df.apply(lambda x: compute_similar_names(x['FSN_concept'], x['DescendantFSN']), axis=1)
des_df