1. Load CSV Data: Loads the CSV data into a pandas DataFrame.
2. Create Disease Nodes: Creates nodes for each disease and sets properties.
3. Create Hierarchical Relationships: Splits the ParentIDs field and creates SUB_CATEGORY_OF relationships between diseases.
4. Extract Disease names: Extracts disease names from Neo4j.
5. Generate Embeddings: Uses a local model to generate embeddings for the names (for now).
6. Update Neo4j: Stores the generated embeddings back into Neo4j.

In [13]:
import pandas as pd
from neo4j import GraphDatabase
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [21]:
# Load the embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [14]:
df = pd.read_csv("../data/raw/CTD_diseases.csv", sep=',')

In [15]:
df.head()

Unnamed: 0,DiseaseName,DiseaseID,AltDiseaseIDs,Definition,ParentIDs,TreeNumbers,ParentTreeNumbers,Synonyms,SlimMappings
0,10p Deletion Syndrome (Partial),MESH:C538288,,,MESH:D002872|MESH:D025063,C16.131.260/C538288|C16.320.180/C538288|C23.55...,C16.131.260|C16.320.180|C23.550.210.050.500.500,"Chromosome 10, 10p- Partial|Chromosome 10, mon...",Congenital abnormality|Genetic disease (inborn...
1,13q deletion syndrome,MESH:C535484,,,MESH:D002872|MESH:D025063,C16.131.260/C535484|C16.320.180/C535484|C23.55...,C16.131.260|C16.320.180|C23.550.210.050.500.500,Chromosome 13q deletion|Chromosome 13q deletio...,Congenital abnormality|Genetic disease (inborn...
2,15q24 Microdeletion,MESH:C579849,DO:DOID:0060395,,MESH:D002872|MESH:D008607|MESH:D025063,C10.597.606.360/C579849|C16.131.260/C579849|C1...,C10.597.606.360|C16.131.260|C16.320.180|C23.55...,15q24 Deletion|15q24 Microdeletion Syndrome|In...,Congenital abnormality|Genetic disease (inborn...
3,16p11.2 Deletion Syndrome,MESH:C579850,,,MESH:D001321|MESH:D002872|MESH:D008607|MESH:D0...,C10.597.606.360/C579850|C16.131.260/C579850|C1...,C10.597.606.360|C16.131.260|C16.320.180|C23.55...,,Congenital abnormality|Genetic disease (inborn...
4,"17,20-Lyase Deficiency, Isolated",MESH:C567076,,,MESH:D000312,C12.050.351.875.253.090.500/C567076|C12.200.70...,C12.050.351.875.253.090.500|C12.200.706.316.09...,"17-Alpha-Hydroxylase-17,20-Lyase Deficiency, C...",Congenital abnormality|Endocrine system diseas...


In [57]:
# Connect to Neo4j
uri = "neo4j://localhost:7999/neo4j"
username = "neo4j"
password = "password"  # replace with your password

driver = GraphDatabase.driver(uri, auth=(username, password))


In [None]:
# Function to create disease nodes
def create_disease_nodes(tx, disease):
    tx.run("""
        MERGE (d:Disease {DiseaseID: $DiseaseID})
        SET d.DiseaseName = $DiseaseName, d.AltDiseaseIDs = $AltDiseaseIDs,
            d.Definition = $Definition, d.TreeNumbers = $TreeNumbers,
            d.ParentTreeNumbers = $ParentTreeNumbers, d.Synonyms = $Synonyms,
            d.SlimMappings = $SlimMappings
    """, 
    DiseaseID=disease['DiseaseID'],
    DiseaseName=disease['DiseaseName'],
    AltDiseaseIDs=disease['AltDiseaseIDs'],
    Definition=disease['Definition'],
    TreeNumbers=disease['TreeNumbers'],
    ParentTreeNumbers=disease['ParentTreeNumbers'],
    Synonyms=disease['Synonyms'],
    SlimMappings=disease['SlimMappings'])

In [19]:
# Function to create hierarchical relationships
def create_hierarchy(tx, disease):
    if pd.notna(disease['ParentIDs']):
        parent_ids = disease['ParentIDs'].split('|')
        for parent_id in parent_ids:
            tx.run("""
                MATCH (d:Disease {DiseaseID: $DiseaseID})
                MATCH (p:Disease {DiseaseID: $ParentID})
                MERGE (d)-[:SUB_CATEGORY_OF]->(p)
            """, DiseaseID=disease['DiseaseID'], ParentID=parent_id)

In [31]:
# Function to get disease descriptions
def get_disease_descriptions(tx):
    result = tx.run("""
        MATCH (d:Disease) 
        RETURN d.DiseaseID AS DiseaseID, d.DiseaseName AS DiseaseName, d.Definition AS Definition
    """)
    return result.data()

In [30]:
# Function to update disease embeddings
def update_disease_embeddings(tx, disease_id, embedding):
    tx.run("""
        MATCH (d:Disease {DiseaseID: $DiseaseID}) 
        SET d.DiseaseEmbedding = $embedding
    """, DiseaseID=disease_id, embedding=embedding)

In [17]:
with driver.session() as session:
    for _, row in df.iterrows():
        session.write_transaction(create_disease_nodes, row)

In [20]:
# Create hierarchical relationships
with driver.session() as session:
    for _, row in df.iterrows():
        session.write_transaction(create_hierarchy, row)

In [32]:
with driver.session() as session:
    disease_descriptions = session.read_transaction(get_disease_descriptions)

In [35]:
# Generate embeddings for the disease data
embeddings = []
for record in disease_descriptions:
    disease_id = record['DiseaseID']
    disease_name = record['DiseaseName']
    
    name_embedding = embed_model.get_text_embedding(disease_name)
    
    embeddings.append((disease_id, name_embedding))

In [36]:
with driver.session() as session:
    for disease_id, embedding in embeddings:
        session.write_transaction(update_disease_embeddings, disease_id, embedding)

In [None]:
# Close the driver connection
driver.close()

In [37]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="c516db64-8506-47f8-ba37-3d1e1e389d81")
pc.create_index(
    name="quickstart",
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [38]:
# pc.init(api_key="c516db64-8506-47f8-ba37-3d1e1e389d81", environment="us-east-1")
index = pc.Index("quickstart")

In [39]:
# Function to get disease embeddings
def get_disease_embeddings(tx):
    result = tx.run("""
        MATCH (d:Disease)
        RETURN d.DiseaseID AS DiseaseID, d.DiseaseEmbedding AS NameEmbedding
    """)
    return result.data()

# Retrieve embeddings from Neo4j
with driver.session() as session:
    disease_embeddings = session.read_transaction(get_disease_embeddings)

In [40]:
# Prepare and upload to Pinecone
vectors = []
for record in disease_embeddings:
    disease_id = record['DiseaseID']
    name_embedding = record['NameEmbedding']
    
    if name_embedding:
        vectors.append({
            'id': f"{disease_id}-name",
            'values': name_embedding
        })

index.upsert(vectors=vectors)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 28 Jun 2024 13:58:22 GMT', 'Content-Type': 'application/json', 'Content-Length': '119', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '51421', 'x-pinecone-request-id': '7515736688678031117', 'x-envoy-upstream-service-time': '1', 'server': 'envoy'})
HTTP response body: {"code":11,"message":"Error, message length too large: found 25824908 bytes, the limit is: 4194304 bytes","details":[]}


In [58]:
def find_similar_diseases(query_embedding):
    with driver.session() as session:
        query = """
        MATCH (d:Disease)
        WHERE d.DiseaseEmbedding IS NOT NULL
        WITH d, gds.similarity.cosine(d.DiseaseEmbedding, $query_embedding) AS similarity
        RETURN d.Disease AS name, similarity
        ORDER BY similarity DESC
        LIMIT 5
        """
        result = session.run(query, query_embedding=query_embedding)
        return [record["name"] for record in result]

In [59]:
query_embedding = embed_model.get_text_embedding("breast carcinoma")
similar_diseases = find_similar_diseases(query_embedding)
print(similar_diseases)

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'gds.similarity.cosine' (line 4, column 17 (offset: 88))
"        WITH d, gds.similarity.cosine(d.DiseaseEmbedding, $query_embedding) AS similarity"
                 ^}