# Embedding Generation and Prediction

## Generate Embeddings for All Possible Combinations

In [1]:
import pandas as pd
import numpy as np

In [2]:
#set variables according to docker instance
#each value can be overridden through an environment variable, so real
#credentials do not have to be written into the notebook
import os

uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

In [3]:
#connect with driver
from neo4j import GraphDatabase

#The driver must stay open for the rest of the notebook: every later cell calls
#driver.execute_query(...). Creating it in a `with` block would close it as soon
#as the block ends, leaving those cells with a closed driver.
#Call driver.close() once the notebook is finished.
driver = GraphDatabase.driver(uri, auth=(username, password))
driver.verify_connectivity()

In [4]:
#setup GDS client with the same connection settings as the driver
# NOTE(review): `U` from `re` is not used in any visible cell - looks like an
# accidental auto-import; confirm and remove.
from re import U
from graphdatascience import GraphDataScience

# NOTE(review): the visible cells issue all GDS procedure calls as Cypher through
# `driver`; `gds` itself is not referenced again in this part of the notebook.
gds = GraphDataScience(uri, auth=(username,password))

In [5]:
def clearGraphs():
    """Drop every graph currently listed in the GDS graph catalog.

    Projected graphs are held in memory and only a limited number fit, so
    the catalog is emptied between batches of projections.
    """
    drop_all_projections = """
        CALL gds.graph.list()
        YIELD graphName
        WITH collect(graphName) AS graphs
        UNWIND graphs AS graph
        CALL gds.graph.drop(graph) YIELD graphName
        RETURN graphName
        """
    driver.execute_query(drop_all_projections)

In [25]:
import itertools

# Variations (low / normal / high) available for each field
fields_variations = {
    "pregnancies": [
        "low pregnancies",
        "normal pregnancies",
        "high pregnancies",
    ],
    "glucose": [
        "low glucose",
        "normal glucose",
        "high glucose",
    ],
    "blood_pressure": [
        "low blood pressure",
        "normal blood pressure",
        "high blood pressure",
    ],
    "skin_thickness": [
        "low skin thickness",
        "normal skin thickness",
        "high skin thickness",
    ],
    "insulin": [
        "low insulin",
        "normal insulin",
        "high insulin",
    ],
    "bmi": [
        "low bmi",
        "normal bmi",
        "high bmi",
    ],
    "diabetes_pedigree_function": [
        "low diabetes pedigree function",
        "normal diabetes pedigree function",
        "high diabetes pedigree function",
    ],
    "age": [
        "low age",
        "normal age",
        "high age",
    ],
}

# Maps each variation to its relationship type. All variations of a field share
# one type, which is "HAS_" followed by the upper-cased field key.
relationships = {
    variation: f"HAS_{field.upper()}"
    for field, variations in fields_variations.items()
    for variation in variations
}

In [26]:
def generate_combinations(fields_variations, combination_size):
    """Return every way to pick one variation from `combination_size` distinct fields.

    Args:
        fields_variations: mapping of field name -> list of its variations.
        combination_size: how many different fields each result draws from.

    Returns:
        A list of lists. Field subsets are visited in the mapping's key order
        (itertools.combinations); within one subset the variations are expanded
        as a cartesian product, with the last field varying fastest.
    """
    return [
        list(picked)
        for chosen_fields in itertools.combinations(fields_variations, combination_size)
        for picked in itertools.product(
            *(fields_variations[name] for name in chosen_fields)
        )
    ]

In [27]:
# example usage: every 3-field combination (56 field triples x 27 variation choices = 1512 lists)
generate_combinations(fields_variations, 3)

[['low pregnancies', 'low glucose', 'low blood pressure'],
 ['low pregnancies', 'low glucose', 'normal blood pressure'],
 ['low pregnancies', 'low glucose', 'high blood pressure'],
 ['low pregnancies', 'normal glucose', 'low blood pressure'],
 ['low pregnancies', 'normal glucose', 'normal blood pressure'],
 ['low pregnancies', 'normal glucose', 'high blood pressure'],
 ['low pregnancies', 'high glucose', 'low blood pressure'],
 ['low pregnancies', 'high glucose', 'normal blood pressure'],
 ['low pregnancies', 'high glucose', 'high blood pressure'],
 ['normal pregnancies', 'low glucose', 'low blood pressure'],
 ['normal pregnancies', 'low glucose', 'normal blood pressure'],
 ['normal pregnancies', 'low glucose', 'high blood pressure'],
 ['normal pregnancies', 'normal glucose', 'low blood pressure'],
 ['normal pregnancies', 'normal glucose', 'normal blood pressure'],
 ['normal pregnancies', 'normal glucose', 'high blood pressure'],
 ['normal pregnancies', 'high glucose', 'low blood press

In [None]:
#creates the mimic sample with a specified SampleNumber of -1 (if it does not exist yet)
#MERGE instead of CREATE keeps this cell idempotent: re-running it must not add a
#second mimic node, because the later queries match on SampleNumber = -1 and
#would then act on every such node.
driver.execute_query(
    """
    MERGE (mimic:Sample {SampleNumber: -1})
    """
)

In [None]:
#look up the internal node id of the mimic sample
mimic_lookup = driver.execute_query(
    """
    Match (m:Sample) where m.SampleNumber=-1
    return id(m)
    """
)
#first element of the result -> first record -> first (only) returned column
nodeId = mimic_lookup[0][0][0]

In [30]:
def clearrels():
    """Delete every outgoing relationship of the mimic sample (SampleNumber = -1)."""
    delete_outgoing = """
        match (n:Sample)-[r]->(m) where n.SampleNumber=-1 delete r
        """
    driver.execute_query(delete_outgoing)

In [31]:
def createrels(relationship, node):
    """Link the mimic sample to the medical_concept node named `node`.

    Args:
        relationship: relationship type to merge. It is formatted directly
            into the query text, so only pass trusted values.
        node: value matched against the `name` property of medical_concept
            nodes; sent as the `$nodename` query parameter.
    """
    cypher = f"""
    MATCH (n:Sample {{SampleNumber: -1}}), (m:medical_concept {{name: $nodename}})
    MERGE (n)-[r:{relationship}]->(m)
    """
    driver.execute_query(cypher, nodename=node)


In [32]:
#projects graph with passed in graphName
def createGraph(name):
    """Project a graph into the GDS catalog under the name `name`.

    Calls gds.graph.project with four node-projection entries (Sample,
    medical_concept, definition, synonyms) and eleven relationship types:
    the eight HAS_* types plus Definition, Synonym and embedding_match_node.
    Every relationship type is projected with UNDIRECTED orientation.

    Args:
        name: catalog name of the projection; sent as the `$graphName` parameter.
    """
    driver.execute_query(
        """
        CALL gds.graph.project(
            $graphName,
            {
                Sample: '*',
                medical_concept: '*',
                definition: '*',
                synonyms: '*'
            },
            {
                HAS_PREGNANCIES: {
                orientation: 'UNDIRECTED'  
                }, 
                HAS_GLUCOSE: {
                orientation: 'UNDIRECTED'  
                },
                HAS_BLOOD_PRESSURE: {
                orientation: 'UNDIRECTED'  
                },
                HAS_SKIN_THICKNESS: {
                orientation: 'UNDIRECTED'  
                },
                HAS_INSULIN: {
                orientation: 'UNDIRECTED'  
                }, 
                HAS_BMI: {
                orientation: 'UNDIRECTED'  
                },
                HAS_DIABETES_PEDIGREE_FUNCTION: {
                orientation: 'UNDIRECTED'  
                },
                HAS_AGE: {
                orientation: 'UNDIRECTED'  
                }, 
                Definition : {
                orientation: 'UNDIRECTED'  
                },
                Synonym: {
                orientation: 'UNDIRECTED'  
                },
                embedding_match_node: {
                orientation: 'UNDIRECTED'  
                }
            })
        """,
        graphName=name
    )


In [33]:
def getembedding(gname, embedding_dimension=128):
    """Return the FastRP embedding of the mimic sample in projected graph `gname`.

    Args:
        gname: name of an existing projection in the GDS graph catalog.
        embedding_dimension: length of the embedding vector; defaults to 128,
            the value that was previously hard-coded.

    Returns:
        The `embedding` value of the single streamed row whose nodeId equals
        the module-level `nodeId` of the mimic sample.

    Raises:
        IndexError: if the stream contains no row for the mimic node
            (for example, the node is not part of the projection).
    """
    results = driver.execute_query(
        """
        CALL gds.fastRP.stream($graphName, {
            embeddingDimension: $embeddingDimension,
            randomSeed:42
        })
        YIELD nodeId, embedding
        WHERE nodeId = $nodeId
        return nodeId, embedding;
        """,
        embeddingDimension=embedding_dimension, graphName=gname, nodeId=nodeId
    )

    records = results[0]
    if not records:
        # Same exception type as the bare index failure, but with an explanation.
        raise IndexError(
            f"no FastRP embedding returned for node id {nodeId} in graph {gname!r}"
        )
    # Columns are (nodeId, embedding); only the embedding is needed.
    return records[0][1]

In [34]:
def getembeddingformimic(clist):
    """Compute the mimic sample's embedding for every combination in `clist`.

    For each combination the mimic's relationships are reset, one relationship
    per listed variation is created, the graph is projected and the mimic's
    embedding is read back. Projection names cycle through pima0..pima9, and
    all projections are dropped at the start of every block of ten.

    Args:
        clist: list of combinations, each a list of variation names that are
            keys of the module-level `relationships` mapping.

    Returns:
        dict mapping str(combination) -> embedding.
    """
    embeddings_by_combination = {}
    for index, combination in enumerate(clist):
        slot = index % 10
        if slot == 0:
            # free memory before reusing the ten projection names
            clearGraphs()

        clearrels()
        for concept in combination:
            createrels(relationships[concept], concept)

        graph_name = f"pima{slot}"
        createGraph(graph_name)
        embeddings_by_combination[str(combination)] = getembedding(graph_name)

    return embeddings_by_combination
           

In [None]:
#main - applies combinations to the graph and appends the embeddings to a CSV file
import csv

#one pass per combination size, from a single field up to all fields
for i in range(len(fields_variations)):
    combinations = generate_combinations(fields_variations, i+1)
    embeddings = getembeddingformimic(combinations)
    with open("data/finalresults.csv", 'a', newline='') as file:
        writer = csv.writer(file)
        #append mode starts at the end of the file, so position 0 means the file
        #is new or empty: write the header once so the columns can later be
        #read by name ('relationships', 'embedding')
        if file.tell() == 0:
            writer.writerow(["relationships", "embedding"])
        #one row per combination: its string form and its embedding
        writer.writerows(embeddings.items())
    clearGraphs()

In [None]:
#cleanup: drop any graph projections still in the catalog
clearGraphs()

In [None]:
#cleanup: remove the relationships left on the mimic sample by the last combination
clearrels()

## Applying Embeddings to Models

In [8]:
#load combinations and their embeddings
#later cells read the 'embedding' column by name, so the file needs a header row
embdf = pd.read_csv("data/finalresults.csv")
embdf.head()

Unnamed: 0,relationships,embedding
0,['low pregnancies'],"[0.3559499979019165, 0.017154032364487648, -0...."
1,['normal pregnancies'],"[0.21517100930213928, 0.09962455183267593, 0.1..."
2,['high pregnancies'],"[0.1451621651649475, 0.012034144252538681, -0...."
3,['low glucose'],"[0.07380708307027817, 0.07721707224845886, 0.0..."
4,['normal glucose'],"[0.4024839997291565, -0.0163387693464756, -0.0..."


In [15]:
import ast
import joblib
#the CSV stores each embedding as the text of a Python list; parse it back into a real list
embdf['embedding'] = embdf['embedding'].apply(ast.literal_eval) #convert embeddings to lists (instead of strings)

In [16]:
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept
# because the prediction cells below refer to it - consider renaming everywhere.
input = np.array(embdf['embedding'].tolist()) #aggregate embedding column

### Random Forest

In [13]:
#load best rf model
# NOTE: joblib.load unpickles the file, which can run arbitrary code - only load model files you trust
with open('final_models/rfemb.pkl', 'rb') as file:
    model = joblib.load(file)

In [None]:
#predict with the rf model for every combination embedding
pred = model.predict(input)

In [18]:
#save rf prediction to embdf (one value per combination row)
embdf['score_rf'] = pred

### XGBoost

In [19]:
#load best xgb model
# NOTE: joblib.load unpickles the file, which can run arbitrary code - only load model files you trust
with open('final_models/xgbemb.pkl', 'rb') as file:
    model = joblib.load(file)

In [20]:
#predict with the xgb model for every combination embedding
pred = model.predict(input)

In [21]:
#save xgb prediction to embdf (one value per combination row)
embdf['score_xgb'] = pred

### SVM

In [22]:
#load best svm model
# NOTE: joblib.load unpickles the file, which can run arbitrary code - only load model files you trust
with open('final_models/svmemb.pkl', 'rb') as file:
    model = joblib.load(file)

In [None]:
#predict with the svm model for every combination embedding
pred = model.predict(input)

In [24]:
#save svm prediction to embdf (one value per combination row)
embdf['score_svm'] = pred

### Naive Bayes

In [25]:
#load best nb model
# NOTE: joblib.load unpickles the file, which can run arbitrary code - only load model files you trust
with open('final_models/nbemb.pkl', 'rb') as file:
    model = joblib.load(file)

In [None]:
#predict with the nb model for every combination embedding
pred = model.predict(input)

In [27]:
#save nb prediction to embdf (one value per combination row)
embdf['score_nb'] = pred

### Neural Networks

In [28]:
from tensorflow.keras.models import load_model

#load best nn model (saved in the .keras format, so it is loaded with Keras rather than joblib)
model = load_model('final_models/nnemb.keras')

In [29]:
#predict with the nn model; unlike the other models this yields float scores,
#which are turned into 0/1 labels in a later cell
pred = model.predict(input)

[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 772us/step


In [30]:
#save nn prediction (float score) to embdf
# NOTE(review): presumably `pred` has shape (n, 1) here - confirm the column assignment keeps one float per row
embdf['score_nn'] = pred

In [31]:
#view the first rows of embdf with all score columns added
embdf.head()

Unnamed: 0,relationships,embedding,score_rf,score_xgb,score_svm,score_nb,score_nn
0,['low pregnancies'],"[0.3559499979019165, 0.017154032364487648, -0....",0,0,0,0,0.170097
1,['normal pregnancies'],"[0.21517100930213928, 0.09962455183267593, 0.1...",0,0,0,0,0.298908
2,['high pregnancies'],"[0.1451621651649475, 0.012034144252538681, -0....",0,0,0,0,0.517344
3,['low glucose'],"[0.07380708307027817, 0.07721707224845886, 0.0...",0,0,0,0,0.121242
4,['normal glucose'],"[0.4024839997291565, -0.0163387693464756, -0.0...",0,0,0,0,0.048572


In [32]:
#create binary column for nn: 1 when the score is at least 0.5, otherwise 0
embdf['score_nn_binary'] = embdf['score_nn'].ge(.5).astype('int64')

In [33]:
#save embdf with all prediction columns
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed column by
# default; pass index=False if downstream readers should not get it - confirm first.
embdf.to_csv("data/predictedscores.csv")