# Summary

This notebook contains steps to

*   Use Stanford CoreNLP to extract entities and relationships from the Anatomy book
*   Set up a Neo4j graph database locally and load the entities and relationships into it
*   Given a question, extract all the relationships for the entities in that question
*   Connect to OpenAI
*   Construct a prompt that includes the question as well as the extracted entity relationships








# Install the StanfordCoreNLP Dependencies

In [1]:
! pip install stanfordcorenlp
! pip install pycorenlp



# Start and Connect to the StanfordCoreNLP Server

In [2]:
#before executing this code make sure to start the server
# cd stanford-corenlp-full-2018-10-05
# java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000


import os
from pycorenlp import StanfordCoreNLP

# Create the client used by the extraction helpers to talk to the CoreNLP
# server at localhost:9000. This line does not launch the server; it must
# already be running (see the java command in the comments above).
nlp = StanfordCoreNLP('http://localhost:9000')

# Utility Functions to extract the entities and relationships

In [3]:
def extract_entities(text):
    """Return the named-entity mentions CoreNLP reports for ``text``.

    The text is sent through the module-level ``nlp`` client with the NER
    annotators enabled, and every item of each sentence's ``entitymentions``
    list is collected.

    Args:
        text: The text to annotate.

    Returns:
        A list of ``{'text': ..., 'label': ...}`` dicts, one per entity
        mention. The list is empty when there are no mentions or when the
        server reply cannot be read as the expected JSON document (a message
        is printed in that case).
    """
    import json

    props = {'annotators': 'tokenize,ssplit,pos,lemma,ner',
             'pipelineLanguage': 'en',
             'outputFormat': 'json'}
    result = nlp.annotate(text, properties=props)

    # NOTE(review): with outputFormat=json the client may hand back either the
    # raw response text or an already-decoded document, depending on the
    # pycorenlp/Python version - confirm. Only decode when we received text.
    if isinstance(result, (str, bytes, bytearray)):
        try:
            output = json.loads(result)
        except json.JSONDecodeError as e:
            # Typically a plain-text error message from the server.
            print(f"Error processing JSON output from Stanford CoreNLP: {e}")
            return []
    else:
        output = result

    if not isinstance(output, dict):
        print("Error processing JSON output from Stanford CoreNLP: "
              f"unexpected response type {type(output).__name__}")
        return []

    entities = []
    for sentence in output.get('sentences', []):
        # A sentence without recognised entities may lack the key entirely.
        for entity in sentence.get('entitymentions', []):
            entities.append({'text': entity['text'], 'label': entity['ner']})

    return entities


In [4]:
def extract_relationships(text):
    """Return the OpenIE (subject, relation, object) triples for ``text``.

    The text is sent through the module-level ``nlp`` client with the
    dependency-parse and OpenIE annotators enabled, and every item of each
    sentence's ``openie`` list is collected.

    Args:
        text: The text to annotate.

    Returns:
        A list of ``{'subject': ..., 'relation': ..., 'object': ...}`` dicts.
        Processing is best-effort: if the reply cannot be decoded or has an
        unexpected shape, a message is printed and whatever was collected so
        far (possibly an empty list) is returned.
    """
    import json

    props = {'annotators': 'tokenize,ssplit,pos,depparse,openie',
             'pipelineLanguage': 'en'}
    result = nlp.annotate(text, properties=props)

    relationships = []
    try:
        # Decode only when the client returned text; an already-decoded
        # document is used as-is.
        if isinstance(result, (str, bytes, bytearray)):
            output = json.loads(result)
        else:
            output = result
        for sentence in output['sentences']:
            for relation in sentence.get('openie', []):
                relationships.append({
                    'subject': relation['subject'],
                    'relation': relation['relation'],
                    'object': relation['object']
                })
    except (ValueError, TypeError, KeyError, AttributeError) as e:
        # json.JSONDecodeError is a ValueError. Catching named exception
        # types (instead of a bare except) lets KeyboardInterrupt and
        # unrelated failures propagate, and reports the actual cause.
        print(f"Error processing JSON output from Stanford CoreNLP: {e}")

    return relationships

In [7]:
# Smoke test: run both extraction helpers on one sample sentence and show
# what each returns.
text = "John Doe, was diagnosed with Type 2 diabetes mellitus (T2DM) and hypertension by Dr. Smith at St. John's Hospital"
entities = extract_entities(text)
relationships = extract_relationships(text)

# An empty entity list together with an "Error processing JSON output"
# message means the server reply for the NER request was not valid JSON.
print("Entities:", entities)
print("Relationships:", relationships)

Error processing JSON output from Stanford CoreNLP: Expecting value: line 1 column 1 (char 0)
Entities: []
Relationships: [{'subject': 'John Doe', 'relation': 'was', 'object': 'diagnosed'}, {'subject': 'John Doe', 'relation': 'was diagnosed with', 'object': 'Type 2 diabetes mellitus'}, {'subject': 'John Doe', 'relation': 'was', 'object': "diagnosed with Type 2 diabetes mellitus at St. John 's Hospital"}, {'subject': 'St. John', 'relation': 'at', 'object': 'Hospital'}, {'subject': 'John Doe', 'relation': 'was diagnosed with', 'object': 'T2DM'}, {'subject': 'John Doe', 'relation': 'was diagnosed at', 'object': "St. John 's Hospital"}]


# Install Neo4J

In [5]:
! pip install neo4j






# Neo4J Utility Methods

In [6]:
# Please make sure to download Neo4j and start it locally. The Uri and the password will be available once Neo4J is started

from neo4j import GraphDatabase

# Connect to Neo4j
# The connection settings can be overridden with the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables so credentials do not have to be edited
# into the notebook. The defaults are the original local development values.
import os

uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(
    uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "hello1234"),
    ),
)

# Create nodes and relationships in Neo4j
def create_graph(tx, entities, relationships):
    """Write relationship triples into the graph through transaction ``tx``.

    Every subject and object becomes an ``Entity`` node keyed by ``name``, and
    every triple becomes a ``RELATIONSHIP`` edge whose ``type`` property holds
    the relation text. ``MERGE`` is used throughout, so re-running with the
    same triples does not create duplicates.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        entities: Accepted for interface compatibility; not used here.
        relationships: Iterable of dicts with ``subject``, ``relation`` and
            ``object`` keys.
    """
    merge_node = "MERGE (e:Entity {name: $name})"
    link_nodes = """
        MATCH (e1:Entity {name: $source}), (e2:Entity {name: $target})
        MERGE (e1)-[:RELATIONSHIP {type: $relation}]->(e2)
        """

    # Pass 1: make sure both endpoints of every triple exist as nodes.
    for triple in relationships:
        for endpoint in ("subject", "object"):
            tx.run(merge_node, name=triple[endpoint])

    # Pass 2: connect the endpoints once all nodes are in place.
    for triple in relationships:
        tx.run(
            link_nodes,
            source=triple["subject"],
            target=triple["object"],
            relation=triple["relation"],
        )

# Populate the graph
def build_neo4j_graph(entities, relationships):
    """Store the given triples in Neo4j inside one managed write transaction.

    Opens a session on the module-level ``driver`` and hands ``create_graph``
    to the session's managed-transaction runner.

    Args:
        entities: Forwarded unchanged to ``create_graph``.
        relationships: Forwarded unchanged to ``create_graph``.
    """
    with driver.session() as session:
        # Newer driver releases name this ``execute_write`` and deprecate
        # ``write_transaction``; use whichever this session provides.
        run_write = getattr(session, "execute_write", None) or session.write_transaction
        run_write(create_graph, entities, relationships)

In [7]:
def print_neo4j_graph():
    """Print every node and relationship currently stored in the graph.

    Uses the module-level ``driver``; takes no arguments and returns nothing.
    Each node is printed with its id, labels and properties; each
    relationship with its id, type, the ids of its start and end nodes, and
    its properties. Any exception raised while reading or printing is caught
    and reported as ``Error: ...`` instead of being propagated.
    """
    try:
        with driver.session() as session:
            # Get all nodes
            result = session.run("MATCH (n) RETURN n")
            nodes = [record["n"] for record in result]

            # Get all relationships
            result = session.run("MATCH ()-[r]->() RETURN r")
            relationships = [record["r"] for record in result]

            # Print nodes
            print("Nodes:")
            for node in nodes:
                # ``items()`` yields the stored properties; ``get('properties')``
                # would look up a single property literally named "properties".
                print(f"  - {node.id}: {node.labels} - {dict(node.items())}")

            # Print relationships
            print("\nRelationships:")
            for relationship in relationships:
                print(f"  - {relationship.id}: {relationship.type} - ({relationship.start_node.id}) -> ({relationship.end_node.id}) - {dict(relationship.items())}")

    except Exception as e:
        print(f"Error: {e}")


In [8]:
def query_graph(entity_name):
    """Look up the outgoing relationships of the entity named ``entity_name``.

    Runs a single Cypher query on the module-level ``driver`` that matches
    edges leaving the ``Entity`` node with that name and ending at another
    ``Entity`` node.

    Args:
        entity_name: Value matched against the ``name`` property.

    Returns:
        A list of ``{"source": ..., "relation": ..., "target": ...}`` dicts,
        one per record returned by the query.
    """
    cypher = """
        MATCH (e1:Entity {name: $name})-[r]->(e2:Entity)
        RETURN e1.name AS source, r.type AS relation, e2.name AS target
        """
    wanted = ("source", "relation", "target")
    with driver.session() as db:
        rows = []
        # Records are read while the session is still open.
        for record in db.run(cypher, name=entity_name):
            rows.append({column: record[column] for column in wanted})
        return rows

In [None]:
def delete_graph():
  """Remove every node, together with its relationships, from the graph.

  Runs one ``DETACH DELETE`` statement over all nodes using the module-level
  ``driver``. Takes no arguments and returns nothing.
  """
  wipe_all = """
    MATCH (n) DETACH DELETE n
    """
  with driver.session() as db:
    db.run(wipe_all)

# Extract Entities and Relationships from the Anatomy Book and Load Them into the Neo4j Graph

In [None]:
! pip install datasets



In [None]:
import os

from datasets import load_dataset

# Location of the chunked Anatomy textbook (JSON lines; each entry's "content"
# field is read below). Set the ANATOMY_JSONL environment variable to point at
# your own copy; the default is the original author's local path.
# NOTE(review): the default is a raw string, so its doubled backslashes are
# literal - kept exactly as it was, confirm it resolves on your machine.
ANATOMY_JSONL = os.environ.get(
    "ANATOMY_JSONL",
    r"C:\\Users\\aliim\projects\\files\\AI_Healthcare\\textbooks\\chunk\\Anatomy_Gray.jsonl",
)

dataset = load_dataset("json", data_files=ANATOMY_JSONL, split='train')

# Extract the OpenIE triples of every chunk and merge them into the graph.
for index, entry in enumerate(dataset):
    relationships = extract_relationships(entry["content"])
    # Only the relationships are stored; no separate entity list is passed.
    build_neo4j_graph(None, relationships)
    print("Done processing entry " + str(index))

[{'subject': 'Anatomy', 'relation': 'includes', 'object': 'structures'}, {'subject': 'term anatomy', 'relation': 'mean', 'object': 'gross anatomy'}, {'subject': 'term anatomy', 'relation': 'used by', 'object': 'itself'}, {'subject': 'term anatomy', 'relation': 'mean', 'object': 'anatomy'}, {'subject': 'Microscopic anatomy', 'relation': 'is study of', 'object': 'cells'}, {'subject': 'anatomy', 'relation': 'also called', 'object': 'histology'}, {'subject': 'anatomy', 'relation': 'called', 'object': 'histology'}, {'subject': 'study', 'relation': 'using', 'object': 'microscope'}, {'subject': 'anatomy', 'relation': 'is study of', 'object': 'cells'}, {'subject': 'Microscopic anatomy', 'relation': 'is', 'object': 'study'}, {'subject': 'Microscopic anatomy', 'relation': 'called', 'object': 'histology'}, {'subject': 'Microscopic anatomy', 'relation': 'also called', 'object': 'histology'}, {'subject': 'anatomy', 'relation': 'is', 'object': 'study'}, {'subject': 'Anatomy', 'relation': 'leads phys

In [9]:
! pip install openai



# OpenAI Utility Methods

In [None]:
import openai

# Set your OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable so it never has
# to be pasted into (and saved with) the notebook. When the variable is unset
# the key stays empty, exactly as before.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def query_openai(prompt, model="gpt-4o", temperature=0.7, max_tokens=200):
    """
    Send a query to OpenAI's LLM and return the response.

    The prompt is sent as the single user message of a chat completion,
    preceded by a fixed "helpful assistant" system message.

    Parameters:
    - prompt (str): The query or instruction for the LLM.
    - model (str): The model to use (e.g., "gpt-4", "gpt-3.5-turbo").
    - temperature (float): Controls randomness (0.0 for deterministic, 1.0 for creative).
    - max_tokens (int): Maximum number of tokens to generate in the response.

    Returns:
    - str: The text of the first choice, or "Error: <details>" if the OpenAI
      client raised an error (the error is returned, not raised).
    """
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        return response.choices[0].message.content
    # The call above uses the 1.x client interface, where the base exception
    # is exposed as ``openai.OpenAIError``; the ``openai.error`` module only
    # exists in pre-1.0 releases and would raise AttributeError here.
    except openai.OpenAIError as e:
        return f"Error: {e}"

# Example: Ask Question, Extract Entity and Relationships, Ask OpenAI for response

In [None]:
# Example query
question = """
A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral

A. paralysis of the facial muscles.
B. paralysis of the facial muscles and loss of taste.
C. paralysis of the facial muscles, loss of taste and lacrimation.
D. paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.

"""

# Run both CoreNLP passes over the question text.
entities = extract_entities(question)
relationships = extract_relationships(question)

# Names to look up in the graph: every entity mention first, then the subject
# and object of every extracted triple (duplicates are kept).
entity_list = [mention["text"] for mention in entities]
for triple in relationships:
    entity_list.extend((triple["subject"], triple["object"]))

# One "source relation target" line per edge found for each name.
entityInfo = [
    edge["source"] + " " + edge["relation"] + " " + edge["target"] + "\n"
    for name in entity_list
    for edge in query_graph(name)
]

# The prompt is the question, the grounding instruction, then the graph facts.
instruction = "\nPlease use only the following information in the prompt to answer the question. Also please specify whether information from any other sources was used.\n"
prompt = question + instruction + "".join(entityInfo)
print(prompt)

response = query_openai(prompt)
print("Response:", response)


A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral

A. paralysis of the facial muscles.
B. paralysis of the facial muscles and loss of taste.
C. paralysis of the facial muscles, loss of taste and lacrimation.
D. paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.


Please use only the following information in the prompt to answer the question. Also please specify whether information from any other sources was used.
lesion is proximal
lesion is in brain
lesion was on side
lesion results in eye
lesion be along course
lesion is likely affected
lesion possibly caused by hypersensitivity response to release
lesion caused by hypersensitivity response to release
lesion may develop In minority
lesion possibly caused by hypersensitivity response to release in tissues
lesion caused by hypersensitivity response to release in tissues
lesion possibly caused by hypersensitivity response
lesion caused by hypersensitiv

In [11]:
import os

# Define the directory path
directory_path = "./anatomy_questions"
output_directory = "./anatomy_answers_stanford_nlp"
prompt_directory = "./anatomy_stanford_nlp_open_ai_prompts"

# Prompts are cut to this many characters to control the size of the request.
MAX_PROMPT_CHARS = 10000

# Create the destination folders up front; opening a file for writing inside
# a missing directory would otherwise fail on the first question.
for target_directory in (output_directory, prompt_directory):
    os.makedirs(target_directory, exist_ok=True)

# Loop through each file in the directory. os.listdir returns entries in
# arbitrary order, so sort them to process questions in a stable order.
for file_name in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, file_name)
    prompt_file_path = os.path.join(prompt_directory, file_name)
    output_file_path = os.path.join(output_directory, file_name)
    # Skip anything that is not a regular file
    if not os.path.isfile(file_path):
        continue

    # Read the question and close the file before the slow network calls.
    with open(file_path, 'r') as file:
        print("processing " + file_path)
        question = file.read()

    entities = extract_entities(question)
    relationships = extract_relationships(question)
    print("got entities")

    # Names to look up: entity mentions, then subject/object of each triple.
    entity_list = []
    for entity in entities:
        entity_list.append(entity["text"])
    for relationship in relationships:
        entity_list.append(relationship["subject"])
        entity_list.append(relationship["object"])

    # One "source relation target" line per edge found in the graph.
    entityInfo = []
    for entity in entity_list:
        result = query_graph(entity)
        for r in result:
            entityInfo.append(r["source"] + " " + r["relation"] + " " + r["target"] + "\n")
    print("got relationships from graph")

    prompt = question + "\nPlease use only the information below to answer the question. Please do not use information available from general knowledge to answer this question.\n" + "".join(entityInfo)
    # truncate the prompt
    prompt = prompt[:MAX_PROMPT_CHARS]
    with open(prompt_file_path, 'w') as prompt_file:
        prompt_file.write(prompt)

    # query_openai returns an "Error: ..." string on failure; that string is
    # what ends up in the answer file in that case.
    response = query_openai(prompt)
    with open(output_file_path, 'w') as output_file:
        output_file.write(response)

    print("Done processing " + file_path)


processing ./anatomy_questions\test_anatomy-question_000.txt
got entities
got relationships from graph
Done processing ./anatomy_questions\test_anatomy-question_000.txt
processing ./anatomy_questions\test_anatomy-question_001.txt
got entities
got relationships from graph
Done processing ./anatomy_questions\test_anatomy-question_001.txt
processing ./anatomy_questions\test_anatomy-question_002.txt
got entities
got relationships from graph
Done processing ./anatomy_questions\test_anatomy-question_002.txt
processing ./anatomy_questions\test_anatomy-question_003.txt
got entities
got relationships from graph
Done processing ./anatomy_questions\test_anatomy-question_003.txt
processing ./anatomy_questions\test_anatomy-question_004.txt
got entities
got relationships from graph
Done processing ./anatomy_questions\test_anatomy-question_004.txt
processing ./anatomy_questions\test_anatomy-question_005.txt
got entities
got relationships from graph
Done processing ./anatomy_questions\test_anatomy-que

In [14]:

# compare the answers
import pandas as pd
import numpy as np

# Show frames in full: no row/column/width limits and no wrapping.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
pd.set_option('display.expand_frame_repr', False)

# Answer key: the sixth column of the header-less test CSV.
source = pd.read_csv("./anatomy_test.csv", header=None)
answer_key = source.iloc[:, 5]

# Graph-RAG answers: first column is the question, second the chosen answer.
# Missing cells are turned into empty strings before comparing.
graph_rag_answers = pd.read_csv("./anatomy_answers_stanford_nlp/answers.csv")
graph_rag_answers = graph_rag_answers.replace(np.nan, '', regex=True)
graph_rag_answer_key = graph_rag_answers.iloc[:, 1]
question = graph_rag_answers.iloc[:, 0]

# Put both answer columns side by side (rows are matched on the index) and
# flag the rows where they are equal.
new_df = pd.DataFrame({
    'question': question,
    'correct_answer': answer_key,
    'graph_rag_answer': graph_rag_answer_key,
})
new_df['is_graph_rag_answer_correct'] = new_df['correct_answer'].eq(new_df['graph_rag_answer'])
new_df['graph_rag_answer_comment'] = graph_rag_answers['Comments']

# Number of questions answered correctly.
correct_count = new_df['is_graph_rag_answer_correct'].sum()
print(correct_count)
new_df.to_csv('./stanford_nlp_graph_answers_comparison_to_answer_key.csv', index=False)

53
