In [37]:
from dotenv import load_dotenv
import os

import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI

import openai

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load connection settings and API keys from the local .env file.
# override=True: values found in .env replace variables already set in the process.
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# Use the default database name unless NEO4J_DATABASE is set to a non-empty value.
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail fast with a readable message instead of an opaque connection/auth error
# in a later cell when a required setting is missing or empty.
_missing_settings = [
    name
    for name, value in (
        ('NEO4J_URI', NEO4J_URI),
        ('NEO4J_USERNAME', NEO4J_USERNAME),
        ('NEO4J_PASSWORD', NEO4J_PASSWORD),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required settings (set them in .env or the environment): "
        + ", ".join(_missing_settings)
    )

In [3]:
# Connect to the Neo4j knowledge graph with the settings read from the environment.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [4]:
# Re-read the graph schema, then print it wrapped to 60 columns for readability.
kg.refresh_schema()
wrapped_schema = textwrap.fill(kg.schema, width=60)
print(wrapped_schema)

Node properties are the following: Chunk {title: STRING,
textEmbedding: LIST, chunkId: STRING, source: STRING,
topics: LIST, chunkSeqId: INTEGER, text: STRING},Title
{title: STRING, source: STRING, topics: LIST, summary:
STRING},Topic {name: STRING} Relationship properties are the
following:  The relationships are the following: (:Chunk)-
[:NEXT]->(:Chunk),(:Chunk)-[:PART_OF]->(:Title),(:Title)-
[:RELATED_TO]->(:Topic),(:Title)-[:START]->(:Chunk)


## Example Queries

### Cypher Queries

In [7]:
# Cypher: every Title node that is RELATED_TO the Topic named 'data'
# (the whole node, i.e. all of its properties, is returned).
titles_for_data_topic = """
MATCH (t:Title)-[:RELATED_TO]->(top:Topic)
WHERE top.name = 'data'
RETURN t
"""
kg.query(titles_for_data_topic)

[{'t': {'summary': 'In common usage, data (US: ; UK: ) is a collection of discrete or continuous values that convey information, describing the quantity, quality, fact, statistics, other basic units of meaning, or simply sequences of symbols that may be further interpreted formally.  A datum is an individual value in a collection of data. Data is usually organized into structures such as tables that provide additional context and meaning, and which may themselves be used as data in larger structures. Data may be used as variables in a computational process. Data may represent abstract ideas or concrete measurements.\nData is commonly used in scientific research, economics, and in virtually every other form of human organizational activity.  Examples of data sets include price indices (such as consumer price index), unemployment rates, literacy rates, and census data. In this context, data represents the raw facts and figures from which useful information can be extracted. \nData is col

In [8]:
# Cypher: same match as the previous example, but return only the summary property.
summaries_for_data_topic = """
MATCH (t:Title)-[:RELATED_TO]->(top:Topic)
WHERE top.name = 'data'
RETURN t.summary
"""
kg.query(summaries_for_data_topic)

[{'t.summary': 'In common usage, data (US: ; UK: ) is a collection of discrete or continuous values that convey information, describing the quantity, quality, fact, statistics, other basic units of meaning, or simply sequences of symbols that may be further interpreted formally.  A datum is an individual value in a collection of data. Data is usually organized into structures such as tables that provide additional context and meaning, and which may themselves be used as data in larger structures. Data may be used as variables in a computational process. Data may represent abstract ideas or concrete measurements.\nData is commonly used in scientific research, economics, and in virtually every other form of human organizational activity.  Examples of data sets include price indices (such as consumer price index), unemployment rates, literacy rates, and census data. In this context, data represents the raw facts and figures from which useful information can be extracted. \nData is collect

### Vector Embeddings

In [9]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: String to embed; newlines are replaced by spaces before the request.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    single_line_text = " ".join(text.split("\n"))
    response = openai.embeddings.create(input=[single_line_text], model=model)
    first_item = response.data[0]
    return first_item.embedding


In [18]:
def neo4j_vector_search(question, top_k=10, index_name='chunks_embedding_vector'):
    """Return the chunks most similar to ``question`` using the Neo4j vector index.

    Args:
        question: Natural-language question; it is embedded with ``get_embedding``.
        top_k: Number of nearest nodes to request from the index (default 10).
        index_name: Name of the Neo4j vector index to query.

    Returns:
        The result of ``kg.query`` for the search; each row has the keys
        ``score`` and ``text``.
    """
    question_embedding = get_embedding(question)

    # All values are passed as query parameters rather than formatted into the string.
    vector_search_query = """
        CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) yield node, score
        RETURN score, node.text AS text
        """
    return kg.query(
        vector_search_query,
        params={
            'question_embedding': question_embedding,
            'index_name': index_name,
            'top_k': top_k,
        },
    )

In [19]:
# Try the plain vector search on a sample question; `res` holds the returned rows.
que = "what should I do prior to machine learning modeling?"
res = neo4j_vector_search(que)

In [20]:
# Display the search rows (each row has a 'score' and the chunk 'text').
res

[{'score': 0.7362073659896851,
  'text': "Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Recently, artificial neural networks have been able to surpass many previous approaches in performance.Machine learning approaches have been applied to many fields including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine. ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.\nThe mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis (

In [24]:
def neo4j_vector_search_with_nebrs(question, top_k=1, index_name='chunks_embedding_vector'):
    """Vector-search chunks similar to ``question`` and include their neighbour chunks.

    For every hit from the vector index, the chunks linked by ``NEXT``
    relationships immediately before and after it are looked up as extra context.
    Only hits that have a ``PART_OF`` relationship to a ``Title`` are returned,
    because that MATCH is not optional.

    Args:
        question: Natural-language question; it is embedded with ``get_embedding``.
        top_k: Number of nearest nodes to request from the index (default 1).
        index_name: Name of the Neo4j vector index to query.

    Returns:
        The result of ``kg.query``; each row has the keys ``score``, ``text``,
        ``prevText`` and ``nextText``. The neighbour matches are OPTIONAL, so
        ``prevText`` / ``nextText`` are null when no such chunk exists.
    """
    question_embedding = get_embedding(question)

    vector_search_query = """
        CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) YIELD node, score
        MATCH (node)-[:PART_OF]->(title:Title)
        OPTIONAL MATCH (node)-[:NEXT]->(nextChunk:Chunk)
        OPTIONAL MATCH (prevChunk:Chunk)-[:NEXT]->(node)
        RETURN score, 
               node.text AS text, 
               prevChunk.text AS prevText, 
               nextChunk.text AS nextText
        """
    return kg.query(
        vector_search_query,
        params={
            'question_embedding': question_embedding,
            'index_name': index_name,
            'top_k': top_k,
        },
    )

In [25]:
# Same sample question, this time also fetching the previous/next chunk of each hit.
que = "what should I do prior to machine learning modeling?"
res = neo4j_vector_search_with_nebrs(que)

In [26]:
# Display the rows (keys: 'score', 'text', 'prevText', 'nextText').
res

[{'score': 0.7362073659896851,
  'text': "Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Recently, artificial neural networks have been able to surpass many previous approaches in performance.Machine learning approaches have been applied to many fields including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine. ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.\nThe mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis (

## KG RAG Chain

In [83]:
# Prompt for the Cypher-generation step. Literal braces in the schema are doubled
# ({{ }}) because the template is formatted with {question} as its only variable.
# The $index_name / $top_k / $question_embedding placeholders are Cypher query
# parameters, not template variables; they must reach the database unchanged.
CYPHER_GENERATION_TEMPLATE = """Task: Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
Node properties:
- Chunk {{title: STRING, textEmbedding: LIST, chunkId: STRING, source: STRING, topics: LIST, chunkSeqId: INTEGER, text: STRING}}
- Title {{title: STRING, source: STRING, topics: LIST, summary: STRING}}
- Topic {{name: STRING}}
Relationships:
- (:Chunk)-[:NEXT]->(:Chunk)
- (:Chunk)-[:PART_OF]->(:Title)
- (:Title)-[:RELATED_TO]->(:Topic)
- (:Title)-[:START]->(:Chunk)

Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Do not wrap the Cypher statement in markdown code fences.
All topic names are in snake case, for example machine_learning or data_science.
Only query Topic or Title nodes when the question explicitly asks about topics or documents.
For general information questions use the vector index search shown in the examples below.
Keep the parameters $index_name, $top_k and $question_embedding exactly as written; their values are supplied when the query is executed.
Examples: Here are a few examples of generated Cypher statements for particular questions:

# What are documents available for machine learning?
MATCH (t:Title)-[:RELATED_TO]->(top:Topic) WHERE top.name = 'machine_learning'
RETURN t.title

# Retrieve the summaries of all titles related to a given topic data, including the source of each title?
MATCH (title:Title)-[:RELATED_TO]->(t:Topic)
WHERE t.name = 'data'
RETURN title.title AS Title, title.summary AS Summary, title.source AS Source

# Tell me the intro of all docs about topic machine learning?
MATCH (t:Topic)<-[:RELATED_TO]-(title:Title)-[:START]->(chunk:Chunk)
WHERE t.name = 'machine_learning'
RETURN title.title AS Title, chunk.text AS FirstChunkText

# tell me in detail about different machine learning algorithms?
CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) YIELD node, score
        MATCH (node)-[:PART_OF]->(title:Title)
        OPTIONAL MATCH (node)-[:NEXT]->(nextChunk:Chunk)
        OPTIONAL MATCH (prevChunk:Chunk)-[:NEXT]->(node)
        RETURN score, 
               node.text AS text, 
               prevChunk.text AS prevText, 
               nextChunk.text AS nextText

# what are some related topics I have in my knowledge graph about political ideologies and science and machine learning?
CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) YIELD node
MATCH (node)-[:PART_OF]->(:Title)-[:RELATED_TO]->(relatedTopic:Topic)
RETURN DISTINCT relatedTopic.name AS RelatedTopicNames

Make absolute sure that generated cypher queries are correct. The question is:
{question}"""

In [84]:
# Prompt for the answer step; it has two placeholders, {context} and {question}.
ANS_TEMPLATE = (
    " Based on the below context found from cypher query, provide answer to the question.\n"
    "\n"
    "{context}\n"
    "\n"
    "if you do not find answer from above context say you do not know and do not generate answer.\n"
    "The Question is:\n"
    "{question}"
)

In [129]:
def _strip_code_fence(llm_output):
    """Return ``llm_output`` without surrounding whitespace or a markdown code fence."""
    text = llm_output.strip()
    if text.startswith("```"):
        text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()
        # Drop the optional language tag that follows the opening fence.
        if text[:6].lower() == "cypher":
            text = text[6:].strip()
    return text


def kg_query(question, top_k=3, index_name='chunks_embedding_vector'):
    """Answer ``question`` from the knowledge graph in two LLM steps.

    1. An LLM turns the question into a Cypher statement
       (``CYPHER_GENERATION_TEMPLATE``), which is executed with ``kg.query``.
    2. A second LLM call answers the question from the stringified query
       result (``ANS_TEMPLATE``).

    Args:
        question: Natural-language question.
        top_k: Value bound to ``$top_k`` when the generated query uses the
            vector-search parameters (default 3).
        index_name: Value bound to ``$index_name`` in that case.

    Returns:
        The answer text produced by the second LLM call.
    """
    # Step 1: generate the Cypher statement.
    CYPHER_GENERATION_PROMPT = PromptTemplate(
        input_variables=["question"],
        template=CYPHER_GENERATION_TEMPLATE
        )

    chain = LLMChain(llm=ChatOpenAI(temperature=0), prompt=CYPHER_GENERATION_PROMPT)
    cypher_response = chain.run({"question": question})
    # Chat models sometimes wrap code in ```cypher ... ``` fences despite the
    # instructions; a fenced statement would not be valid Cypher.
    cypher_query = _strip_code_fence(cypher_response).replace("\n", " ")

    print(f"Generated Cypher Query: {cypher_query}")

    # NOTE(review): the generated Cypher is executed as-is. Consider connecting
    # with read-only credentials so a generated write query cannot modify the graph.
    vector_placeholders = ("$index_name", "$top_k", "$question_embedding")
    if any(placeholder in cypher_query for placeholder in vector_placeholders):
        # Bind all three parameters whenever any of them is referenced, so a
        # query that uses only a subset still finds the ones it needs.
        question_embedding = get_embedding(question)
        kg_res = kg.query(cypher_query, params={
                              'question_embedding': question_embedding,
                              'index_name': index_name,
                              'top_k': top_k})
    else:
        # No vector parameters referenced: skip the embedding request.
        kg_res = kg.query(cypher_query)

    # Step 2: answer the question from the query result.
    ANS_PROMPT = PromptTemplate(
        input_variables=["context", "question"],
        template=ANS_TEMPLATE
        )

    ans_chain = LLMChain(llm=ChatOpenAI(temperature=0), prompt=ANS_PROMPT)
    answer = ans_chain.run({"question": question, "context": str(kg_res)})
    return answer

### Testing

In [134]:
# End-to-end check with a topic-style question; kg_query prints the generated Cypher.
que = "what are all docs about data?"
op = kg_query(que)

Generated Cypher Query: MATCH (t:Title)-[:RELATED_TO]->(top:Topic) WHERE top.name = 'data' RETURN t.title


In [135]:
# Show the final answer text returned by kg_query.
print(op)

The passed context is a list containing a dictionary with the key 't.title' and value 'Data'. This suggests that there is a document or documents related to data.


In [136]:
# Run the same Cypher that kg_query generated above directly, to see the raw rows.
kg.query("MATCH (t:Title)-[:RELATED_TO]->(top:Topic) WHERE top.name = 'data' RETURN t.title")

[{'t.title': 'Data'}]

In [137]:
# End-to-end check with a general-information question.
# NOTE(review): in the recorded output the generated Cypher used a CONTAINS text
# filter rather than the vector index, and the final answer was not useful.
que = "tell me in detail about different machine learning algorithms?"
op = kg_query(que)

Generated Cypher Query: MATCH (algo:Chunk)-[:PART_OF]->(title:Title)-[:RELATED_TO]->(topic:Topic {name: 'machine_learning'}) WHERE algo.text CONTAINS 'algorithm' RETURN title.title AS Title, algo.text AS AlgorithmDescription, algo.source AS Source, algo.topics AS Topics


In [138]:
# Show the final answer text returned by kg_query.
print(op)

I'm sorry, but I cannot provide a detailed explanation of different machine learning algorithms in this format. However, I can recommend resources or provide a brief overview if you would like. Let me know how I can assist you further.
