In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path
from typing import List

from langchain_community.document_loaders import TextLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI, OpenAIEmbeddings, AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from neo4j.exceptions import ClientError

import openai
from openai import OpenAI
from openai import AzureOpenAI

from time import sleep
#from string import Template
#import json
#from neo4j import GraphDatabase
#import glob
#from timeit import default_timer as timer
#from time import sleep
#import logging


In [2]:
# Load environment variables from a .env file.
# The location can be overridden through the DOTENV_PATH environment variable so the
# notebook is not tied to a single machine; the original absolute path is the fallback.
DOTENV_PATH = os.getenv(
    "DOTENV_PATH",
    r"C:\Users\lriospie\OneDrive - azureford\Documents\Maestria\Integrador\Neo4j_RAG\.env",
)
load_dotenv(DOTENV_PATH, verbose=True)


True

In [3]:
# Azure OpenAI settings read from the environment.
# NOTE(review): neither value is passed explicitly to the clients below; presumably the
# clients pick the same variables up from the environment themselves - confirm.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
# No trailing comma here: a stray comma after the call turns the key into a 1-tuple.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")

# Embeddings & LLM models
# `azure_deployment` is the custom name chosen for the deployment when the model was deployed.
embedding_dimension = 1536
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-3",
    api_version="2024-02-01",
    dimensions=embedding_dimension,
)

llm = AzureChatOpenAI(azure_deployment='chat_gtp_35', api_version="2023-05-15", temperature=0)

# Get Neo4j credentials from environment variables
# NOTE(review): Neo4jGraph() below is called without arguments, so these three values
# are not handed to it. Also the user name is read from NEO4J_USER - verify that this is
# the variable name actually used for the connection.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

graph = Neo4jGraph()
sleep(5)
# WARNING: destructive - deletes every node (and its relationships) in the database so
# that the ingestion cells below start from an empty graph.
graph.query("MATCH (n) DETACH DELETE n")

[]

In [4]:
# Read the source text file (decoded as UTF-8) into LangChain documents.
txt_path = "./News_canab.txt"
loader = TextLoader(str(txt_path), encoding="utf-8")
documents = loader.load()

# Token-based splitters for the two granularities used during ingestion:
# larger "parent" chunks (384 tokens) and smaller "child" chunks (75 tokens),
# both with a 24-token overlap.
child_splitter = TokenTextSplitter(chunk_overlap=24, chunk_size=75)
parent_splitter = TokenTextSplitter(chunk_overlap=24, chunk_size=384)

# Only the parent split happens here; children are derived per parent at ingestion time.
parent_documents = parent_splitter.split_documents(documents)

In [5]:
# Create both vector indexes once, before ingesting. The call does not depend on the
# loop variables below, so repeating it for every parent chunk only added round trips.
#   parent_document -> Child.embedding
#   typical_rag     -> Parent.embedding
for index_name, node_label in (("parent_document", "Child"), ("typical_rag", "Parent")):
    try:
        graph.query(
            f"CALL db.index.vector.createNodeIndex('{index_name}', "
            f"'{node_label}', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        # NOTE(review): ClientError is broader than "index already exists"; other client
        # errors are silenced here as well (kept as the original best-effort behaviour).
        pass

# Ingest Parent-Child node pairs: every parent chunk is split again into child chunks,
# and both levels are stored with their text and embedding.
for i, parent in enumerate(parent_documents):
    child_documents = child_splitter.split_documents([parent])
    params = {
        "parent_text": parent.page_content,
        "parent_id": i,
        "parent_embedding": embeddings.embed_query(parent.page_content),
        "children": [
            {
                "text": c.page_content,
                # Child ids are "<parent index>-<child index>".
                "id": f"{i}-{ic}",
                "embedding": embeddings.embed_query(c.page_content),
            }
            for ic, c in enumerate(child_documents)
        ],
    }
    # Ingest data
    sleep(5)  # pause between writes (presumably throttling - confirm it is still needed)
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    SET p.text = $parent_text
    WITH p
    CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
    YIELD node
    WITH p 
    UNWIND $children AS child
    MERGE (c:Child {id: child.id})
    SET c.text = child.text
    MERGE (c)<-[:HAS_CHILD]-(p)
    WITH c, child
    CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )


In [6]:
# Ingest hypothetical questions
class Questions(BaseModel):
    """Generating hypothetical questions about text."""

    # Required field (no default value): the generated questions, one string each.
    questions: List[str] = Field(
        ...,
        description="Generated hypothetical questions based on the information from the text",
    )


# Prompt for the hypothetical-question generator. The text chunk is injected through the
# single `{input}` template variable of the human message.
questions_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                "You are generating hypothetical questions based on the information "
                "found in the text and must reply in JSON format. Make sure to provide full context in the generated "
                "questions."
            ),
        ),
        (
            "human",
            (
                # The trailing space matters: the two literals are concatenated, and
                # without it the prompt read "...from thefollowing input".
                "Use the given format to generate hypothetical questions from the "
                "following input: {input}"
            ),
        ),
    ]
)

# Pass the model class itself as the output schema. `Questions.json` is the model's
# serialisation method, not a schema, so the LLM was never given the `questions` field
# definition. The structured result is converted back to a plain dict so that code
# expecting a mapping with a 'questions' key keeps working; a missing result becomes {}.
question_chain = (
    questions_prompt
    | llm.with_structured_output(Questions)
    | (lambda parsed: parsed.dict() if parsed is not None else {})
)

In [7]:
# from langchain_core.pydantic_v1 import BaseModel, Field
# class Joke(BaseModel):
#     setup: str = Field(description="The setup of the joke")
#     punchline: str = Field(description="The punchline to the joke")
       
# structured_llm = llm.with_structured_output(Joke.json)


In [8]:
# structured_llm.invoke(
#     "Tell me a joke about cats, respond in JSON with `setup` and `punchline` keys"
# )

In [9]:
def clean_questions_data(questions_data):
    """Normalise the output of the question-generation chain.

    Args:
        questions_data: Normally a dict with a ``'questions'`` key. ``None``, objects
            exposing a ``questions`` attribute, and a single question given directly
            (instead of a list) are accepted as well.

    Returns:
        A list of dicts, each carrying the question under the ``'question'`` key.
        The list is empty when no questions are available. Entries that are neither
        a string nor a dict with a ``'question'`` key are reported with ``print`` and
        skipped.
    """
    # Key access first (dict-style results); fall back to attribute access for
    # model-style results. None and unrelated objects end up with raw_questions = None.
    try:
        raw_questions = questions_data['questions']
    except (KeyError, IndexError, TypeError):
        raw_questions = getattr(questions_data, 'questions', None)

    if raw_questions is None:
        return []

    # A lone question (string or dict) must not be iterated character by character
    # or key by key, so wrap anything that is not already a list/tuple.
    if not isinstance(raw_questions, (list, tuple)):
        raw_questions = [raw_questions]

    cleaned_questions = []
    for entry in raw_questions:
        if isinstance(entry, dict) and 'question' in entry:
            cleaned_questions.append(entry)
        elif isinstance(entry, str):
            cleaned_questions.append({'question': entry})
        else:
            # Handle cases where the entry is not a string or doesn't contain the 'question' key
            print(f"Invalid entry found and skipped: {entry}")

    return cleaned_questions

# Create the vector index over Question.embedding once; the call does not depend on the
# loop variables, so it does not need to be repeated for every parent chunk.
try:
    graph.query(
        "CALL db.index.vector.createNodeIndex('hypothetical_questions', "
        "'Question', 'embedding', $dimension, 'cosine')",
        {"dimension": embedding_dimension},
    )
except ClientError:  # already exists
    pass

# For every parent chunk: generate hypothetical questions, embed each one, and attach
# them to the parent node through HAS_QUESTION relationships.
for i, parent in enumerate(parent_documents):
    questions_data = question_chain.invoke(parent.page_content)
    cleaned_questions_list = clean_questions_data(questions_data)
    if not cleaned_questions_list:
        # No usable questions for this chunk - nothing to write.
        continue

    params = {
        "parent_id": i,
        "questions": [],
    }

    # Every cleaned entry is a dict with a 'question' key (see clean_questions_data).
    for iq, q in enumerate(cleaned_questions_list):
        question_text = q['question']
        try:
            embedding = embeddings.embed_query(question_text)
        except Exception as e:
            # Best effort: one failed embedding must not abort the whole ingestion.
            print(f"Error embedding question {q}: {e}")
            continue
        # Question ids are "<parent index>-<question index>".
        params["questions"].append({"text": question_text, "id": f"{i}-{iq}", "embedding": embedding})

    if not params["questions"]:
        continue

    sleep(5)  # pause between writes (presumably throttling - confirm it is still needed)
    # MERGE (not CREATE) on the Question node keeps re-runs from duplicating questions,
    # matching how Parent and Child nodes are written.
    graph.query(
        """
        MERGE (p:Parent {id: $parent_id})
        WITH p
        UNWIND $questions AS question
        MERGE (q:Question {id: question.id})
        SET q.text = question.text
        MERGE (q)<-[:HAS_QUESTION]-(p)
        WITH q, question
        CALL db.create.setVectorProperty(q, 'embedding', question.embedding)
        YIELD node
        RETURN count(*)
        """,
        params,
    )



In [10]:
# Ingest summaries

# Prompt for the summariser. The text to summarise is injected through the `{question}`
# template variable of the human message.
summary_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                "You are generating concise and accurate summaries based on the "
                # Trailing space added: the adjacent literals are concatenated, and the
                # prompt previously read "...any informationrelated with...".
                "information found in the text. Do not miss in the summaries any information "
                "related with numbers such as dates, costs, rates, percentages or any sort of numerical values "
            ),
        ),
        (
            "human",
            ("Generate a summary of the following input: {question}\n" "Summary:"),
        ),
    ]
)

# Prompt piped straight into the chat model; the result is the model's message object.
summary_chain = summary_prompt | llm

In [11]:
# Create the vector index over Summary.embedding once; the call does not depend on the
# loop variables, so it does not need to be repeated for every parent chunk.
try:
    graph.query(
        "CALL db.index.vector.createNodeIndex('summary', "
        "'Summary', 'embedding', $dimension, 'cosine')",
        {"dimension": embedding_dimension},
    )
except ClientError:  # already exists
    pass

# For every parent chunk: generate a summary, embed it, and attach it to the parent
# node through a HAS_SUMMARY relationship.
for i, parent in enumerate(parent_documents):
    summary = summary_chain.invoke({"question": parent.page_content}).content
    params = {
        "parent_id": i,
        "summary": summary,
        "embedding": embeddings.embed_query(summary),
    }

    sleep(5)  # pause between writes (presumably throttling - confirm it is still needed)
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    MERGE (p)-[:HAS_SUMMARY]->(s:Summary)
    SET s.text = $summary
    WITH s
    CALL db.create.setVectorProperty(s, 'embedding', $embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )


In [12]:
from langchain_community.vectorstores import Neo4jVector


def _store_for_index(index_name, **options):
    """Open a Neo4jVector on an existing index, using the notebook's embedding model."""
    return Neo4jVector.from_existing_index(embeddings, index_name=index_name, **options)


# Typical RAG retriever: searches the index built over the Parent nodes.
typical_rag = _store_for_index("typical_rag")

# Parent retriever: searches the index built over the Child nodes and returns the
# text of the parent the best-matching children belong to.
parent_query = """
MATCH (node)<-[:HAS_CHILD]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata LIMIT 1
"""
parent_vectorstore = _store_for_index("parent_document", retrieval_query=parent_query)

# Hypothetic questions retriever: matches on generated questions, returns parent text.
hypothetic_question_query = """
MATCH (node)<-[:HAS_QUESTION]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata
"""
hypothetic_question_vectorstore = _store_for_index(
    "hypothetical_questions", retrieval_query=hypothetic_question_query
)

# Summary retriever: matches on generated summaries, returns parent text.
summary_query = """
MATCH (node)<-[:HAS_SUMMARY]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata
"""
summary_vectorstore = _store_for_index("summary", retrieval_query=summary_query)

In [13]:
# Query the hypothetical-question store and show the text of the top hit.
doj_question = "What is The Department of Justice recommendation on marijuana"
response = hypothetic_question_vectorstore.similarity_search(doj_question)
top_hit = response[0]
print(top_hit.page_content)

The Biden administration is poised to make a landmark change to the federal government's position on marijuana with a proposed plan that would no longer consider marijuana among the most dangerous and addictive substances. 

In what would be the biggest change in marijuana policy the federal government has taken since pot was first outlawed, the Drug Enforcement Administration will take public comments on a plan to recategorize marijuana under the Controlled Substances Act, according to a source familiar with the process. The news was first reported by The Associated Press.

The Department of Justice will send its recommendation to reclassify marijuana from a Schedule I drug to a Schedule III drug to the White House Office of Management and Budget, according to the source, who was not authorized to speak publicly. The Justice Department is expected to transmit the recommendation today, the source said.

The plan wouldn't legalize marijuana at the federal level outright, but it would re

In [14]:
from langchain.chains import RetrievalQA

# NOTE: `AzureChatOpenAI` is deliberately NOT re-imported from `langchain.chat_models`
# here. That import rebinds the name and replaces the `langchain_openai` class imported
# in the first cell, so every later `AzureChatOpenAI(...)` call would use the
# `langchain.chat_models` class instead (apparently the source of the `warn_deprecated`
# output shown by later cells).

graph.refresh_schema()

# Question answering over the plain Parent-embedding index ("typical RAG"):
# the retrieved documents are stuffed into a single prompt for `llm`.
vector_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=typical_rag.as_retriever()
)

vector_qa.invoke(
     "Which other drugs appear along with marijuana in the Schedule I"
)

{'query': 'Which other drugs appear along with marijuana in the Schedule I',
 'result': 'Marijuana has been listed under the Controlled Substances Act as a Schedule I drug, alongside heroin, LSD, and ecstasy.'}

In [15]:
graph.refresh_schema()

# Reuse the `vector_qa` chain built in the previous cell instead of constructing an
# identical copy (same llm, chain type and retriever) a second time.
# Alternative question tried earlier: "What Wexler previously told to USA TODAY"
vector_qa.invoke(
    "What experts  exposed to USA TODAY on marijuana"
)

{'query': 'What experts  exposed to USA TODAY on marijuana',
 'result': "Experts exposed to USA TODAY that marijuana’s placement on Schedule I was not based on credible scientific evidence of its perils, but once it was listed, researchers and advocates faced a heavy burden trying to prove it shouldn’t face such stiff restrictions. They also said that rescheduling is a step forward, but it is not nearly enough, and there's no reason to keep cannabis in the Controlled Substances Act."}

In [16]:
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()

# Text-to-Cypher question answering over the graph. Two separate models are configured:
# `cypher_llm` for the query-generation role and `qa_llm` for the answer role.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    verbose=True,
    cypher_llm=AzureChatOpenAI(
        azure_deployment='Chat_gpt_4',
        api_version="2023-05-15",
        temperature=0,
    ),
    qa_llm=AzureChatOpenAI(
        azure_deployment='chat_gtp_35',
        api_version="2023-05-15",
        temperature=0,
    ),
)

  warn_deprecated(


In [17]:
# Refresh the graph schema, then send an aggregation-style question through the
# Cypher chain. The call is the last expression so its result is shown as cell output.
graph.refresh_schema()
schedule_iii_question = "How many times Schedule III is mentioned within the text"
cypher_chain.invoke(schedule_iii_question)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Parent)-[:HAS_CHILD|:HAS_QUESTION|:HAS_SUMMARY]->(n)
WHERE n.text CONTAINS 'Schedule III'
RETURN COUNT(n) AS ScheduleIIIMentions
[0m
Full Context:
[32;1m[1;3m[{'ScheduleIIIMentions': 15}][0m

[1m> Finished chain.[0m


{'query': 'How many times Schedule III is mentioned within the text',
 'result': 'Schedule III is mentioned 15 times within the text.'}

In [18]:
from langchain.agents import AgentType, Tool, initialize_agent

# Open a fresh Neo4j connection and load its schema.
# NOTE(review): this rebinds `graph`; objects created earlier with the previous
# `graph` instance (e.g. `cypher_chain`) presumably keep using the old one - confirm
# whether the reconnect is needed.
graph = Neo4jGraph()
graph.refresh_schema()

In [19]:
# Tools offered to the agent. The descriptions are what the agent reads to decide which
# tool to call, so their wording is part of the behaviour:
#   - "Tasks": vector-based QA chain, for questions without aggregation.
#   - "Graph": Cypher QA chain, for questions that ask for an aggregation.
tools = [
    Tool(
        name="Tasks",
        func=vector_qa.run,
        description="""Useful to reply to most of the queries as long as any sort of aggregation is not requested.
        Use full question as input.
        """,
    ),
    Tool(
        name="Graph",
        func=cypher_chain.run,
        # Typo fixed ("Usfeful" -> "Useful") in the text the agent uses for tool selection.
        description="""Useful when any sort of aggregation is requested in the query.
        Use full question as input.
        """,
    ),
]

# OpenAI-functions style agent that routes each question to one of the tools above.
mrkl = initialize_agent(
    tools, 
    AzureChatOpenAI(azure_deployment='Chat_gpt_4',api_version="2023-05-15", temperature=0),
    agent=AgentType.OPENAI_FUNCTIONS, verbose=True
)


  warn_deprecated(


In [20]:
# Aggregation-style question routed through the agent; the full result dict is printed.
mention_count_question = "How many times Schedule III is mentioned within the text"
response = mrkl.invoke(mention_count_question)
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Tasks` with `How many times Schedule III is mentioned within the text`


[0m[36;1m[1;3mSchedule III is mentioned 7 times within the text.[0m[32;1m[1;3mSchedule III is mentioned 7 times within the text.[0m

[1m> Finished chain.[0m
{'input': 'How many times Schedule III is mentioned within the text', 'output': 'Schedule III is mentioned 7 times within the text.'}


In [21]:
# Plain look-up question routed through the agent; the full result dict is printed.
iq_loss_question = "Who might be affected with IQ loss due to marijuana consumption"
response = mrkl.invoke(iq_loss_question)
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Tasks` with `Who might be affected with IQ loss due to marijuana consumption`


[0m[36;1m[1;3mAccording to the National Institute on Drug Abuse, people who begin using marijuana at a young age may experience permanent IQ loss.[0m[32;1m[1;3mPeople who begin using marijuana at a young age may be affected with IQ loss, potentially experiencing permanent reductions in their IQ.[0m

[1m> Finished chain.[0m
{'input': 'Who might be affected with IQ loss due to marijuana consumption', 'output': 'People who begin using marijuana at a young age may be affected with IQ loss, potentially experiencing permanent reductions in their IQ.'}
