In [1]:
### Neo4j
from neo4j import GraphDatabase
from neo4j import  Driver
### Langchain
from langchain_neo4j import Neo4jGraph
from langchain_neo4j import Neo4jVector
from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader, CSVLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
### ChatModels (https://python.langchain.com/docs/integrations/chat/)
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
### Embeddings
from langchain_ollama import OllamaEmbeddings
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_experimental.graph_transformers import LLMGraphTransformer

from yfiles_jupyter_graphs import GraphWidget
from pydantic import BaseModel, Field

import os
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
os.getcwd()

'z:\\Git_PhD\\GraphRAG'

## Data Loader

In [6]:
loader = CSVLoader(file_path="data/customers-100.csv",   csv_args={
        "fieldnames": ["Customer Id", "First Name", "Last Name", "Company", "City", "Country"],
    },)
# loader = CSVLoader(file_path="data/customers-100.csv")
loader1 = TextLoader(file_path="data/sparql.txt")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=24)
documents = text_splitter.split_documents(documents=docs)

In [10]:
docs

[Document(metadata={'source': 'data/customers-100.csv', 'row': 0}, page_content='Customer Id: Index\nFirst Name: Customer Id\nLast Name: First Name\nCompany: Last Name\nCity: Company\nCountry: City\nNone: Country,Phone 1,Phone 2,Email,Subscription Date,Website'),
 Document(metadata={'source': 'data/customers-100.csv', 'row': 1}, page_content='Customer Id: 1\nFirst Name: DD37Cf93aecA6Dc\nLast Name: Sheryl\nCompany: Baxter\nCity: Rasmussen Group\nCountry: East Leonard\nNone: Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/'),
 Document(metadata={'source': 'data/customers-100.csv', 'row': 2}, page_content='Customer Id: 2\nFirst Name: 1Ef7b82A4CAAD10\nLast Name: Preston\nCompany: Lozano\nCity: Vega-Gentry\nCountry: East Jimmychester\nNone: Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/'),
 Document(metadata={'source': 'data/customers-100.csv', 'row': 3}, page_content='Customer Id: 3\nFirst Name: 6F9487

In [5]:
docs[77]

Document(metadata={'source': 'data/customers-100.csv', 'row': 77}, page_content='Index: 77\nCustomer Id: BF6a1f9bd1bf8DE\nFirst Name: Brittany\nLast Name: Zuniga\nCompany: Mason-Hester\nCity: West Reginald\nCountry: Kyrgyz Republic\nPhone 1: (050)136-9025\nPhone 2: 001-480-851-2496x0157\nEmail: mchandler@cochran-huerta.org\nSubscription Date: 2021-07-24\nWebsite: http://www.boyle.com/')

In [64]:
docs[77].page_content

'Index: 78\nCustomer Id: FfaeFFbbbf280db\nFirst Name: Cassidy\nLast Name: Mcmahon\nCompany: Mcguire, Huynh and Hopkins\nCity: Lake Sherryborough\nCountry: Myanmar\nPhone 1: 5040771311\nPhone 2: 684-682-0021x1326\nEmail: katrinalane@fitzgerald.com\nSubscription Date: 2020-10-21\nWebsite: https://hurst.com/'

In [11]:
documents

[Document(metadata={'source': 'data/customers-100.csv', 'row': 0}, page_content='Customer Id: Index\nFirst Name: Customer Id\nLast Name: First Name\nCompany: Last Name\nCity: Company\nCountry: City\nNone: Country,Phone 1,Phone 2,Email,Subscription Date,Website'),
 Document(metadata={'source': 'data/customers-100.csv', 'row': 1}, page_content='Customer Id: 1\nFirst Name: DD37Cf93aecA6Dc\nLast Name: Sheryl\nCompany: Baxter\nCity: Rasmussen Group\nCountry: East Leonard\nNone: Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/'),
 Document(metadata={'source': 'data/customers-100.csv', 'row': 2}, page_content='Customer Id: 2\nFirst Name: 1Ef7b82A4CAAD10\nLast Name: Preston\nCompany: Lozano\nCity: Vega-Gentry\nCountry: East Jimmychester\nNone: Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/'),
 Document(metadata={'source': 'data/customers-100.csv', 'row': 3}, page_content='Customer Id: 3\nFirst Name: 6F9487

In [6]:
llm_type = os.getenv("LLM_TYPE", "ollama")
llm_type 

'ollama'

In [10]:
if llm_type == "ollama":
    llm = ChatOllama(model="llama3.2", temperature=0)
    print("Ollama LLM is available.")
elif llm_type == "openai":
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    print("OpenAI.")
else:
    llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0)
    print("Claude Shannon here")

llm_transformer = LLMGraphTransformer(llm=llm)
graph_documents = llm_transformer.convert_to_graph_documents(documents)

Ollama LLM is available.


: 

: 

In [38]:
graph_documents[3]

GraphDocument(nodes=[], relationships=[], source=Document(metadata={'source': 'data/sparql.txt'}, page_content='PREFIX :\t<https://github.com/aghoshpro/OntoRaster/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX geo:\t <http://www.opengis.net/ont/geosparql#>\nPREFIX geof: <http://www.opengis.net/def/function/geosparql/>'))

In [37]:
len(graph_documents)

9

In [39]:
graph = Neo4jGraph()
graph

<langchain_neo4j.graphs.neo4j_graph.Neo4jGraph at 0x197d019ebf0>

In [40]:
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [41]:
def displayGraph():
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))

    session = driver.session()
    widget = GraphWidget(graph=session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t").graph())
    widget.node_label_mapping = 'id'
    return widget

# displayGraph()

In [42]:
displayGraph()

GraphWidget(layout=Layout(height='500px', width='100%'))

In [None]:
# import neo4jupyter
# neo4jupyter.init_notebook_mode()

<IPython.core.display.Javascript object>

In [47]:
# neo4jupyter.draw(graph,{"Nodes_type": "Att"})

### Embedding Model

- https://medium.com/timescale/finding-the-best-open-source-embedding-model-for-rag-929d1656d331

In [48]:
EMBEDDING_MODELS = [
    {'name':'mxbai-embed-large', 'dimensions': 1024},
    {'name':'nomic-embed-text','dimensions': 768},
    {'name':'bge-m3','dimensions': 1024},
]
 
for model in EMBEDDING_MODELS:
    print(model['name'], model['dimensions'])

mxbai-embed-large 1024
nomic-embed-text 768
bge-m3 1024


In [49]:
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
vector_retriever = vector_index.as_retriever()

In [50]:
vector_retriever

VectorStoreRetriever(tags=['Neo4jVector', 'OllamaEmbeddings'], vectorstore=<langchain_neo4j.vectorstores.neo4j_vector.Neo4jVector object at 0x00000197D00CD240>, search_kwargs={})

In [51]:
driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))

def create_fulltext_index(tx):
    query = '''
    CREATE FULLTEXT INDEX `fulltext_entity_id` 
    FOR (n:__Entity__) 
    ON EACH [n.id];
    '''
    tx.run(query)

# Function to execute the query
def create_index():
    with driver.session() as session:
        session.execute_write(create_fulltext_index)
        print("Fulltext is indexed successfully.")

# Call the function to create the index
try:
    create_index()
except:
    pass

# Close the driver connection
driver.close()

In [52]:
class Entities(BaseModel):
    """Identifying information about entities."""

    names: list[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)


entity_chain = llm.with_structured_output(Entities)

In [53]:
entity_chain

RunnableBinding(bound=ChatOllama(model='llama3.2', temperature=0.0), kwargs={'tools': [{'type': 'function', 'function': {'name': 'Entities', 'description': 'Identifying information about entities.', 'parameters': {'properties': {'names': {'description': 'All the person, organization, or business entities that appear in the text', 'items': {'type': 'string'}, 'type': 'array'}}, 'required': ['names'], 'type': 'object'}}}], 'structured_output_format': {'kwargs': {'method': 'function_calling'}, 'schema': {'type': 'function', 'function': {'name': 'Entities', 'description': 'Identifying information about entities.', 'parameters': {'properties': {'names': {'description': 'All the person, organization, or business entities that appear in the text', 'items': {'type': 'string'}, 'type': 'array'}}, 'required': ['names'], 'type': 'object'}}}}}, config={}, config_factories=[])
| PydanticToolsParser(first_tool_only=True, tools=[<class '__main__.Entities'>])

In [42]:
entity_chain.invoke("Who are Nonna Lucia and Giovanni Caruso?")

Entities(names=['Nonna Lucia', 'Giovanni Caruso'])

In [54]:
def generate_full_text_query(input: str) -> str:
    words = [el for el in remove_lucene_chars(input).split() if el]
    if not words:
        return ""
    full_text_query = " AND ".join([f"{word}~2" for word in words])
    print(f"Generated Query: {full_text_query}")
    return full_text_query.strip()


# Fulltext index query
def graph_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""
    entities = entity_chain.invoke(question)
    for entity in entities.names:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('fulltext_entity_id', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": entity},
        )
        result += "\n".join([el['output'] for el in response])
    return result

In [26]:
print(graph_retriever("Who is Bruce	Esparza?"))






In [23]:
graph_retriever("Who is Giovanni?")

''

In [55]:
def full_retriever(question: str):
    graph_data = graph_retriever(question)
    vector_data = [el.page_content for el in vector_retriever.invoke(question)]
    final_data = f"""Graph data:
{graph_data}
vector data:
{"#Document ". join(vector_data)}
    """
    return final_data

## Q&A

In [56]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
        {
            "context": full_retriever,
            "question": RunnablePassthrough(),
        }
    | prompt
    | llm
    | StrOutputParser()
)

In [57]:
chain.invoke(input="What is this?")

'This appears to be a query written in SPARQL, a query language for RDF data.'

In [58]:
chain.invoke(input="Write a SPARQL code to find all districts of Bolzano where elevation is between 510-520 meters?")

'To find all districts of Bolzano with an elevation between 510-520 meters, you can use the following SPARQL query:\n\n```sparql\nSELECT ?distName ?elevation \nWHERE {\n  ?region rdfs:label ?distName .\n  ?region geo:asWKT ?distWkt .\n  FILTER (CONTAINS(?distWkt, \'Bolzano\') AND CONTAINS(?distWkt, \'510-520\'))\n}\n```\n\nThis query filters the districts where the elevation is between 510-520 meters and contains the label "Bolzano".'

In [None]:
chain.invoke(input="Who company is Bruce Esparza from? and is he has an email ?")



"I don't have enough information to determine who Bruce Esparza is or if he has an email. The provided graph and vector data are empty, which suggests that there may not be any relevant information about Bruce Esparza in the context."

In [None]:
chain.invoke(input="Who company is Bruce Esparza from? and is he has an email ?")

'Bruce Esparza is from Huerta-Mclean. Yes, he has an email address: preese@frye-vega.com.'

In [50]:
chain.invoke(input="Who works for the company Simon LLC ?")

'Two people work for Simon LLC: Clifford Jacobson and Chloe Hutchinson.'

In [26]:
chain.invoke(input="Who is Nonna Lucia? Did she teach anyone about restaurants or cooking?")

"Nonna Lucia was Amico's grandmother, a culinary sage who taught her grandchildren the art of Sicilian cooking. She also played a role in shaping Lucia's culinary talent and influence. There is no mention of Nonna Lucia teaching anyone about restaurants."

In [27]:
chain.invoke(input="What is La Dolce Vita?")

'La Dolce Vita was a restaurant owned by Antonio Caruso, located in Rome, known for its live classical music and blend of traditional and modern cuisine.'

In [35]:
chain.invoke(input="What is LA Tera Di Siena?")

'LA Terra di Siena appears to be the name of a restaurant, specifically owned by Sofia Caruso.'

In [None]:
chain.invoke(input="Who is Sofia Caruso?")

"Sofia Caruso is Antonio's wife and the co-owner of a trattoria with her husband Pietro. She is also a baker."

In [37]:
chain.invoke(input="Describe the relationship of the Sofia, Pietro, Antonio and Geovanni?")

"The relationships between Sofia, Pietro, Antonio, and Giovanni are as follows:\n\n- Sofia is Pietro's wife and a baker.\n- Pietro is Antonio's eldest son and a skilled fisherman.\n- Antonio is the father of both Pietro and Giovanni, and he hosts workshops in Rome to share his innovative techniques.\n\nThere is no direct information about the relationship between Sofia and Giovanni."