<a href="https://colab.research.google.com/github/aswinaus/graphrag/blob/main/Graph_RAG_GraphDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install pyvis IPython cchardet datasets langchain==0.1.17 neo4j openai tiktoken langchain-community langchain-experimental json-repair


In [8]:
from getpass import getpass
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [None]:
!pip install numpy==1.25.2
!pip install pandas==2.0.3
!pip install datasets==2.14.5

In [12]:
from datasets import load_dataset
import pandas as pd
# Load the tax-statistics dataset by its Hugging Face dataset id.
# download_mode="force_redownload" asks for a fresh download instead of reusing a cached copy.
dataset = load_dataset("aswinaus/tax_statistics_dataset_by_income_range", download_mode="force_redownload")
# Build a pandas DataFrame from the 'train' split; later cells read it as `df`.
df=pd.DataFrame(dataset['train'])

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

ValueError: Invalid pattern: '**' can only be an entire path component

In [None]:
df.head(10)

Unnamed: 0,STATEFIPS,STATE,zipcode,Size of adjusted gross income,No of returns,No of single returns,No of joint returns,No of head of household returns,Number of electronically filed returns,Number of computer prepared paper returns,...,Number of returns with net investment income tax,Net investment income tax amount,Number of returns with tax due at time of filing,Tax due at time of filing amount,Number of returns with total overpayments,Total overpayments amount,Number of returns with overpayments refunded,Overpayments refunded amount,Number of returns with credit to next years estimated tax,Credited to next years estimated tax amount
0,1,AL,0,"$1 under $25,000",778210,491030,84770,189600,712890,30670,...,0,0,62720,51936,671860,1700965,669570,1694792,1980,3512
1,1,AL,0,"$25,000 under $50,000",525940,247140,123910,139860,481760,18960,...,0,0,85860,122569,438020,1274802,435210,1266557,3670,7410
2,1,AL,0,"$50,000 under $75,000",285700,105140,128140,44560,260570,10670,...,0,0,73980,154932,212040,575315,208470,564202,5020,13653
3,1,AL,0,"$75,000 under $100,000",179070,38820,123110,13740,164300,5020,...,0,0,51330,139065,126850,401581,123310,388749,3040,10377
4,1,AL,0,"$100,000 under $200,000",257010,28180,216740,7150,236850,8400,...,90,141,104290,460071,152790,598248,144640,539385,9180,56257
5,1,AL,0,"$200,000 or more",74810,4540,66580,530,69330,1760,...,39430,141243,40030,946610,32060,681565,22420,257633,9450,372747
6,2,AK,0,"$1 under $25,000",102820,81530,7850,10850,89210,6360,...,0,0,17720,8643,75910,133915,75430,132166,370,670
7,2,AK,0,"$25,000 under $50,000",79910,48680,14900,13510,72490,3260,...,0,0,11540,19564,68020,185024,67520,182576,740,1890
8,2,AK,0,"$50,000 under $75,000",51890,25460,17880,6840,46570,2160,...,0,0,11330,26920,40310,118691,39720,115674,830,2922
9,2,AK,0,"$75,000 under $100,000",36350,12880,19390,3190,32620,1630,...,0,0,9690,29647,26340,90982,25670,87439,680,2366


Imagine each state as a circle (a node) that you want to connect to another circle representing the number of tax returns filed in that state. The following code draws that connection (an edge) between the 'STATE' node and the 'No of returns' node, and labels the connection with 'Size of adjusted gross income' to indicate the relationship between them.

Scenario: If the current row in the dataset has 'STATE' as 'CA' and 'No of returns' as 1000, the code creates an edge in the graph connecting a node labeled 'CA' to a node labeled '1000', and the edge is labeled with 'Size of adjusted gross income'.

Nodes: In a graph, nodes (sometimes called vertices) are the fundamental entities. In this code, nodes represent things like U.S. states, or categories like "Number of Returns" or "Size of adjusted gross income."

Edges: Edges are the connections between nodes. They represent relationships. In this code, an edge connects a state node to a tax return statistic node, and the edge is labeled with a "Size of adjusted gross income" attribute.


In [None]:
# @title Knowledge Graph Builder & Visualizer
# @markdown Note that rendering the graph will take a minute or two under the default 10% sample size, longer with higher sample sizes.
sample_size = 0.66 # @param {type:"number", default:0.10}
import pandas as pd
import networkx as nx
from pyvis.network import Network
from IPython.display import IFrame
from IPython.display import Markdown, HTML

# Display colour per node type, keyed by the DataFrame column the node value is read from.
colors = {
    'STATE': 'Darkblue',
    'Number of electronically filed returns': 'orange',
    'No of returns': 'red',
    'No of single returns': 'green',
    'No of joint returns': 'magenta',
    'No of head of household returns': 'purple',
}
# Display size per node type, keyed the same way as `colors`.
sizes = {
    'STATE': 20,
    'Number of electronically filed returns': 25,
    'No of returns': 25,
    'No of single returns': 25,
    'No of joint returns': 25,
    'No of head of household returns': 25
}

INCOME_RANGE_COLUMN = 'Size of adjusted gross income'
GRAPH_HTML_PATH = 'income_tax_2019_graph.html'
SAMPLING_SEED = 42  # fixed seed so a re-run draws the same sample

# Columns whose values become nodes linked from the state node:
# column name -> (entity label stored on the node, colour used when `colors` has no entry).
metric_nodes = {
    'No of returns': ('No_of_returns', 'green'),
    'No of single returns': ('No_of_single_returns', 'orange'),
    'No of joint returns': ('No_of_joint_returns', 'magenta'),
    'No of head of household returns': ('No_of_head_of_household_returns', 'purple'),
    'Number of electronically filed returns': ('Number_of_electronically_filed_returns', 'Yellow'),
}

# Stratified sampling: draw the same fraction of rows from every
# (income range, state) group so the sample keeps the dataset's mix of both.
# The fraction is the `sample_size` form parameter above; it is not overridden here.
stratify_cols = ['Size of adjusted gross income', 'STATE']
sampled_df = df.groupby(stratify_cols, group_keys=False).sample(
    frac=sample_size, random_state=SAMPLING_SEED
)

# Directed graph: state -> value node, one edge per metric column.
G = nx.DiGraph()

for _, row in sampled_df.iterrows():
    state = row['STATE']
    if state not in G:
        G.add_node(state,
                   entity='STATE',
                   color=colors.get('STATE', 'blue'),
                   size=sizes.get('STATE', 5))

    income_range = row[INCOME_RANGE_COLUMN]
    if income_range not in G:
        # No edge is attached to this node; the income range only appears as a
        # standalone node and as the fixed relationship name on the edges below.
        G.add_node(income_range,
                   entity='Size_of_adjusted_gross_income',
                   color=colors.get(INCOME_RANGE_COLUMN, 'gray'),
                   size=sizes.get(INCOME_RANGE_COLUMN, 40))

    # Every metric column is handled at the same nesting level, so each value
    # node gets its entity/colour/size regardless of the other columns.
    for column, (entity_label, fallback_color) in metric_nodes.items():
        value = row[column]
        # Nodes are keyed by the raw cell value: equal values coming from
        # different rows or columns share one node and keep the attributes of
        # whichever was added first.
        if value not in G:
            G.add_node(value,
                       entity=entity_label,
                       color=colors.get(column, fallback_color),
                       size=sizes.get(column, 25))
        # The relationship name is the fixed string below, not the row's income range.
        G.add_edge(state, value, relationship='Size of adjusted gross income')


# Visualization: convert to a pyvis network and write it to a standalone HTML file.
nt = Network('1000px', '1000px', notebook=True, cdn_resources='in_line')
# if you are not in a Jupyter environment, you might need to set notebook=False
nt.from_nx(G)
nt.toggle_physics(True)  # Enable force-directed algorithm
nt.save_graph(GRAPH_HTML_PATH)

# Load the saved file for display; passing the path positionally would render
# the file name itself as HTML text instead of the graph.
HTML(filename=GRAPH_HTML_PATH)

In [14]:
from neo4j import GraphDatabase
from google.colab import userdata

# Connection settings are read from Colab's secret store rather than hard-coded.
url = userdata.get('NEO4J_URI')
username ="neo4j"
password = userdata.get('NEO4J_PASSWORD')
# `os` is not imported in this cell; it relies on the `import os` in an earlier cell.
os.environ["OPENAI_API_KEY"]=userdata.get('OPENAI_API_KEY')
# Driver object reused by the cells below that write the graph to Neo4j.
driver = GraphDatabase.driver(url, auth=(username, password))

In [None]:
def get_entity_type(node):
    """Guess an entity type for a graph node that has no 'entity' attribute.

    Args:
        node: The node identifier (the value used as the key in the graph).

    Returns:
        "Number" for any integer (Python or numpy), "State" for strings that
        start with "STATE_", "Boolean" for the strings "true"/"false" in any
        letter case, and "Unknown" for everything else, including non-string
        values such as floats.
    """
    import numbers  # local import keeps this cell runnable on its own

    # numbers.Integral also matches numpy integer scalars, which are not `int`.
    if isinstance(node, numbers.Integral):
        return "Number"
    # Only strings support the prefix / lower-case checks below.
    if not isinstance(node, str):
        return "Unknown"
    if node.startswith("STATE_"):
        return "State"
    if node.lower() in ("true", "false"):
        return "Boolean"
    return "Unknown"

In [None]:
# G is the nx.DiGraph built by the graph-builder cell.
# Summarise it: entity attribute per node, node count, edge count, and the
# total number of attributes stored across all nodes.
entity_attributes = nx.get_node_attributes(G, 'entity')
total_nodes = G.number_of_nodes()
total_edges = G.number_of_edges()
total_attributes = sum(len(node_attrs) for _, node_attrs in G.nodes(data=True))

for summary_value in (entity_attributes, total_nodes, total_edges, total_attributes):
    print(summary_value)

# Back-fill an 'entity' attribute on every node that does not have one yet.
for node, node_attrs in G.nodes(data=True):
    if 'entity' in node_attrs:
        continue
    print(node_attrs)
    entity_type = get_entity_type(node)
    node_attrs['entity'] = entity_type

**Following code takes a graph structure created in Python and efficiently stores it within a Neo4j database, using Cypher queries to create nodes and relationships and set their properties.**

In [None]:
def add_graph_to_neo4j(driver, graph):
    """Write every node and edge of `graph` to Neo4j through `driver`.

    Nodes are merged as `__Entity__` nodes identified by (name, entity); the
    remaining node attributes are copied onto the node as properties and the
    entity value is also added as a label via APOC. Edges are merged as
    relationships whose type is the edge's 'relationship' attribute with
    spaces replaced by underscores; the remaining edge attributes become
    relationship properties.

    Args:
        driver: Object whose `session()` returns a context manager exposing
            `run(query, **params)`.
        graph: Object exposing `nodes(data=True)` and `edges(data=True)` in the
            networkx style.

    Raises:
        KeyError: If a node has no 'entity' attribute or an edge has no
            'relationship' attribute.
    """
    # The label is passed as a query parameter instead of being spliced into
    # the query text, so entity values cannot alter the Cypher statement.
    node_query = """
            MERGE (n:__Entity__ {name: $name, entity: $entity})
            SET n += $props
            WITH n
            CALL apoc.create.addLabels(n, [$label]) YIELD node
            RETURN distinct 'done' AS result
            """

    with driver.session() as session:
        for node, attrs in graph.nodes(data=True):
            node_props = {k: v for k, v in attrs.items() if k != 'entity'}
            session.run(
                node_query,
                name=node,
                entity=attrs['entity'],
                label=attrs['entity'],
                props=node_props,
            )

        for source, target, attrs in graph.edges(data=True):
            # Relationship types cannot be query parameters here, so the type is
            # backtick-quoted, with embedded backticks doubled, before it is
            # placed in the query text.
            relationship_type = attrs['relationship'].replace(' ', '_').replace('`', '``')
            # Restricting the match to __Entity__ nodes limits the lookup to
            # the nodes this function created.
            edge_query = (
                "\n            MATCH (a:__Entity__), (b:__Entity__)"
                "\n            WHERE a.name = $source AND b.name = $target"
                "\n            MERGE (a)-[r:`" + relationship_type + "`]->(b)"
                "\n            SET r += $props\n            "
            )
            edge_props = {k: v for k, v in attrs.items() if k != 'relationship'}
            session.run(edge_query, source=source, target=target, props=edge_props)

# Finally, call the function to add your graph to Neo4j
add_graph_to_neo4j(driver, G)


`text_node_properties=['name', 'entity', 'zipcode']`

Imagine we have nodes in the Neo4j database representing tax information. Each node might have properties like:

name: The name of a state (e.g., 'California').
entity: The type of entity the node represents (e.g., 'State', 'Tax Return').
zipcode: The zipcode associated with the data.
When the Neo4jVector is created with text_node_properties=['name', 'entity', 'zipcode'], code will:

Look at each node in the database.
Retrieve the values from the 'name', 'entity', and 'zipcode' properties of that node.
Combine those values into a single text string.
Use that text string to create an embedding (a numerical representation of the text) using OpenAI's embeddings.

These embeddings are then used for similarity searches. When you perform a search, your query is also converted into an embedding, and the Neo4jVector finds the nodes in the database with the most similar embeddings, meaning they are semantically related to your query.

In [29]:
import os
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings

# Build a Neo4jVector over the graph that was already written to Neo4j and keep it as `vector_index`.
vector_index = Neo4jVector.from_existing_graph(
    # Embedding model used to turn node text into vectors.
    OpenAIEmbeddings(),
    # Connection details defined in the Neo4j connection cell.
    url=url,
    username=username,
    password=password,
    # Name under which the vector index is stored in Neo4j.
    index_name='incometax',
    # Only nodes carrying the "__Entity__" label are indexed.
    node_label="__Entity__",
    # Node properties whose values are combined into the text that gets embedded.
    # NOTE(review): the graph-loading code in this notebook only writes name, entity, color and size,
    # so 'zipcode' is presumably empty for every node — confirm whether it should be listed here.
    text_node_properties=['name', 'entity', 'zipcode'],
    # Node property in which the computed embedding is stored.
    embedding_node_property='embedding',
)



In [48]:
response = vector_index.similarity_search(
    "What is the no of returns for the state AL for the Size of adjusted gross income range $100,000 under $200,000?"
)

In [49]:
print(response[0].page_content)


name: 20060
entity: No_of_single_returns
zipcode: 


In [50]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Question-answering chain that retrieves from `vector_index` and answers with a chat model.
vector_qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    # "stuff" chain type: the retrieved documents are inserted directly into the LLM prompt.
    chain_type="stuff",
    # .as_retriever() exposes vector_index as the retriever object the chain queries.
    retriever=vector_index.as_retriever()
)
vector_qa.run(
    # Alternative question kept for reference:
    # "Compare No of returns, No of single returns, No of joint returns and No of head of household returns for States CA and AL in all the income range?"
    "What is the no of returns for the state AL for the Size of adjusted gross income range $100,000 under $200,000?"
)

"I'm sorry, but I don't have enough information to answer your question accurately."

The following cell defines two prompt templates using LangChain's PromptTemplate class: CYPHER_GENERATION_PROMPT and CYPHER_QA_PROMPT. These templates are blueprints for the prompts that will be fed to a large language model (LLM).
In essence, these prompt templates ensure that the LLMs are properly guided to generate Cypher queries and provide meaningful answers based on the data retrieved from the Neo4j database.

In [51]:
from langchain.prompts import PromptTemplate
# Template for the Cypher-generation step. It tells the LLM to use only what the
# schema provides and to output nothing but a Cypher statement.
# Placeholders:
#   {schema}   - filled with the graph database schema.
#   {question} - filled with the user's question.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema. However, always exclude the schema's `embedding` property from the Cypher statement.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

The question is:
{question}

Cypher Query:
"""

# Template for the answer-formatting step. It tells the LLM to phrase a readable
# answer using only the supplied information, not its own knowledge.
# Placeholders:
#   {context}  - filled with the data returned by the Cypher query.
#   {question} - filled with the original user question.
# The doubled braces in the example are literal braces, not placeholders.
CYPHER_QA_TEMPLATE = """You are an AI assistant that helps to form nice and human understandable answers.
The information part contains the provided information that you must use to construct an answer.
The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
Make the answer sound as a response to the question. Do not mention that you based the result on the given information.
Here is an example:

Question: Which state has the maximum number of returns?
Context:[{{'STATE': 'CA'}}, {{'No_of_return': '5506120'}}]
Helpful Answer: The state CA has the maximum number of returns with 5506120

Follow this example when generating answers.
If the provided information is empty, say that you don't know the answer.

Information:
{context}

Question: {question}
Helpful Answer:"""
# Prompt object for Cypher generation, built from CYPHER_GENERATION_TEMPLATE
# with `schema` and `question` as its input variables.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)
# Prompt object for answer formatting, built from CYPHER_QA_TEMPLATE
# with `context` and `question` as its input variables.
CYPHER_QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"], template=CYPHER_QA_TEMPLATE
)

In [52]:
from langchain.chains import TransformChain

def transform_cypher(input):
    """Backtick-quote the space-containing node label in a Cypher statement.

    Reads input['cypher_statement'], rewrites every occurrence of the unquoted
    `MATCH (n:Size of adjusted gross income)` pattern to its backtick-quoted
    form, and returns the result under the 'modified_cypher' key. Other labels
    or properties containing spaces are left untouched.
    """
    unquoted_match = "MATCH (n:Size of adjusted gross income)"
    quoted_match = "MATCH (n:`Size of adjusted gross income`)"
    statement = input['cypher_statement']
    # Splitting on the unquoted form and re-joining with the quoted form
    # rewrites every occurrence.
    rewritten = quoted_match.join(statement.split(unquoted_match))
    return {'modified_cypher': rewritten}

Code sets up a question-answering system that uses a Neo4j graph database. It defines a connection to the database, refreshes the schema, and creates a chain using OpenAI's LLMs to translate user questions into Cypher queries, execute those queries against the database, and then format the results into human-readable answers. The CYPHER_QA_PROMPT and CYPHER_GENERATION_PROMPT above play a crucial role in guiding the LLMs to perform these tasks accurately.

In [54]:
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

# Graph wrapper the chain uses to read the schema and run Cypher, connected
# with the same credentials as the driver above.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
    enhanced_schema=True
)
# Re-read the schema so it reflects the nodes and relationships loaded above.
graph.refresh_schema()

# Disabled experiment: pre-processing the generated Cypher with transform_cypher.
#transform_chain = TransformChain(input_variables=["cypher_statement"], output_variables=["modified_cypher"], transform_function=transform_cypher)

# Two-LLM chain: `cypher_llm` (GPT-4) writes the Cypher query from the question,
# `qa_llm` phrases the final answer from the query results. Both use the custom
# prompts defined above; verbose=True prints the generated Cypher and its results.
cypher_chain = GraphCypherQAChain.from_llm(
    cypher_llm = ChatOpenAI(temperature=0, model_name='gpt-4'),
    qa_llm = ChatOpenAI(temperature=0),
    graph=graph,
    verbose=True,
    qa_prompt=CYPHER_QA_PROMPT,
    cypher_prompt=CYPHER_GENERATION_PROMPT
)

# Disabled experiment (continued): chaining the transform step in front of cypher_chain.
#cypher_chain.llm_chain.prompt.template = "{cypher_statement}"
#cypher_chain = transform_chain | cypher_chain



In [45]:
cypher_chain.run(
    "What is the no of returns for the state AL for the Size of adjusted gross income range $100,000 under $200,000?"
)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (s:STATE {name: 'AL'})-[:Size_of_adjusted_gross_income]->(n:No_of_returns {name: '100,000 under 200,000'}) RETURN n.name[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I don't know the answer."

Code creates an AI agent that can answer tax-related questions by using either a similarity search (VectorSearch) or a graph database query (GraphSearch). The agent uses GPT-4 to figure out the best tool for each question.

Code defines two tools which the TaxAgent will have access to:

VectorSearch: This tool uses a technique called similarity search to find relevant information. It's good for general questions about tax statistics. It calls the `run` method of the vector_qa object (the RetrievalQA chain defined earlier in this notebook) to perform the search.

GraphSearch: This tool uses a graph database (Neo4j) to find more specific, analytical information. It calls the `run` method of the cypher_chain object (the GraphCypherQAChain defined earlier) to query the database.
Each tool has a name, a func (the function it calls to do its work), and a description that tells the agent what it's good for.

In [38]:
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType

# The two tools the agent can choose between. Each has a name, the function it
# calls, and a description the agent reads when deciding which tool to use.
tools = [
    # Similarity-search route: delegates to the RetrievalQA chain.
    Tool(
        name="VectorSearch",
        func=vector_qa.run,
        description="""Answer questions related to tax statistics.
        For non analytic questions, use the VectorSearch tool.
        Always have complete questions as input.
        """,
    ),
    # Graph-query route: delegates to the Cypher-generating chain.
    Tool(
        name="GraphSearch",
        func=cypher_chain.run,
        description="""Useful for questions related to tax mostly analytical data querying,
        Always have complete questions as input.
        """,
    ),
]
# Agent initialization:
# - `tools` gives the agent access to VectorSearch and GraphSearch.
# - ChatOpenAI(temperature=0, model_name='gpt-4') is the model that picks the tool and writes the final reply.
# - AgentType.OPENAI_FUNCTIONS selects the agent type built on OpenAI function calling.
# - verbose=True prints each step the agent takes.
TaxAgent = initialize_agent(
    tools,
    ChatOpenAI(temperature=0, model_name='gpt-4'),
    agent=AgentType.OPENAI_FUNCTIONS, verbose=True
)

In [40]:
response = TaxAgent.invoke({"input": "What is the no of returns for the state AL for the Size of adjusted gross income range $100,000 under $200,000?"})
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `GraphSearch` with `What is the number of returns for the state AL for the Size of adjusted gross income range $100,000 under $200,000?`


[0m

[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (s:STATE {name: 'AL'})-[:Size_of_adjusted_gross_income]->(n:No_of_returns {name: '100,000 under 200,000'}) RETURN n.name[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
[33;1m[1;3mI don't know the answer.[0m[32;1m[1;3mI'm sorry, but I couldn't find the specific number of returns for the state AL for the Size of adjusted gross income range $100,000 under $200,000. You may want to check the latest tax statistics from the IRS or the state's Department of Revenue for the most accurate information.[0m

[1m> Finished chain.[0m
{'input': 'What is the no of returns for the state AL for the Size of adjusted gross income range $100,000 under $200,000?', 'output': "I'm sor

In [None]:
response = TaxAgent.invoke({"input": "Compare No of returns,	No of single returns,	No of joint returns and	No of head of household returns for States CA and AL in all the income range?"})
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `GraphSearch` with `Compare No of returns, No of single returns, No of joint returns and No of head of household returns for States CA and AL in all the income range`


[0m

[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (s:STATE)-[:Size_of_adjusted_gross_income]->(n:No_of_returns), (s:STATE)-[:Size_of_adjusted_gross_income]->(sr:No_of_single_returns), (s:STATE)-[:Size_of_adjusted_gross_income]->(jr:No_of_joint_returns), (s:STATE)-[:Size_of_adjusted_gross_income]->(hr:No_of_head_of_household_returns)
WHERE s.name IN ['CA', 'AL']
RETURN s.name, n.name AS No_of_returns, sr.name AS No_of_single_returns, jr.name AS No_of_joint_returns, hr.name AS No_of_head_of_household_returns[0m
Full Context:
[32;1m[1;3m[{'s.name': 'AL', 'No_of_returns': 257010, 'No_of_single_returns': 28180, 'No_of_joint_returns': 216740, 'No_of_head_of_household_returns': 7150}, {'s.name': 'AL', 'No_of_r

In [None]:
!pip install langchain langchain_community langchain_openai chromadb --quiet
!pip install --upgrade langchain

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain_core.runnables import (
    RunnableParallel,
    RunnablePassthrough
)
from langchain.schema.output_parser import StrOutputParser

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Download Data
data_dir = '/content/drive/MyDrive'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from typing import Dict, Any, List
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Convert DatasetDict to LangChain Documents
def create_langchain_documents(dataset: Dict[str, Any]) -> List[Document]:
    """Turn every row of the dataset's 'train' split into a LangChain Document.

    Each document's page content is the row rendered as one "column: value"
    line per column; its metadata is a shallow copy of the complete row.

    Args:
        dataset: Mapping with a 'train' entry that iterates over dict-like rows.

    Returns:
        One Document per row, in iteration order.
    """
    def _to_document(record):
        # One "column: value" line per column, joined with newlines.
        rendered_lines = [f"{column}: {value}" for column, value in record.items()]
        return Document(page_content="\n".join(rendered_lines), metadata=record.copy())

    return [_to_document(record) for record in dataset['train']]

In [None]:
all_documents = create_langchain_documents(dataset)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)

In [None]:
# Split every document into token-sized chunks in a single call; the chunks
# come back in document order.
pages = text_splitter.split_documents(all_documents)

In [None]:
# Create the Chroma vector store from the chunked documents.
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata # imported but not used in this cell

# Embed every chunk with OpenAI embeddings and store the index under data_dir (the Drive folder set earlier).
vectordb = Chroma.from_documents(documents=pages, embedding=OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]),persist_directory=f"{data_dir}/RAG/VectorDB/chroma_db_RAG_Income_Tax")
# NOTE(review): persist() may be redundant or deprecated with recent Chroma versions, which persist automatically — confirm against the installed version.
vectordb.persist()
# Retriever view of the store with default search settings.
retriever = vectordb.as_retriever()

  vectordb.persist()


In [None]:
def format_docs(docs):
    """Join the page_content of each document, separated by a blank line."""
    contents = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
!pip install --upgrade langchain pydantic

In [None]:
#Creating a RAG Pipeline
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableSequence
from langchain_core.runnables.utils import Input, Output
from langchain_core.output_parsers import StrOutputParser
# RAG pipeline: retrieve context -> fill the prompt -> call the LLM -> return plain text.
template = """You are an AI Tax assistant.Answer the following question based on this context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(temperature=0, openai_api_key=os.environ["OPENAI_API_KEY"])


def _retrieve_context(inputs):
    """Return the 10 chunks most similar to inputs["question"], joined by format_docs."""
    matches = vectordb.similarity_search(inputs["question"], k=10)
    return format_docs(matches)


# Piping runnables together already produces a sequence, so no explicit
# RunnableSequence wrapper is needed. The input dict must carry "question";
# the first step adds "context" next to it.
final_rag_chain = (
    RunnablePassthrough.assign(context=_retrieve_context)
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
question="What is the number of joint returns for the state of AL for income range $100,000 under $200,000?"


In [None]:
final_rag_chain.invoke({"question":question})


'The number of joint returns for the state of AL in the income range of $100,000 under $200,000 is 216,740.'

In [None]:
questions = [
    "What is the number of returns for the state of AL for income range $1 under $25,000?",
    "What is the number of joint returns for the state of AK for income range $100,000 under $200,000?",
]
ground_truth = [
    "778210",
    "47720",
]

In [None]:
!pip install datasets --quiet
from datasets import Dataset

In [None]:
# Every question needs exactly one reference answer.
assert len(questions) == len(ground_truth), "questions and ground_truth must be the same length"

answers  = []
contexts = []

# Run each question through the RAG chain and record the passages it was answered from.
# The loop variable is named `eval_question` so the standalone `question` variable is not overwritten.
for eval_question in questions:
    answers.append(final_rag_chain.invoke({"question": eval_question}))
    # Use the same search (and k=10) that final_rag_chain uses to build its context,
    # so the context-based metrics score the passages the answer was generated from
    # rather than the default-sized result list of `retriever`.
    retrieved_docs = vectordb.similarity_search(eval_question, k=10)
    contexts.append([doc.page_content for doc in retrieved_docs])

# Column layout of the evaluation dataset.
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truth
}

# Convert dict to dataset.
# NOTE: this rebinds `dataset`, which earlier cells use for the loaded tax dataset;
# re-run the loading cell before re-running cells that need the original.
dataset = Dataset.from_dict(data)

dataset.to_pandas()

NameError: name 'vector_qa' is not defined

In [None]:
!pip install ragas --quiet
import ragas

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/177.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.1/177.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Score the evaluation dataset with the four ragas metrics imported above.
result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

# Per-question scores as a DataFrame.
# NOTE(review): this rebinds `df`, the name earlier cells use for the tax DataFrame;
# re-running those cells after this one would read the scores instead — consider a distinct name.
df = result.to_pandas()
df

Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,context_recall,faithfulness,answer_relevancy
0,What is the number of returns for the state of...,[STATEFIPS: 1\nSTATE: AL\nzipcode: 0\nSize of ...,"Based on the provided data, the number of retu...",778210,0.0,0.0,0.666667,0.0
1,What is the number of joint returns for the st...,[STATEFIPS: 2\nSTATE: AK\nzipcode: 0\nSize of ...,"Based on the provided data, the number of join...",47720,0.0,0.0,0.666667,0.0
