# Generate graphRAG for Financial Data #

## Imports ##

In [1]:
import os
import glob
import time
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
from typing import List

from langchain_community.document_loaders import TextLoader
from langchain_community.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI, OpenAIEmbeddings, AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain_text_splitters import TokenTextSplitter
from neo4j.exceptions import ClientError

from time import sleep
import hashlib



## Initializations ##

In [2]:
# Load settings from the local .env file. override=True lets values in .env
# replace variables that are already set in the process environment.
load_dotenv('.env', override=True)

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
# No trailing comma here: a trailing comma would turn the value into a
# one-element tuple instead of the key string (or None when unset).
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")

# Embeddings & LLM models
# embedding_dimension is shared by the embedding client and by the vector
# indexes created during ingestion, so the two always agree.
embedding_dimension = 1536
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-3",
    api_version="2024-02-01",
    dimensions=embedding_dimension,
)

llm = AzureChatOpenAI(
    azure_deployment='chat_gtp_35',
    api_version="2023-05-15",
    temperature=0,
)

# Get Neo4j credentials from environment variables
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)
# NOTE(review): presumably a short pause to let the connection settle before
# the first query -- confirm whether it is still needed.
sleep(2)

#Clear KG from previous sessions
# graph.refresh_schema()
# graph.query("MATCH (n) DETACH DELETE n")
# graph.query("DROP INDEX hypothetical_questions IF EXISTS")
# graph.query("DROP INDEX parent_document IF EXISTS")
# graph.query("DROP INDEX summary IF EXISTS")
# graph.query("DROP INDEX typical_rag IF EXISTS")
# graph.query("""
#   SHOW VECTOR INDEXES
#   """
# )


## Data ##

In [3]:
# # # pip install sec-api
# # # from sec_api import ExtractorApi


# API_KEY=os.getenv("SEC_API")

# extractorApi = ExtractorApi(API_KEY)

# def extract_items_10k(filing_url):
    
#     items = ["1", "1A", "7",
#            "7A"]
#     filing_name = os.path.basename(filing_url)

#     for item in items:
#         print("item:", item, "url", filing_url)

#         try:
#             section_text = extractorApi.get_section(filing_url=filing_url,
#                                               section=item,
#                                               return_type="text")

#         # do something with section_text. for example, save to disk, in a database
#         # or perform analytics
#         # IMPORTANT: you don't want to hold a large number of sections in memory
#         # and add sections to a list. otherwise you end up with out-of-memory issues.
#         # instead make sure to let the garbage collection release memory frequently.

#         # Create a new directory for the current filing if it doesn't exist
#             if not os.path.exists(filing_name):
#                 os.makedirs(filing_name)

#             # Save the extracted text to a file
#             with open(f"{filing_name}/{item}.txt", "w") as f:
#                 f.write(section_text)
#         except Exception as e:
#             print(e)

# urls_10k = ["https://www.sec.gov/Archives/edgar/data/789019/000156459022026876/msft-10k_20220630.htm"]



In [4]:
    # for url in urls_10k:
    #     extract_items_10k(url)

## Data Ingestion Code ##

In [29]:

def _hashed_id(prefix, text):
    """Return ``"<prefix>-<sha256 hex digest of text>"``."""
    return f"{prefix}-{hashlib.sha256(text.encode()).hexdigest()}"


def _ensure_vector_indexes():
    """Create the Child and Parent vector indexes, tolerating existing ones."""
    for index_name, label in (("parent_document", "Child"), ("typical_rag", "Parent")):
        try:
            graph.query(
                f"CALL db.index.vector.createNodeIndex('{index_name}', "
                f"'{label}', 'embedding', $dimension, 'cosine')",
                {"dimension": embedding_dimension},
            )
        except ClientError:  # Index already exists
            pass


def extract_entities_relationships(folder):
    """Ingest every ``*.txt`` file of one filing folder into the Neo4j graph.

    Reads ``./sec_10K_data/<folder>/*.txt`` and, through the module-level
    ``graph`` and ``embeddings`` objects:

    * merges a ``Folder`` node (id derived from the SHA-256 of ``folder``),
    * merges a ``Company`` node named after the first four characters of
      ``folder`` and links it to every ``Folder`` whose name starts with them,
    * merges the single ``KG_seed`` node with id ``"SEC_10K"`` and links it to
      every ``Company`` node,
    * splits each file into parent chunks and child chunks, stores both with
      their embeddings, and connects them with ``HAS_CHILD``, ``NEXT`` and
      ``GRAND_FATHER`` relationships,
    * creates the ``parent_document`` (Child) and ``typical_rag`` (Parent)
      vector indexes once the first chunk has been stored.

    Args:
        folder: Name of a sub-folder of ``./sec_10K_data``.

    Returns:
        None. Progress and the elapsed time are printed. If reading the
        Folder node back from the graph fails, the error is printed and the
        function returns early without ingesting any file.
    """
    # Fetch all text files in the specified folder
    files = glob.glob(f'./sec_10K_data/{folder}/*.txt')
    start = time.perf_counter()
    print(f"Running pipeline for {len(files)} files in {folder} folder")

    # Generate a unique ID for the folder using SHA-256 hash
    folder_id = _hashed_id("folder", folder)

    # Create or merge a Folder node in the graph with the generated ID
    graph.query(
        """
        MERGE (f:Folder {id: $folder_id})
        ON CREATE SET f.name = $folder_name
        """,
        {"folder_id": folder_id, "folder_name": folder}
    )

    folder_query = """
    MATCH (f:Folder {id: $folder_id})
    RETURN coalesce(f.name, 'Unknown Folder') AS folder_name
    """
    try:
        # Read the Folder node back as a sanity check; the returned name is
        # not needed afterwards, only the fact that the query succeeds.
        graph.query(folder_query, params={"folder_id": folder_id})
    except Exception as e:
        print(f"Failed folder name id retrieve: {e}")
        return None

    # Generate a unique ID for the Company using the first four letters of the folder name
    company_name = folder[:4]
    company_id = _hashed_id("company", company_name)

    # Create or merge a Company node in the graph with the generated ID
    graph.query(
        """
        MERGE (c:Company {id: $company_id})
        ON CREATE SET c.name = $company_name
        """,
        {"company_id": company_id, "company_name": company_name}
    )

    # Link the Company node to Folder nodes whose names start with the company name
    graph.query(
        """
        MATCH (f:Folder)
        WHERE f.name STARTS WITH $company_name
        MERGE (c:Company {id: $company_id})
        MERGE (c)-[:COMPANY]->(f)
        """,
        {"company_id": company_id, "company_name": company_name}
    )

    # Create KG_SOURCE relationships between company nodes and the single SEC_10K node
    graph.query(
        """
        MATCH (c:Company)
        MERGE (k:KG_seed {id: $Node_10K_id})
        MERGE (k)-[:KG_SOURCE]->(c)
        """,
        {"Node_10K_id": "SEC_10K"},
    )

    # The splitters do not depend on the file, so build them once.
    parent_splitter = TokenTextSplitter(chunk_size=512*5, chunk_overlap=24)
    child_splitter = TokenTextSplitter(chunk_size=100*5, chunk_overlap=24)

    # The vector indexes only need to be requested once per run, not once
    # per parent chunk.
    indexes_ready = False

    for file_path in files:
        print(f"Extracting entities and relationships for: {str(file_path)}")

        # Load the document from the file
        documents = TextLoader(str(file_path), encoding='utf-8').load()

        # Split the document into parent chunks; children are derived per parent
        parent_documents = parent_splitter.split_documents(documents)
        file_name = os.path.basename(file_path)

        parent_ids = []  # Parent IDs in document order

        for parent in parent_documents:
            # Split the parent document into child documents
            child_documents = child_splitter.split_documents([parent])
            # IDs combine the file name with a hash of the chunk content
            parent_id = _hashed_id(file_name, parent.page_content)
            parent_ids.append(parent_id)

            params = {
                "parent_text": parent.page_content,
                "parent_id": parent_id,
                "parent_embedding": embeddings.embed_query(parent.page_content),
                "children": [
                    {
                        "text": c.page_content,
                        "id": _hashed_id(file_name, c.page_content),
                        "embedding": embeddings.embed_query(c.page_content),
                    }
                    for c in child_documents
                ],
            }

            # Ingest parent and child data into the graph
            graph.query(
                """
                MERGE (p:Parent {id: $parent_id})
                ON CREATE SET p.text = $parent_text
                WITH p
                CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
                YIELD node
                WITH p
                UNWIND $children AS child
                MERGE (c:Child {id: child.id})
                ON CREATE SET c.text = child.text
                MERGE (c)<-[:HAS_CHILD]-(p)
                WITH c, child
                CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
                YIELD node
                RETURN count(*)
                """,
                params,
            )

            # Create the vector indexes right after the first chunk is stored
            if not indexes_ready:
                _ensure_vector_indexes()
                indexes_ready = True

        # Create NEXT relationships between consecutive parent nodes
        for current_id, following_id in zip(parent_ids, parent_ids[1:]):
            graph.query(
                """
                MATCH (p1:Parent {id: $parent_id_1}), (p2:Parent {id: $parent_id_2})
                MERGE (p1)-[:NEXT]->(p2)
                """,
                {"parent_id_1": current_id, "parent_id_2": following_id},
            )

        # Create GRAND_FATHER relationships between the folder and parent nodes
        for parent_id in parent_ids:
            graph.query(
                """
                MATCH (f:Folder {id: $folder_id}), (p:Parent {id: $parent_id})
                MERGE (p)-[:GRAND_FATHER]->(f)
                """,
                {"folder_id": folder_id, "parent_id": parent_id},
            )

    end = time.perf_counter()
    print(f"Pipeline completed in {end-start} seconds")

def ingestion_pipeline(folders):
    """Ingest each folder in ``folders``, in iteration order.

    Every entry is passed to ``extract_entities_relationships``; nothing is
    returned.
    """
    for folder_name in folders:
        extract_entities_relationships(folder_name)


## Process Data Ingestion ##

In [30]:
# Clear the knowledge graph left over from a previous session: refresh the
# cached schema, delete every node with its relationships, then drop the
# vector indexes if they exist.
graph.refresh_schema()
reset_statements = (
    "MATCH (n) DETACH DELETE n",
    "DROP INDEX hypothetical_questions IF EXISTS",
    "DROP INDEX parent_document IF EXISTS",
    "DROP INDEX summary IF EXISTS",
    "DROP INDEX typical_rag IF EXISTS",
)
for statement in reset_statements:
    graph.query(statement)

[]

In [31]:
# Sub-folders of ./sec_10K_data to ingest, one per filing.
folders = [
    "aapl-20220924",
    "aapl-20230930",
    "msft-20220630",
    "nvda-20240128",
]
ingestion_pipeline(folders)

Running pipeline for 4 files in aapl-20220924 folder
Extracting entities and relationships for: ./sec_10K_data/aapl-20220924\1.txt
Extracting entities and relationships for: ./sec_10K_data/aapl-20220924\1A.txt
Extracting entities and relationships for: ./sec_10K_data/aapl-20220924\7.txt
Extracting entities and relationships for: ./sec_10K_data/aapl-20220924\7A.txt
Pipeline completed in 18.480374099999608 seconds
Running pipeline for 4 files in aapl-20230930 folder
Extracting entities and relationships for: ./sec_10K_data/aapl-20230930\1.txt
Extracting entities and relationships for: ./sec_10K_data/aapl-20230930\1A.txt
Extracting entities and relationships for: ./sec_10K_data/aapl-20230930\7.txt
Extracting entities and relationships for: ./sec_10K_data/aapl-20230930\7A.txt
Pipeline completed in 16.342698299999938 seconds
Running pipeline for 4 files in msft-20220630 folder
Extracting entities and relationships for: ./sec_10K_data/msft-20220630\1.txt
Extracting entities and relationships

## Test RAG ##

In [None]:
from langchain_community.vectorstores import Neo4jVector




# Parent retriever
#
# The 'parent_document' index is searched over Child embeddings; the
# retrieval query then walks HAS_CHILD back to the Parent and returns the
# parent's text. Several children can share one parent, so scores are
# aggregated per parent. ORDER BY is required before LIMIT: without it the
# row kept after aggregation is not guaranteed to be the best-scoring parent.

parent_query = """
MATCH (node)<-[:HAS_CHILD]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata
ORDER BY score DESC
LIMIT 1
"""

parent_vectorstore = Neo4jVector.from_existing_index(
    embeddings,
    index_name="parent_document",
    retrieval_query=parent_query,
)



In [None]:
# Plain vector search over the 'typical_rag' index, which ingestion creates
# on the embeddings of Parent nodes. The store is built here because no
# earlier cell defines `typical_rag`.
typical_rag = Neo4jVector.from_existing_index(
    embeddings,
    index_name="typical_rag",
)

response = typical_rag.similarity_search(
    "What are some risk factors that can affect Apple's stock price?"
)
# Guard against an empty result (e.g. nothing ingested yet) instead of
# failing with an IndexError.
if response:
    print(response[0].page_content)
else:
    print("No matching documents found.")

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI



# Question-answering chain that "stuffs" the documents returned by the
# parent retriever into a single prompt for the chat model.
parent_retriever = parent_vectorstore.as_retriever()
vector_parent = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=parent_retriever,
)

# Smoke test: the result of the last expression is shown as the cell output.
vector_parent.invoke("What are some risk factors that can affect Apple's stock price?")

## RAG Agent Tool (combination of retrievers) ##

In [None]:
from langchain.chains import GraphCypherQAChain
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType


# Text-to-Cypher QA chain: one model writes the Cypher statement, a second
# model phrases the answer from the query result.
cypher_chain = GraphCypherQAChain.from_llm(
    cypher_llm=AzureChatOpenAI(
        azure_deployment='Chat_gpt_4',
        api_version="2023-05-15",
        temperature=0,
    ),
    qa_llm=AzureChatOpenAI(
        azure_deployment='chat_gtp_35',
        api_version="2023-05-15",
        temperature=0,
    ),
    graph=graph,
    verbose=True,
)

# QA chain over the 'typical_rag' index (embeddings of Parent nodes). It is
# built here because no earlier cell defines `vector_typrag`.
vector_typrag = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=Neo4jVector.from_existing_index(
        embeddings,
        index_name="typical_rag",
    ).as_retriever(),
)

# Every tool needs its own name so the agent can tell them apart. The tools
# are wired to the two retrievers this notebook actually builds: the
# ingestion code creates no hypothetical-question index, so the second tool
# uses the parent retriever chain (`vector_parent`) instead.
tools = [
    Tool(
        name="TypicalRAG",
        func=vector_typrag.run,
        description="""Useful to answer most of the questions.
        Not useful for questions that involve aggregation.
        Use full question as input.
        """,
    ),
    Tool(
        name="ParentRetriever",
        func=vector_parent.run,
        description="""Useful when the answer needs the wider context around a specific passage.
        Not useful for questions that involve aggregation.
        Use full question as input.
        """,
    ),
    # Tool(
    #     name="Graph",
    #     func=cypher_chain.run,
    #     description=""" Only useful for AGGREGATION questions.
    #     Use full question as input.
    #     """,
    # ),
]

# Agent that picks one of `tools` per question through OpenAI function calling.
agent_llm = AzureChatOpenAI(
    azure_deployment='Chat_gpt_4',
    api_version="2023-05-15",
    temperature=0,
)
mrkl = initialize_agent(
    tools,
    agent_llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)


In [None]:
# Refresh the cached graph schema, then ask the agent and print its reply.
graph.refresh_schema()
question = "What are some risk factors that can affect Apple's stock price?"
response = mrkl.invoke(question)
print(response)

In [None]:
# Refresh the cached graph schema, then ask the agent and print its reply.
graph.refresh_schema()
question = "What kind of business Apple handles?"
response = mrkl.invoke(question)
print(response)

In [None]:
# pip freeze >requirements.txt

In [None]:
# import sys
# print("Python version: " + sys.version)