**Install Dependencies & Libraries**

In [35]:
!pip install -U -qq langchain langgraph langchain_openai langchain_community faiss-cpu
!pip install -U -qq pypdf

import os
from typing import TypedDict, List  # For type hinting


from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
from langchain.chains.question_answering import load_qa_chain

**Setup API Key**

In [36]:
# Read the key without echoing it: input() would print the secret into the
# cell output, where it gets saved and shared together with the notebook.
from getpass import getpass

# Reuse a key that is already exported so re-running the cell does not prompt again.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = api_key

Enter your OpenAI API key: [REDACTED — this key was exposed in saved notebook output; revoke and rotate it]


**Load and Prepare Documents**

In [37]:
# Modular file path selection, followed by loading the CSV into `docs`.
print("Choose file source:")
print("1. Use CSV from GitHub repo")
print("2. Use CSV from Google Drive")

# Strip stray whitespace so an answer such as "2 " is still accepted.
choice = input("Enter 1 or 2: ").strip()

if choice == "1":
    # Relative path: the CSV is expected in the current working directory.
    file_path = "company_financials_quarterly.csv"
    print("Using file from GitHub repo: ",file_path)

elif choice == "2":
    # google.colab is only needed (and imported) on this branch.
    from google.colab import drive
    drive.mount("/content/drive")
    file_path = "/content/drive/MyDrive/LangGraph/company_financials_quarterly.csv"
    print("Using file from Google Drive: ",file_path)

else:
    raise ValueError("Invalid choice. Please enter 1 or 2.")

# Actually load the file: the splitting cell reads `docs`, which was never
# defined before, so the notebook failed on a fresh kernel.
docs = CSVLoader(file_path=file_path).load()
print("Loaded", len(docs), "documents")

Choose file source:
1. Use CSV from GitHub repo
2. Use CSV from Google Drive
Enter 1 or 2: 2
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using file from Google Drive:  /content/drive/MyDrive/LangGraph/company_financials_quarterly.csv


**Split data into chunks, convert to embeddings, and store in a vector DB**

In [38]:
# Cut the loaded documents into overlapping chunks (1000 characters, 200 overlap).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunks = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks")

# Embed the chunks with OpenAI embeddings and index them in a FAISS store;
# `retriever` is what the retrieve node queries later on.
embeddings = OpenAIEmbeddings()
knowledge_base = FAISS.from_documents(documents=chunks, embedding=embeddings)
retriever = knowledge_base.as_retriever()

Split into 100 chunks


**Setup LLM and QA Chain**

In [39]:
# Chat model with temperature 0, wrapped in a "stuff" question-answering chain
# that the generate node calls with the retrieved documents.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
qa_chain = load_qa_chain(llm=llm, chain_type="stuff")

**Define State for LangGraph**

In [40]:
class RAGState(TypedDict):
    """State dictionary passed between the nodes of the RAG workflow."""

    # The user's question for the current invocation.
    question: str
    # Context written by the retrieve node. It stores whatever the retriever
    # returns (document objects, not plain strings), hence the loose element type.
    documents: List[object]
    # Answer text written by the generate node.
    answer: str
    # Routing label written by detect_intent and read by the conditional edge.
    intent: str

**Define Graph Nodes**

In [41]:
# Intent Detection Node
def detect_intent(state: RAGState):
    """Label the request with an intent for routing.

    The incoming state is not inspected: every request is labelled
    "retrieval".

    Returns:
        A partial state update containing only the "intent" key.
    """
    intent = "retrieval"
    return dict(intent=intent)

# Retrieval Node
def retrieve(state: RAGState):
    """Look up documents for the current question.

    Args:
        state: Graph state; ``state["question"]`` is used as the query.

    Returns:
        A partial state update ``{"documents": ...}`` holding whatever the
        module-level ``retriever`` returns for the query.
    """
    # `invoke` is the supported entry point for retrievers; the older
    # `get_relevant_documents` method is deprecated in LangChain.
    docs = retriever.invoke(state["question"])
    return {"documents": docs}

# LLM Response Node
def generate(state: RAGState):
    """Answer the question with the QA chain, using the retrieved documents.

    Args:
        state: Graph state; reads ``state["question"]`` and, if present,
            ``state["documents"]`` (an empty list is used otherwise).

    Returns:
        A partial state update ``{"answer": ...}`` with the chain's output text.
    """
    # `Chain.run` is deprecated in LangChain; `invoke` returns a dict of
    # outputs, and the "stuff" QA chain puts its answer under "output_text".
    result = qa_chain.invoke(
        {"question": state["question"], "input_documents": state.get("documents", [])}
    )
    return {"answer": result["output_text"]}


**Build LangGraph Workflow with Memory**

In [42]:
# Assemble the workflow: detect_intent -> retrieve -> generate -> END.
graph = StateGraph(RAGState)

# Register every node under the name that the edges below refer to.
for node_name, node_fn in (
    ("detect_intent", detect_intent),
    ("retrieve", retrieve),
    ("generate", generate),
):
    graph.add_node(node_name, node_fn)

graph.set_entry_point("detect_intent")

# Route on the intent label; "retrieval" is the only mapped route.
graph.add_conditional_edges(
    "detect_intent",
    lambda state: state["intent"],
    {"retrieval": "retrieve"},
)

# Fixed hand-offs once routing has happened.
for source, target in (("retrieve", "generate"), ("generate", END)):
    graph.add_edge(source, target)

# Memory: compile with a checkpointer; callers pass a thread_id in the
# invoke config to select which thread's state is used.
memory = MemorySaver()
app = graph.compile(checkpointer=memory)


**Testing the query engine (optional block)**

In [43]:
# Smoke-test the compiled graph with two fixed questions on their own thread.
thread_id = "demo_queries"
demo_config = {"configurable": {"thread_id": thread_id}}

q1 = "What information does the dataset contain?"
q2 = "What was the revenue in Q2 2010?"

out1 = app.invoke({"question": q1}, config=demo_config)
print("\n\n**********\n")
print(f"Q: {q1}")
print(f"A: {out1['answer']}")

out2 = app.invoke({"question": q2}, config=demo_config)
print(f"\nQ: {q2}")
print(f"A: {out2['answer']}")



**********

Q: What information does the dataset contain?
A: The dataset contains financial information for a company over different years and quarters. This information includes revenue, expenses, profit, growth rate, and the number of employees for each quarter in the years 2001, 2004, 2009, and 2011.

Q: What was the revenue in Q2 2010?
A: The revenue in Q2 2010 was 194.23 million USD.


**Main thread - Interactive Chat**

In [44]:
# Interactive chat loop: type "exit" or "quit" to stop.
thread_id = "company_revenue"
chat_config = {"configurable": {"thread_id": thread_id}}

while True:
    try:
        # Strip whitespace so " exit " and blank lines are handled as intended.
        user_query = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or a kernel interrupt ends the chat instead of
        # leaving a traceback in the cell output.
        print("Chat ended.")
        break

    if user_query.lower() in ("exit", "quit"):
        print("Chat ended.")
        break

    if not user_query:
        # Nothing was typed: do not send an empty question through the graph.
        continue

    output = app.invoke({"question": user_query}, config=chat_config)
    print("AI:", output["answer"])
    print("________________________________\n")

User: What information does the dataset contain?
AI: The dataset contains financial information for a company over different years and quarters. This information includes revenue, expenses, profit, growth rate, and the number of employees for each quarter in the years 2001, 2004, 2009, and 2011.
________________________________

User: What was the revenue in Q2 2015?
AI: The revenue in Q2 2015 was 263.89 million USD.
________________________________

User: What year has got the best ratio of revenue:employee?
AI: To determine the year with the best ratio of revenue per employee, we need to calculate the revenue per employee for each year. 

For 2011:
Revenue per employee = Revenue / Employees = 257.62 / 1709 ≈ 0.151 million USD

For 2009:
Revenue per employee = Revenue / Employees = 379.66 / 2865 ≈ 0.133 million USD

For 2007:
Revenue per employee = Revenue / Employees = 301.52 / 1449 ≈ 0.208 million USD

Comparing these values, the year 2007 has the best ratio of revenue per employee 