**Install Dependencies and Libraries**

In [None]:
!pip install -U -qq langchain langgraph langchain_openai langchain_community faiss-cpu
!pip install -U -qq pypdf

import os
from typing import TypedDict, List

from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
from langchain.chains.question_answering import load_qa_chain

**Setup API Key**

In [25]:
# Option 1: Read from environment variable (preferred)
api_key = os.getenv("OPENAI_API_KEY")

# Option 2: Ask user to input if not set
if not api_key:
    api_key = input("Enter your OpenAI API key: ")

os.environ["OPENAI_API_KEY"] = api_key

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load and Prepare Documents**

In [23]:
# Load CSV document (stored in the repo under /data folder)
from langchain_community.document_loaders import CSVLoader

# Path relative to your GitHub repo
file_path = "data/company_financials_quarterly.csv"

loader = CSVLoader(file_path=file_path, encoding="utf-8")
docs = loader.load()

print("Loaded", len(docs), "rows from document.")

Loaded 100 pages from document.


**Split data into Chunks, convert to Emebedding and store in vector DB**

In [24]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print("Split into", len(chunks), "chunks")

embeddings = OpenAIEmbeddings()
knowledge_base = FAISS.from_documents(chunks, embeddings)
retriever = knowledge_base.as_retriever()

Split into 100 chunks


**Setup LLM and QA Chain**

In [8]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
qa_chain = load_qa_chain(llm, chain_type="stuff")

stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  qa_chain = load_qa_chain(llm, chain_type="stuff")


**Define State for LangGraph**

In [7]:
class RAGState(TypedDict):
    question: str
    documents: List[str]
    answer: str
    intent: str

**Define Graph Nodes**

In [9]:
# Intent Detection Node (simple: all go to retrieval)
def detect_intent(state: RAGState):
    return {"intent": "retrieval"}

# Retrieval Node
def retrieve(state: RAGState):
    docs = retriever.get_relevant_documents(state["question"])
    return {"documents": docs}

# LLM Response Node
def generate(state: RAGState):
    answer = qa_chain.run(
        {"question": state["question"], "input_documents": state.get("documents", [])}
    )
    return {"answer": answer}


**Build LangGraph Workflow with Memory**

In [10]:
graph = StateGraph(RAGState)

graph.add_node("detect_intent", detect_intent)
graph.add_node("retrieve", retrieve)
graph.add_node("generate", generate)

graph.set_entry_point("detect_intent")
graph.add_conditional_edges(
    "detect_intent",
    lambda state: state["intent"],
    {"retrieval": "retrieve"},
)
graph.add_edge("retrieve", "generate")
graph.add_edge("generate", END)

# Memory
memory = MemorySaver()
app = graph.compile(checkpointer=memory)


**Testing the query engine (optional block)**

In [11]:
thread_id = "demo_queries"

q1 = "What information does the dataset contain?"
out1 = app.invoke({"question": q1}, config={"configurable": {"thread_id": thread_id}})
print("Q:", q1)
print("A:", out1["answer"])

q2 = "What was the revenue in Q2 2010?"
out2 = app.invoke({"question": q2}, config={"configurable": {"thread_id": thread_id}})
print("\nQ:", q2)
print("A:", out2["answer"])

  docs = retriever.get_relevant_documents(state["question"])
  answer = qa_chain.run(


Q: What information does the dataset contain?
A: The dataset contains financial information for a company over different years and quarters. This information includes revenue, expenses, profit, growth rate, and the number of employees for each quarter in the years 2000, 2001, 2004, and 2011.

Q: What was the revenue in Q2 2010?
A: The revenue in Q2 2010 was 194.23 million USD.


**Main thread - Interactive Chat**

In [14]:
thread_id = "company_revenue"

while True:
    user_query = input("User: ")
    if user_query.lower() in ["exit", "quit"]:
        print("Chat ended.")
        break

    output = app.invoke(
        {"question": user_query},
        config={"configurable": {"thread_id": thread_id}}
    )
    print("AI:", output["answer"])

User: WHat is the year which got least and the highest revenues?
AI: The year with the least revenue is 2008 with a revenue of 227.88 Million USD, and the year with the highest revenue is 2013 with a revenue of 472.51 Million USD.
User: exit
Chat ended.
