In [None]:
import os
import glob

# Load PDFs using a LangChain PDF loader (e.g., PyPDFLoader)
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Import LangGraph for orchestration
from langgraph import Graph, Node

In [None]:
# ----------------------------
# Step 1: Load FATCA Regulation PDFs
# ----------------------------

def load_fatca_pdfs(pdf_dir: str) -> list:
    """Load every ``*.pdf`` file located directly inside ``pdf_dir``.

    Files are visited in sorted path order so the order of the returned
    documents does not depend on how the operating system happens to list
    the directory (``glob.glob`` makes no ordering guarantee).

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        A flat list containing, for each matched file in sorted path order,
        the items returned by ``PyPDFLoader(path).load()``. The list is
        empty when no file matches.
    """
    # Sort for reproducibility: glob results come back in arbitrary order.
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))
    documents = []
    for pdf_file in pdf_files:
        print(f"Loading {pdf_file} ...")
        # loader.load() returns a list of Document objects
        documents.extend(PyPDFLoader(pdf_file).load())
    return documents

# Directory holding the FATCA regulation PDFs, relative to the working directory.
pdf_dir = "./fatca_regulations"
documents = load_fatca_pdfs(pdf_dir)

# The Chroma vector store (Step 2) is built in the next cell only. Building it
# here as well ran Chroma.from_documents twice over the same documents with the
# same collection name on every Run All.

In [None]:
# ----------------------------
# Step 2: Create a ChromaDB Vector Store from the PDFs
# ----------------------------
# Embed the loaded documents with the ada-002 embedding model and index them
# in the "fatca_collection" Chroma collection; the retriever built later
# queries this store.
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
vector_store = Chroma.from_documents(
    documents,
    embedding=embedding_model,
    collection_name="fatca_collection",
)

In [None]:
# ----------------------------
# Step 3: Set Up a RetrievalQA Chain with LangChain
# ----------------------------
# Completion model used by the QA chain below and called directly by the
# agent functions defined later in the notebook.
llm = OpenAI(temperature=0.2)

# Simple "stuff" chain (kept for demonstration); the retriever is asked for
# k=5 results per query.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs=dict(k=5)),
)

In [None]:
# ----------------------------
# Step 4: Define Agent Functions for LangGraph Nodes
# ----------------------------
def compliance_agent(data: dict) -> dict:
    """
    Answer the incoming query against the FATCA PDFs via the retrieval QA chain.

    Reads ``data['query']`` (raising ``KeyError`` if it is absent), prefixes it
    with "FATCA: ", passes it to ``qa_chain.run`` and stores the answer under
    ``data['compliance']``. The same dict object is mutated and returned.
    """
    answer = qa_chain.run("FATCA: {}".format(data['query']))
    data['compliance'] = answer
    print("Compliance Agent Output:\n", answer)
    return data
def collaborative_synthesis_agent(data: dict) -> dict:
    """
    Two synthesis sub-agents exchange drafts to produce a refined synthesis.

    Only the compliance (FATCA) result in ``data['compliance']`` is used as
    source material (an empty string is used when the key is missing).

    Flow (five ``llm`` calls, in this order):
      1. Agent A writes an initial draft.
      2. Agent B writes an initial draft.
      3. Agent A refines its own draft using Agent B's draft.
      4. Agent B refines its own draft using Agent A's refined draft.
      5. Both refined drafts are merged into the final synthesis.

    Each ``llm(...)`` call is a separate prompt, so every refinement prompt
    carries the agent's own earlier draft explicitly; otherwise there would be
    nothing for the model to "refine".

    The final synthesis is stored under ``data['synthesized']``; the same dict
    object is mutated and returned.
    """
    compliance_result = data.get('compliance', '')

    # Generate initial drafts from two synthesis sub-agents
    draft_A = llm(
        f"Agent A: Based on the following FATCA data:\n\n{compliance_result}\n\nProvide an initial synthesis summary."
    )
    draft_B = llm(
        f"Agent B: Based on the following FATCA data:\n\n{compliance_result}\n\nProvide an initial synthesis summary."
    )

    # One refinement round. Each agent is shown its own draft plus its peer's.
    refined_A = llm(
        f"Agent A: Here is your initial draft:\n\n{draft_A}\n\n"
        f"Here is Agent B's draft:\n\n{draft_B}\n\n"
        f"Refine your synthesis using this feedback and the FATCA data:\n\n{compliance_result}"
    )
    refined_B = llm(
        f"Agent B: Here is your initial draft:\n\n{draft_B}\n\n"
        f"Here is Agent A's refined draft:\n\n{refined_A}\n\n"
        f"Refine your synthesis further using this feedback and the FATCA data:\n\n{compliance_result}"
    )

    # Aggregate both refined drafts into a final synthesis
    final_synthesis = llm(
        f"Combine the following refined drafts into a final unified synthesis:\n\n"
        f"Agent A: {refined_A}\n\nAgent B: {refined_B}\n\nFinal Synthesis:"
    )

    data['synthesized'] = final_synthesis
    print("Collaborative Synthesis Output:\n", final_synthesis)
    return data

def supervisor_agent(data: dict) -> dict:
    """
    Merge the compliance and synthesis outputs into one aggregated answer.

    Reads ``data['compliance']`` and ``data['synthesized']`` (each defaulting
    to an empty string), asks ``llm`` to aggregate them, stores the reply under
    ``data['supervised']`` and returns the same (mutated) dict.
    """
    compliance_text = data.get('compliance', '')
    synthesis_text = data.get('synthesized', '')

    # Prompt sections are separated by a blank line.
    sections = (
        "Supervisor: Given the following outputs, compile, aggregate, merge, and select the most comprehensive and accurate insights.",
        f"Compliance Output:\n{compliance_text}",
        f"Synthesis Output:\n{synthesis_text}",
        "Final Aggregated Output:",
    )
    aggregated = llm("\n\n".join(sections))

    data['supervised'] = aggregated
    print("Supervisor Output:\n", aggregated)
    return data

def report_agent(data: dict) -> dict:
    """
    Turn the supervisor's aggregated output into a detailed analyst report.

    Reads ``data['supervised']`` (empty string when missing), sends it to
    ``llm`` inside a report-writing prompt, stores the reply under
    ``data['report']`` and returns the same (mutated) dict.
    """
    prompt_template = (
        "Create a detailed analyst report based on the following aggregated output:\n\n"
        "{}\n\nReport:"
    )
    report_text = llm(prompt_template.format(data.get('supervised', '')))

    data['report'] = report_text
    print("Final Report:\n", report_text)
    return data


In [None]:
# ----------------------------
# Step 5: Build the Orchestration Graph Using LangGraph
# ----------------------------
# NOTE(review): `Graph` and `Node` come from the top-level `langgraph` import
# at the top of this notebook; confirm the installed LangGraph release really
# exposes these names with this call signature.
graph = Graph()

# One node per agent function, keyed by the name used in the edges below.
node_compliance = Node("compliance", func=compliance_agent)
node_collab_synthesis = Node("collaborative_synthesis", func=collaborative_synthesis_agent)
node_supervisor = Node("supervisor", func=supervisor_agent)
node_report = Node("report", func=report_agent)

# Register the nodes in pipeline order.
for _node in (node_compliance, node_collab_synthesis, node_supervisor, node_report):
    graph.add_node(_node)

# Dependencies as (upstream, downstream) pairs:
#   - collaborative synthesis consumes the compliance output;
#   - the supervisor gathers both the compliance and the synthesis outputs;
#   - the report is produced from the supervisor's output.
for _src, _dst in (
    ("compliance", "collaborative_synthesis"),
    ("compliance", "supervisor"),
    ("collaborative_synthesis", "supervisor"),
    ("supervisor", "report"),
):
    graph.add_edge(_src, _dst)

# ----------------------------
# Step 6: Execute the Orchestrated Workflow
# ----------------------------
if __name__ == "__main__":
    # Seed state handed to the graph: just the analyst's question.
    initial_data = dict(
        query="Summarize the compliance impact of recent FATCA amendments.",
    )

    result_data = graph.run(initial_data=initial_data)

    # Fall back to a placeholder when the run produced no 'report' entry.
    final_report = result_data.get("report", "No report generated.")
    print("\n=== Final Analyst Report ===\n", final_report)