In [16]:
import os
import json
import boto3
from typing import TypedDict, List, Annotated, Sequence
import operator

# Langchain Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings # Using a common embedding model
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import BaseMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_aws import ChatBedrock # Updated import for Bedrock

# LangGraph Imports
from langgraph.graph import StateGraph, END

from qdrant_client import QdrantClient


In [17]:

# --- Configuration: every tunable constant for the pipeline lives in this cell ---
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
QDRANT_COLLECTION_NAME = "documentos_langchain" # Collection that ingestion populates and retrieval queries
PDF_PATH = "documento.pdf" # Relative path, so it is resolved against the current working directory
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Model name handed to HuggingFaceEmbeddings

# Amazon Bedrock model IDs, keyed by a short human-readable label
# (ensure these are available in your AWS region)
MODELOS_BEDROCK = {
    "Claude_3.5_Sonnet": "anthropic.claude-3-5-sonnet-20240620-v1:0",
    "Claude_3_Sonnet": "anthropic.claude-3-sonnet-20240229-v1:0",
    "Claude_3_Haiku": "anthropic.claude-3-haiku-20240307-v1:0", # Corrected Haiku model ID for recent versions
    # Add other models if needed
}
# Key into MODELOS_BEDROCK selecting the model used for answer generation
SELECTED_GENERATION_MODEL = "Claude_3.5_Sonnet"


In [18]:

# --- Initialize Clients and Models ---
try:
    qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
    # Constructing the client alone does not prove the server is reachable,
    # so issue one cheap request before reporting success.
    qdrant_client.get_collections()
    print(f"Successfully connected to Qdrant at {QDRANT_HOST}:{QDRANT_PORT}")
except Exception as e:
    print(f"Error connecting to Qdrant: {e}")
    print("Please ensure Qdrant server is running and accessible.")
    exit()

bedrock_runtime = boto3.client('bedrock-runtime')
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Inference parameters are named differently per model family, so pick the
# right set *before* building the LLM. Previously the constructor received a
# throwaway dict (with the legacy "max_tokens_to_sample" key) that was
# immediately replaced by mutating llm.model_kwargs.
generation_model_id = MODELOS_BEDROCK[SELECTED_GENERATION_MODEL]
if "anthropic" in generation_model_id:
    generation_model_kwargs = {"temperature": 0.8, "max_tokens": 100, "top_p": 0.8} # For Claude 3.5
else: # Example for Titan
    generation_model_kwargs = {"temperature": 0.8, "maxTokenCount": 100}

# Initialize the LLM for answer generation
llm = ChatBedrock(
    client=bedrock_runtime,
    model_id=generation_model_id,
    model_kwargs=generation_model_kwargs,
)


Successfully connected to Qdrant at localhost:6333


In [19]:

# --- LangGraph State Definition ---
class RAGState(TypedDict):
    """State dictionary handed from node to node in the RAG graph.

    Each node returns a partial dict containing only the keys it updates.
    """
    question: str  # User question; read by the retrieval and generation nodes
    documents: List[str] # Text of the retrieved chunks; written by the retrieval node
    answer: str  # Final LLM answer; written by the generation node
    ingestion_done: bool # Set to True by the ingestion node once it has ingested or skipped

# --- Node Functions for LangGraph ---

def ingest_documents_node(state: RAGState):
    """
    Populate the Qdrant collection from the PDF unless it already holds points.

    The collection is inspected first; when it reports more than zero points
    the node returns immediately. Otherwise the PDF at PDF_PATH is loaded,
    split into overlapping chunks, embedded and written to Qdrant.

    Args:
        state: Current graph state (not read by this node).

    Returns:
        A partial state update ``{"ingestion_done": True}``.

    Raises:
        Exception: Any failure while loading, splitting or writing is logged
            and re-raised so the graph run stops.
    """
    print("--- INGESTING DOCUMENTS ---")
    try:
        # Any failure while inspecting the collection (missing collection,
        # unusable point count, ...) is treated as "nothing ingested yet".
        try:
            collection_info = qdrant_client.get_collection(collection_name=QDRANT_COLLECTION_NAME)
            already_populated = collection_info.points_count > 0
        except Exception: # pylint: disable=broad-except
            print(f"Collection '{QDRANT_COLLECTION_NAME}' does not exist or is empty. Proceeding with ingestion.")
            already_populated = False

        if already_populated:
            print(f"Collection '{QDRANT_COLLECTION_NAME}' already exists and has {collection_info.points_count} points. Skipping ingestion.")
            return {"ingestion_done": True}

        # PDF pages -> overlapping chunks -> plain text of each chunk
        pdf_pages = PyPDFLoader(PDF_PATH).load()
        chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        doc_contents = []
        for chunk in chunker.split_documents(pdf_pages):
            doc_contents.append(chunk.page_content)
        print(f"Generated {len(doc_contents)} chunks from PDF.")

        # from_texts embeds the chunks and creates the collection when needed;
        # only the chunk text is stored, not the page metadata.
        Qdrant.from_texts(
            texts=doc_contents,
            embedding=embeddings,
            host=QDRANT_HOST,
            port=QDRANT_PORT,
            collection_name=QDRANT_COLLECTION_NAME,
            prefer_grpc=True,
        )
        print(f"Successfully ingested {len(doc_contents)} chunks into Qdrant collection '{QDRANT_COLLECTION_NAME}'.")
        return {"ingestion_done": True}

    except Exception as e:
        print(f"Error during document ingestion: {e}")
        # Propagate so a failed ingestion aborts the run instead of
        # continuing against an empty collection.
        raise

def retrieve_documents_node(state: RAGState):
    """
    Look up the chunks in Qdrant that best match the question.

    Args:
        state: Current graph state; ``state["question"]`` is the query.

    Returns:
        A partial state update ``{"documents": [...]}`` holding the text of
        up to five retrieved chunks.
    """
    print("--- RETRIEVING DOCUMENTS ---")
    question = state["question"]

    # Wrap the shared client in a vector store and expose it as a top-5 retriever
    top_five_retriever = Qdrant(
        client=qdrant_client,
        collection_name=QDRANT_COLLECTION_NAME,
        embeddings=embeddings,
    ).as_retriever(search_kwargs={"k": 5})

    doc_contents = []
    for hit in top_five_retriever.invoke(question):
        doc_contents.append(hit.page_content)

    print(f"Retrieved {len(doc_contents)} documents for question: '{question}'")
    return {"documents": doc_contents}

def generate_answer_node(state: RAGState):
    """
    Generates an answer using the LLM based on the question and retrieved documents.

    Args:
        state: Current graph state; reads ``state["question"]`` and
            ``state["documents"]`` (list of chunk texts).

    Returns:
        A partial state update ``{"answer": <str>}`` with the model's reply.
    """
    print("--- GENERATING ANSWER ---")
    question = state["question"]
    documents = state["documents"]

    context_text = "\n\n---\n\n".join(documents)

    system_prompt_content = (
        "You are an expert Q&A system. Your task is to answer the user's question based ONLY on the provided context. "
        "Follow these instructions strictly:\n"
        "1. Answer directly and concisely. Get straight to the point.\n"
        "2. Ensure your entire answer is a complete thought and ends naturally. Do not get cut off.\n"
        "3. Do NOT repeat the question in your answer.\n"
        "4. Do NOT use introductory phrases like 'Based on the context provided...', 'The context states...', "
        "'According to the document...', or similar preambles.\n"
        "5. Provide only the answer to the question, nothing else.\n"
        "6. Provide the answer in a single sentence or paragraph, do not enter line breaks.\n\n"
        "Context:\n"
        f"{context_text}"
    )

    # The system prompt embeds raw PDF text, so it is passed as a literal
    # SystemMessage for every model family. Routing it through
    # SystemMessagePromptTemplate.from_template would make any "{" or "}" in
    # the retrieved text be parsed as a template variable and fail.
    # Only the user turn is a template, filled with the question at invoke time.
    messages = [
        SystemMessage(content=system_prompt_content),
        HumanMessagePromptTemplate.from_template("{question}"),
    ]

    prompt = ChatPromptTemplate.from_messages(messages)

    chain = prompt | llm | StrOutputParser()

    print(f"Prompt being sent to LLM (first ~500 chars of context):\nSystem: {system_prompt_content[:500]}...\nUser: {question}")

    answer = chain.invoke({"question": question})
    print(f"Generated answer: {answer}")
    return {"answer": answer}


In [20]:

# --- Build the Graph ---
# Linear pipeline: ingest -> retrieve -> generate -> END
workflow = StateGraph(RAGState)

# Register each node under the name the edges refer to
for _node_name, _node_fn in (
    ("ingest_documents", ingest_documents_node),
    ("retrieve_documents", retrieve_documents_node),
    ("generate_answer", generate_answer_node),
):
    workflow.add_node(_node_name, _node_fn)

# Execution starts at ingestion; retrieval always follows it
workflow.set_entry_point("ingest_documents")
for _source, _target in (
    ("ingest_documents", "retrieve_documents"),
    ("retrieve_documents", "generate_answer"),
    ("generate_answer", END),
):
    workflow.add_edge(_source, _target)

# Compile into the runnable used by run_qa_pipeline
app = workflow.compile()


In [21]:

# --- Functions to process questions (similar to your notebook) ---
def load_questions(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed content."""
    with open(file_path, "r", encoding="utf-8") as question_file:
        raw_json = question_file.read()
    return json.loads(raw_json)

def run_qa_pipeline(qa_pairs):
    """Run the compiled graph once per question and collect the answers.

    Args:
        qa_pairs: Sized iterable of dicts, each with a mandatory ``"Q"``
            (question) and an optional ``"A"`` (reference answer).

    Returns:
        A list with one dict per input pair, containing the keys
        ``"question"``, ``"expected_answer"`` (``"N/A"`` when the pair has
        no ``"A"``) and ``"generated_answer"``.
    """
    results = []
    for i, qa in enumerate(qa_pairs):
        question = qa["Q"]
        reference = qa.get("A", "N/A")  # not every pair carries a reference answer
        print(f"\nProcessing Question {i+1}/{len(qa_pairs)}: {question}")

        # Every question starts from a fresh state; the ingestion node
        # overwrites ingestion_done itself.
        outcome = app.invoke({"question": question, "ingestion_done": False})

        record = {
            "question": question,
            "expected_answer": reference,
            "generated_answer": outcome.get("answer", "Error: No answer generated"),
        }
        results.append(record)
    return results


In [22]:

# --- Main Execution ---
if __name__ == "__main__":
    # Pre-flight check: the source PDF must be present before anything runs
    if not os.path.exists(PDF_PATH):
        print(f"Error: PDF document not found at '{PDF_PATH}'. Please provide the correct path.")
        exit()

    # Both question sets are required
    expert_qa_path = "Expert-questions.json"
    non_expert_qa_path = "Not-expert-questions.json"

    question_files_present = os.path.exists(expert_qa_path) and os.path.exists(non_expert_qa_path)
    if not question_files_present:
        print(f"Error: Ensure '{expert_qa_path}' and '{non_expert_qa_path}' are in the same directory.")
        exit()

    expert_qa = load_questions(expert_qa_path)
    non_expert_qa = load_questions(non_expert_qa_path)

    # Answer the non-expert set first, then the expert set
    print("\n--- Processing Non-Expert Questions ---")
    non_expert_results = run_qa_pipeline(non_expert_qa)

    print("\n--- Processing Expert Questions ---")
    expert_results = run_qa_pipeline(expert_qa)

    # Persist each result set as pretty-printed UTF-8 JSON
    with open("nonExpert_answers_langchain.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(non_expert_results, ensure_ascii=False, indent=4))
    print("\nSaved non-expert results to nonExpert_answers_langchain.json")

    with open("expert_answers_langchain.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(expert_results, ensure_ascii=False, indent=4))
    print("Saved expert results to expert_answers_langchain.json")

    print("\n--- Pipeline Finished ---")

    # Show the first record of each set as a quick sanity check
    print("\nSample Non-Expert Result:")
    if len(non_expert_results) > 0:
        print(json.dumps(non_expert_results[0], indent=2, ensure_ascii=False))
    print("\nSample Expert Result:")
    if len(expert_results) > 0:
        print(json.dumps(expert_results[0], indent=2, ensure_ascii=False))


--- Processing Non-Expert Questions ---

Processing Question 1/40: What are ESG risks?
--- INGESTING DOCUMENTS ---
Collection 'documentos_langchain' already exists and has 537 points. Skipping ingestion.
--- RETRIEVING DOCUMENTS ---
Retrieved 5 documents for question: 'What are ESG risks?'
--- GENERATING ANSWER ---
Prompt being sent to LLM (first ~500 chars of context):
System: You are an expert Q&A system. Your task is to answer the user's question based ONLY on the provided context. Follow these instructions strictly:
1. Answer directly and concisely. Get straight to the point.
2. Ensure your entire answer is a complete thought and ends naturally. Do not get cut off.
3. Do NOT repeat the question in your answer.
4. Do NOT use introductory phrases like 'Based on the context provided...', 'The context states...', 'According to the document...', or similar preambles.
5....
User: What are ESG risks?
Generated answer: ESG risks are environmental, social and governance risks that can impa