In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os
import dotenv
dotenv.load_dotenv()

True

In [16]:
claude_key= os.getenv("CLAUDE_KEY")




In [418]:
paths =[
    "90ade7e39d5e481f9aeb772a19a30234.pdf",
    "English Health Handbook.pdf",
    "English Motor Handbook.pdf",
    "insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf"
]

In [419]:
docs = [PyPDFLoader(url).load() for url in paths]
docs_list = [item for sublist in docs for item in sublist]


In [420]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=50
)
doc_splits = text_splitter.split_documents(docs_list)

In [421]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [422]:
import lancedb


def lanceDBConnection(embed):
    db = lancedb.connect("/tmp/lancedb")
    table = db.create_table(
        "crag_demo",
        data=[{"vector": embed.embed_query("Hello World"), "text": "Hello World"}],
        mode="overwrite",
    )

    return table

In [423]:
from langchain_community.vectorstores import LanceDB

table = lanceDBConnection(model)

vectorstore = LanceDB.from_documents(
    documents=doc_splits,
    embedding=model,
    connection=table,
)

In [428]:
retriever = vectorstore.as_retriever()

In [429]:
from typing import Dict, TypedDict
from langchain_core.messages import BaseMessage
import json
import operator
from typing import Annotated, Sequence, TypedDict
import langsmith
from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import BaseMessage, FunctionMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough
from langchain_core.utils.function_calling import convert_to_openai_tool


class GraphState(TypedDict):
    keys: Dict[str, any]

In [430]:

def retrieve(state):#Node 1. will act as a tool
    """
    Helper function for retrieving documents. 

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, documents, that contains retrieved documents
    """
    print("*" * 5, " RETRIEVE ", "*" * 5)
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = retriever.invoke(question)
    
    return {"keys": {"documents": documents, "question": question}}#return the same state dict

In [431]:
from langchain_anthropic import ChatAnthropic

llm = ChatAnthropic(model_name="claude-3-5-sonnet-20240620",api_key=claude_key, streaming=True, model_kwargs=dict(system='You must complete several tasks '))

In [432]:

def grade_documents(state):#node 2
    """
    Determines whether the retrieved documents are relevant to the question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with relevant documents
    """

    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    
    binary_score: str = Field(description="Relevance score 'yes' or 'no'")

    # LLM

    # Prompt
    prompt = PromptTemplate(
        template="""You are a grader assessing relevance of a policy document to a user question. \n
        Here is the retrieved document: \n\n {context} \n\n
        Here is the user question: {question} \n
        If the document relates to the user question, grade it as relevant. \n
        Ensure that the user's question's specific policy question matches the policy type (auto, pet, life, etc) in the documents.
        Only give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
        Only answer with the words 'yes' or 'no'. Do not provide any other text. \n\n
        """,
        input_variables=["context", "question"],
    )

    # Chain
    chain = prompt | llm 
    print("here2")

    # Score
    filtered_docs = []
    search = "No"  # Default do not opt for web search to supplement retrieval
    for d in documents:
        score = chain.invoke({"question": question, "context": d.page_content})
        score = [score.content]
        print("THIS IS THE SCORE:", score)
        grade2 = score[0]
        if grade2 == "yes":
            print("*" * 5, " RATED DOCUMENT: RELEVANT", "*" * 5)
            filtered_docs.append(d)
        else:
            print("*" * 5, " RATED DOCUMENT: NOT RELEVANT", "*" * 5)
            continue
        
        #if not even half the docs are relevant
        if len(filtered_docs) < int(len(documents)/2):
            search = "Yes"

    return {
        "keys": {
            "documents": filtered_docs,
            "question": question,
            "run_web_search": search,
        }
    }

In [433]:
def generate(state):#node 3. also, end.
    """
    Helper function for generating answers

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, generation, that contains LLM generation
    """
    print("*" * 5, " GENERATE ", "*" * 5)
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # Prompt
    prompt = PromptTemplate(
    template='''
    You are an assistant for insurance related question-answering tasks. 
    Use the following pieces of insurance policies context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Give as much information as possible. Use a professional tone, and elaborate as much as you can.

    Question: {question} 

    Context: {context} 

    Answer:
    ''',
    input_variables=["question", "context"],)

    # LLM


    # RAG Chain
    rag_chain = prompt | llm 

    # Run generation
    generation = rag_chain.invoke({"context": documents, "question": question})
    return {
        "keys": {"documents": documents, "question": question, "generation": generation}
    }

In [434]:
def transform_query(state):#node 4
    """
    Helper function for transforming the query to produce a better question.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates question key with a re-phrased question
    """

    print("*" * 5, "TRANSFORM QUERY", "*" * 5)
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # Create a prompt template with format instructions and the query
    prompt = PromptTemplate(
        template="""You are generating questions that is well optimized for retrieval. \n
        Look at the input and try to reason about the underlying sematic intent / meaning. \n
        Only return the question, no further explanation or text.\n
        Here is the initial question:
        \n --------- \n
        {question}
        \n --------- \n
        Formulate an improved question: """,
        input_variables=["question"],
    )

    model = llm

    # Prompt
    chain = prompt | model | StrOutputParser()
    better_question = chain.invoke({"question": question})
    print(better_question)

    return {"keys": {"documents": documents, "question": better_question}}

In [435]:
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_KEY")

In [436]:
def decide_to_generate(state):#node 5
    """
    Helper function to determine whether to generate an answer or re-generate a question for web search.

    Args:
        state (dict): The current state of the agent, including all keys.

    Returns:
        str: Next node to call
    """

    print("*" * 5, " DECIDE TO GENERATE ", "*" * 5)
    state_dict = state["keys"]
    search = state_dict["run_web_search"]

    if search == "Yes":
        # All documents have been filtered check_relevance
        # We will re-generate a new query
        print("*" * 5, " DECISION: TRANSFORM QUERY and RUN WEB SEARCH ", "*" * 5)
        return "transform_query"
    else:
        # We have relevant documents, so generate answer
        print("*" * 5, " DECISION: GENERATE ", "*" * 5)
        return "generate"

In [437]:
from tavily import TavilyClient

tavily_client = TavilyClient(api_key="tvly")

In [438]:
def web_search(state):#node 6
    """
    Helper function to do Web search based on the re-phrased question using Tavily API.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): Updates documents key with appended web results
    """

    print("*" * 5, " WEB SEARCH ", "*" * 5)
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    tool = tavily_client.search(    query=question,
                                    search_depth="advanced",
                                    include_answer=True,
                                    include_domains=["https://www.acko.com/car-insurance/irdai-rules/", "https://www.lexisnexis.in/blogs/insurance-law-in-india/."])

    docs = tool["answer"]
    #web_results = "\n".join([d["content"] for d in docs])
    #print(web_results)
    web_results = Document(page_content=docs)
    documents.append(web_results)

    return {"keys": {"documents": documents, "question": question}}

In [439]:
from langgraph.graph import END, StateGraph

workflow = StateGraph(GraphState)#inherit empty state

# Define the nodes
#nodes work by (node_name, function_of_node)
workflow.add_node("retrieve", retrieve)  # retrieve docs
workflow.add_node("grade_documents", grade_documents)  # grade retrieved docs
workflow.add_node("generate", generate)  # generate answers
workflow.add_node("transform_query", transform_query)  # transform_query for web search
workflow.add_node("web_search", web_search)  # web search

# Build graph
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")
workflow.add_conditional_edges(#conditional edges are based on a condition, which is a function that returns a boolean value
    "grade_documents",
    decide_to_generate,
    {
        "transform_query": "transform_query",
        "generate": "generate",
    },
)
workflow.add_edge("transform_query", "web_search")
workflow.add_edge("web_search", "generate")
workflow.add_edge("generate", END)

# Compile
app = workflow.compile()


In [440]:
op = app.invoke({"keys": {"question": "How long can I stay in the hospital, maximum?"}})

*****  RETRIEVE  *****
here2
THIS IS THE SCORE: ['yes']
*****  RATED DOCUMENT: RELEVANT *****
THIS IS THE SCORE: ['no']
*****  RATED DOCUMENT: NOT RELEVANT *****
THIS IS THE SCORE: ['yes']
*****  RATED DOCUMENT: RELEVANT *****
THIS IS THE SCORE: ['no']
*****  RATED DOCUMENT: NOT RELEVANT *****
*****  DECIDE TO GENERATE  *****
*****  DECISION: TRANSFORM QUERY and RUN WEB SEARCH  *****
***** TRANSFORM QUERY *****
What is the maximum allowed length of stay for patients in a hospital?
*****  WEB SEARCH  *****
*****  GENERATE  *****


In [441]:
x = dict(op)

In [442]:
final_op=x["keys"]["generation"].content

In [443]:
final_op

'Based on the provided context, there is no specific maximum allowed length of stay mentioned for patients in a hospital. The policy documents do not define a strict limit on the duration of hospitalization.\n\nHowever, the context does provide some relevant information about hospitalization:\n\n1. The policy defines "Hospitalisation" as admission in a Hospital for a minimum period of 24 consecutive hours of In-Patient Care, except for specified procedures/treatments where admission could be for less than 24 consecutive hours.\n\n2. The policy covers "Inpatient Care," which is defined as treatment for which the insured person has to stay in a hospital for more than 24 hours for a covered event.\n\n3. The policy emphasizes "Medically Necessary" treatment, which is defined as treatment that:\n   a) Is required for the medical management of the illness or injury\n   b) Must not exceed the level of care necessary to provide safe, adequate, and appropriate medical care\n   c) Must have been