In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

In [2]:
GROQ_API_KEY= "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"

In [3]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader

In [4]:
loader = PyPDFLoader("C:/Users/HP/Desktop/Projects/InsuranceTracker/Health Insurance/Aspire_Prospectus.pdf")
documents = loader.load()

In [5]:
len(documents)

30

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
)

In [7]:
documents = text_splitter.split_documents(documents)

In [8]:
len(documents)

65

In [9]:
import os
import getpass

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:········


In [10]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
db = Chroma.from_documents(documents, OpenAIEmbeddings())

In [11]:
retriever = db.as_retriever(search_kwargs={"k": 10})

In [12]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [13]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [14]:
llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")

In [15]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
query = f'''
The given document is either a health or term insurance policy prospectus or brochure. 
Your task is to provide important points from the exclusions for Niva Bupa Health Insurance that users should know before opting for the policy.
Instructions:
- Be as concise as possible.
- Each point should be in bullet points.
'''

In [17]:
rag_chain.invoke(query)

'Here are the important points from the exclusions of Niva Bupa Health Insurance that users should know before opting for the policy:\n\n• Expenses related to the treatment of certain diseases/procedures (e.g., cataract, hernia, osteoarthritis) are excluded until the expiry of 24 months/12 months of continuous coverage.\n• Cosmetic or plastic surgery, or any treatment to change appearance, unless for reconstruction following an Accident, Burn(s) or Cancer.\n• Expenses related to hazardous or adventure sports, including but not limited to, para-jumping, rock climbing, mountaineering, rafting, motor racing, horse racing or scuba diving.\n• Expenses for treatment directly arising from or consequent upon any Insured Person committing or attempting to commit a breach of law with criminal intent.\n• Expenses incurred towards treatment in any Hospital or by any Medical Practitioner or any other provider specifically excluded by the insurer.\n• Treatment for, alcoholism, drug or substance abus

## Structured Output Parsers

In [36]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

In [37]:
response_schemas = [
    ResponseSchema(name="Subsections", description="Extract all the subsections from a specific section of the document as a list")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [38]:
format_instructions = output_parser.get_format_instructions()

In [39]:
format_instructions

'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"Subsections": string  // Extract all the subsections from a specific section of the document as a list\n}\n```'

In [40]:
output_parser

StructuredOutputParser(response_schemas=[ResponseSchema(name='Subsections', description='Extract all the subsections from a specific section of the document as a list', type='string')])

In [48]:
template_string = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Take the question, and context in the below delimited by triple backticks and use it to answer the specific query.
question: ```{question}```
context: ```{context}```
{format_instructions}
Subsections:
"""

In [49]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
prompt2 = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(template_string)  
    ],
    input_variables=["context"],
    partial_variables={"format_instructions": format_instructions},
    output_parser=output_parser # here we add the output parser to the Prompt template
)

In [55]:

from langchain_core.runnables import RunnableParallel

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt2
    | llm
    | StrOutputParser()
)

In [56]:
output = rag_chain.invoke("List all the subsections of Exclusions section.")

In [62]:
import json
output2 = dictionary = json.loads(output)

In [63]:
output2

{'Subsections': ['4.1.3. 30-day waiting period (Code- Excl03)',
  '4.1.4. Investigation & Evaluation (Code-Excl04)',
  '4.1.5. Rest Cure, rehabilitation and respite care (Code-Excl05)',
  '4.1.6. Obesity/ Weight Control (Code-Excl06)',
  '4.1.8. Hazardous or Adventure sports (Code-Excl09)',
  '4.1.9. Breach of law (Code-Excl10)',
  '4.1.10. Excluded Providers (Code-Excl11)',
  '4.1.11. Treatment for, alcoholism, drug or substance abuse or any addictive condition and consequences thereof. (Code-Excl12)',
  '4.1.12. Treatments received in heath hydros, nature cure clinics, spas or similar establishments or private beds registered as a nursing home attached to such establishments or where admission is arranged wholly or partly for domestic reasons. (Code-Excl13)',
  '4.1.13. Refractive Error (Code-Excl15)',
  '4.1.14. Unproven Treatments (Code-Excl16)',
  '4.2.1. Personal Waiting Period',
  '4.2.2. Conflict & Disaster:',
  '4.2.3. External Congenital Anomaly:',
  '4.2.4. Dental treatment:

In [65]:
for i in output2['Subsections'][:5]:
    query = f'''
    The given document is either a health or term insurance policy prospectus or brochure. 
    Your task is to provide important points from the exclusions {i} from the give document that users should know before opting for the policy.
    Instructions:
    - Be as concise as possible.
    - Each point should be in bullet points.
    '''
    prompt = hub.pull("rlm/rag-prompt")
    llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")    
    rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    print(f"Summary of Exclusion {i}")
    print(rag_chain.invoke(query))
    print("----------------")
    

Summary of Exclusion 4.1.3. 30-day waiting period (Code- Excl03)
Here are the important points from the exclusions 4.1.3. 30-day waiting period (Code- Excl03):

• Expenses related to the treatment of any Illness within 30 days from the first Policy commencement date shall be excluded, except claims arising due to an Accident, provided the same are covered.
• This exclusion shall not apply if the Insured Person has continuous coverage for more than twelve months.
• The within referred waiting period is made applicable to the enhanced Sum Insured in the event of granting higher Sum Insured subsequently.
----------------
Summary of Exclusion 4.1.4. Investigation & Evaluation (Code-Excl04)
Here are the important points from the exclusions section 4.1.4. Investigation & Evaluation (Code-Excl04):

• Expenses related to any admission primarily for diagnostics and evaluation purposes only are excluded.
• Any diagnostic expenses which are not related or not incidental to the current diagnosis a