# Retrieval-augmented generative AI pipeline
### to provide structured answeres to questions about documents

specifically, Insurance product disclosure statements

Author: AJ Duncanson, heavily borrowing from Thu Vu, see readme.

## 0. Setting up

In [758]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb langchain_chroma
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

You should consider upgrading via the '/Users/aj.duncanson/.pyenv/versions/3.10.0/envs/env310/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/aj.duncanson/.pyenv/versions/3.10.0/envs/env310/bin/python -m pip install --upgrade pip' command.[0m


In [759]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma


from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [760]:
load_dotenv()

True

In [761]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### Define our LLM and our set of questions

In [762]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# llm.invoke("Tell me a joke about cats")

Define PDS set

In [763]:
PDS_folder = "/Users/aj.duncanson/projects/structured-rag-pdf/pds/"
results_folder = "/Users/aj.duncanson/projects/structured-rag-pdf/results/"


In [764]:
# provider = "QBE"
# PDSs = [
#     "QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf"
# ]

In [765]:
# provider = "Rollin"
# PDSs = [
#     "1747889291-rollin-car-insurance-pds-v4-29-apr-2025.pdf"
# ]

In [None]:
provider = "Youi"
PDSs = [
    "youi car-pds 20250702.pdf"
]
product_name = "Comprehensive Car Insurance"

In [None]:
# provider = "Budget Direct"
# PDSs = [
#     "Budget Direct car PDS A.pdf",
#     "Budget Direct Car Gold PDS B.pdf"
# ]
# product_name = "Comprehensive Gold Car Insurance"

Define question set

In [None]:

question_file = "car_insurance_questions.csv"
question_column = 'questions 20250702-06'

questions = pd.read_csv(PDS_folder+question_file)
data_points = questions["data_point"]
questions = questions[question_column]

questions = [q + ", ensuring that you only provide details relevant to the " + product_name + " product." for q in questions]


## 1. Process PDF document

### Load PDF document(s)

In [769]:

pages = []
for p in PDSs:
    loader = PyPDFLoader(PDS_folder + p)
    these_pages = loader.load()
    pages.extend(these_pages)

pages

[Document(metadata={'producer': '', 'creator': 'Quadient~Inspire Designer~16.0.635.4', 'creationdate': '2025-02-10T03:22:15+00:00', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/pds/Budget Direct car PDS A.pdf', 'total_pages': 40, 'page': 0, 'page_label': '1'}, page_content='Car Insurance\nProduct Disclosure Statement - Part A\nCar Insurance Policy\nGeneral Terms and Conditions\nThis document prepared on 21 September 2023'),
 Document(metadata={'producer': '', 'creator': 'Quadient~Inspire Designer~16.0.635.4', 'creationdate': '2025-02-10T03:22:15+00:00', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/pds/Budget Direct car PDS A.pdf', 'total_pages': 40, 'page': 1, 'page_label': '2'}, page_content='Product Issuer: Auto & General Insurance Company Limited\nABN 42 111 586 353\nAFS Licence No 285571\nRegistered Office: Level 13, 9 Sherwood Road, Toowong QLD 4066'),
 Document(metadata={'producer': '', 'creator': 'Quadient~Inspire Designer~16.0.635.4', 'creationdate':

### Split document

In [770]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

### Create embeddings

In [771]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query("cat")

In [772]:
# from langchain.evaluation import load_evaluator

# evaluator = load_evaluator(evaluator="embedding_distance", 
#                             embeddings=embedding_function)

# evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

In [773]:
# evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

### Create vector database

In [774]:
import uuid

def create_vectorstore(chunks, embedding_function
                       #, vectorstore_path
                       ):

    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = [] 
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:       
            unique_ids.add(id)
            unique_chunks.append(chunk) 

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks, 
                                        ids=list(unique_ids),
                                        embedding=embedding_function, 
                                        #persist_directory = vectorstore_path
                                        )
    
    return vectorstore

In [775]:
# ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

# # Ensure that only unique docs with unique ids are kept
# unique_ids = set()
# unique_chunks = []

# unique_chunks = [] 
# for chunk, id in zip(chunks, ids):     
#     if id not in unique_ids:       
#         unique_ids.add(id)
#         unique_chunks.append(chunk) 

In [776]:
# unique_chunks

In [777]:
# Create vectorstore
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 #vectorstore_path="vectorstore_chroma"
                                 )

## 2. Query for relevant data

In [778]:
# Load vectorstore
#vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [779]:
# Create retriever 
retriever = vectorstore.as_retriever(search_type="similarity")

# relevant_chunks = retriever.invoke("Who underwrites this policy")
# relevant_chunks

In [780]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks. 
Your task is to answer the question about specific details of the insurance policy described in the retrieved context.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

In [781]:
PROMPT_TEMPLATE

"\nYou are an assistant for question-answering tasks. \nYour task is to answer the question about specific details of the insurance policy described in the retrieved context.\nUse the following pieces of retrieved context to answer\nthe question. If you don't know the answer, say that you\ndon't know. DON'T MAKE UP ANYTHING.\n\n{context}\n\n---\n\nAnswer the question based on the above context: {question}\n"

## 3. Generate responses

In [782]:
# Concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [783]:

# prompt = prompt_template.format(context=context_text, 
#                                 question="Who underwrites this policy?")
# print(prompt)

In [784]:
# llm.invoke(prompt)

### Using Langchain Expression Language

In [785]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [786]:


# rag_chain = (
#             {"context": retriever | format_docs, "question": RunnablePassthrough()}
#             | prompt_template
#             | llm
#         )
# rag_chain.invoke("What's the name of the insurance product?")

In [787]:
# rag_chain.invoke("What is the cover limit in case of rental car after an accident?")

In [788]:
# rag_chain.invoke("Is cover is provided for baby capsules, and is there a dollar limit")

In [789]:
# rag_chain.invoke("What is the maximum amount of cover for legal liability?")

## 4. Generate structured responses

### Still exploring the structure that's most useful for our needs, and also using very early draft prompts.

In [790]:
# class AnswerWithSources(BaseModel):
#     """An answer to the question, with sources and reasoning."""
#     answer: str = Field(description="Answer to question")
#     sources: str = Field(description="Full direct text chunk from the context used to answer the question")
#     reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# class ExtractedInfoWithReasoning(BaseModel):
#     """Extracted information about the cover item"""
#     summary_details: AnswerWithSources
#     dollar_limit: AnswerWithSources
#     conditions: AnswerWithSources
#     exclusions: AnswerWithSources

# class JustInfo(BaseModel):
#     """Extracted information about the cover item"""
#     summary: str = Field(description="Answer to question")
#     dollar_limit: str = Field(description="Answer to question")
#     conditions: str = Field(description="Answer to question")
#     exclusions: str = Field(description="Answer to question")

class ExtractedInfo(BaseModel):
    """Extracted information about the cover item"""
    this_item_of_cover_is_included_as_standard: bool = Field("Cover is included as standard, not optional")
    this_item_of_cover_is_optional: bool = Field("Cover is optional, not included as standard")
    this_item_is_not_covered_at_all: bool = Field("This cover is not provided at all")

    #is_this_part_of_the_standard_cover_or_an_optional_cover_or_not_covered:  str = Field(description="Standard Cover or Optional Cover or Not Covered")
    summary: str = Field(description="Summary of answer to question, including all relevant numerical limits")
    #summary_2: str = Field(description="A concise summary of the cover provided")
    list_of_numerical_limits_related_to_this_item_of_cover: str = Field("List the numerical limits that apply to this specific item of cover including dollar limits, limits on the number of days the benefit can be paid, and limits to the number of years or number of kilometers used in determining whether this cover applies") 
    #numerical_limits_to_the_dollars_or_distance_or_timeframe_2: str = Field("Answer to question") 
    #dollar_limit: str = Field(description="Answer to question")
    #other_limits_to_cover: str = Field(description="What other limits apply to the cover, other than dollar amounts?")
    #other_limits_to_cover_2: str = Field(description="What distance, time period or other limits apply to the cover, other than dollar amounts?")
    conditions: str = Field(description="Answer to question")
    exclusions: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

In [791]:
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )



In [792]:
# rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to personal items cover.")

### Get structured responses and compile into a result format

In [793]:
# Get the structured responses for each question in the array

structured_responses = [rag_chain.invoke(q).model_dump() for q in questions]

In [796]:
structured_responses[0]

{'this_item_of_cover_is_included_as_standard': True,
 'this_item_of_cover_is_optional': False,
 'this_item_is_not_covered_at_all': False,
 'summary': 'Comprehensive Gold Car Insurance includes cover for personal effects, which encompasses items like child seats or capsules. There is a specific limit of $1,000 for stolen keys and locks, but there is no explicit limit mentioned for personal items.',
 'list_of_numerical_limits_related_to_this_item_of_cover': 'Up to $1,000 for stolen keys and locks; no specific dollar limit for personal effects mentioned.',
 'conditions': 'Personal effects coverage is included in the Comprehensive policy, subject to limits noted in the policy document.',
 'exclusions': "General exclusions apply, including loss of value or depreciation of personal items, loss of use or other financial loss arising from an incident, and costs that occur due to the mechanical or electronic breakdown of associated items, unless they are guaranteed under the insurer's Quality G

In [797]:
# Store in a df 

df = pd.DataFrame(structured_responses)

df.insert(0, "provider", provider)
df.insert(1, "PDS", ", ".join(PDSs))
df.insert(2, "data_point", data_points)
df.insert(3, "question", questions)
df 



Unnamed: 0,provider,PDS,data_point,question,this_item_of_cover_is_included_as_standard,this_item_of_cover_is_optional,this_item_is_not_covered_at_all,summary,list_of_numerical_limits_related_to_this_item_of_cover,conditions,exclusions,sources,reasoning
0,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",personal items,"Give me a summary, dollar limit, any other lim...",True,False,False,Comprehensive Gold Car Insurance includes cove...,"Up to $1,000 for stolen keys and locks; no spe...",Personal effects coverage is included in the C...,"General exclusions apply, including loss of va...","Summary of Cover, Policy Benefits and Options,...",The extracted information has been gathered ba...
1,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",choice of repairer,Can I choose my own repairer?only provide deta...,False,True,False,"With Comprehensive Gold Car Insurance, you can...",,You must provide a quotation from your preferr...,If 'Choice of Repairer' is not included in you...,Choice of Repairer (optional) This optional be...,The context clearly states that the ability to...
2,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",agreed value,Can I have my vehicle insured for an agreed va...,True,False,False,You can have your vehicle insured for an agree...,,Eligibility criteria apply for selecting an in...,,"If you have an agreed value policy, you should...",The provided context specifies that for those ...
3,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",market value,Can I have my vehicle insured for the market v...,True,False,False,"Yes, under the Comprehensive Gold Car Insuranc...",The maximum payout for a claim on a replacemen...,You need to specify an amount for after market...,Disability modifications and campervan/motorho...,The replacement car will be covered for the sa...,The question specifically asked about the Comp...
4,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",agreed value or market value,"If the car is a total loss, does this insuranc...",True,False,False,If the car is a total loss under Comprehensive...,,The agreed value includes the value of any acc...,,"If the car is a total loss, we will: pay you t...",The context specifies that for a total loss un...
5,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",new car if written off,Will a new replacement car be provided if my c...,True,False,False,If your car is written off and you purchased i...,Limit of 90 days for sourcing a replacement ca...,The new replacement car benefit applies if you...,The benefit does not apply if the car had any ...,New Car Replacement ± If you purchased the car...,This answer is based on the section that outli...
6,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",windscreen cover,Give me the excess that applies to windscreen ...,False,True,False,The basic excess applies to windscreen claims ...,The specific amount of the excess and the redu...,The reduced window glass excess is only availa...,Exclusions are not specified in the provided c...,3PRODUCT DISCLOSURE STATEMENT - PART B: Reduce...,The information specifies that there is a basi...
7,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",emergency repairs without assessment,"Give me a summary, dollar limit, any other lim...",True,False,False,Emergency travel & accommodation cover provide...,Up to $500 for emergency travel & accommodation.,The emergency travel & accommodation cover is ...,"General exclusions apply; for example, loss of...",Emergency travel & accommodation\nUp to $500 i...,The summary clearly indicates the limit placed...
8,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",emergency accommodation,"Give me a summary, dollar limit, any distance ...",True,False,False,Emergency accommodation and transport cover is...,Up to $500 for emergency travel & accommodatio...,You should arrange and pay for the emergency a...,Specific exclusions apply as per section 4; ge...,"Emergency Accommodation, Transport and Repairs...",The context clearly outlines the coverage for ...
9,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",emergency accommodation 2,Regarding insurance cover for emergency accomm...,True,False,False,Emergency accommodation and transport is cover...,No specific numerical limit provided for emerg...,The emergency accommodation and transport cove...,No exclusions were explicitly listed for the e...,"Emergency Accommodation, Transport and Repairs...",The question specifically asks about the detai...


In [798]:
# save to csv file

timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")

df.to_csv(results_folder+"pds_output_"+provider+"_"+timestamp+".csv", index=False)
          