# Retrieval-augmented generative AI pipeline
### to provide structured answeres to questions about documents

specifically, Insurance product disclosure statements

Author: AJ Duncanson, heavily borrowing from Thu Vu, see readme.

## 0. Setting up

In [1]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb langchain_chroma
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

You should consider upgrading via the '/Users/aj.duncanson/.pyenv/versions/3.10.0/envs/env310/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/aj.duncanson/.pyenv/versions/3.10.0/envs/env310/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma


from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### Define our LLM and our set of questions

In [5]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# llm.invoke("Tell me a joke about cats")

Define PDS set

In [6]:
PDS_folder = "/Users/aj.duncanson/projects/structured-rag-pdf/pds/"
results_folder = "/Users/aj.duncanson/projects/structured-rag-pdf/results/"


In [7]:
# provider = "QBE"
# PDSs = [
#     "QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf"
# ]
# product_name = "Comprehensive Car Insurance"

In [8]:
# provider = "Rollin"
# PDSs = [
#     "1747889291-rollin-car-insurance-pds-v4-29-apr-2025.pdf",
#     "1747889277-rollin-car-insurance-ped-v2-29-apr-2025.pdf"
# ]
# product_name = "Comprehensive Car Insurance"

In [9]:
# provider = "Youi"
# PDSs = [
#     "youi car-pds 20250702.pdf"
# ]
# product_name = "Comprehensive Car Insurance"

In [10]:
provider = "Budget Direct"
PDSs = [
    "Budget Direct car PDS A.pdf",
    "Budget Direct Car Gold PDS B.pdf"
]
product_name = "Comprehensive Gold Car Insurance"

Define question set

In [11]:

question_file = "car_insurance_questions.csv"
question_column = 'questions 202507-08'

questions = pd.read_csv(PDS_folder+question_file)
data_points = questions["data_point"]
questions = questions[question_column]

questions = [q + ", if I have the " + product_name + " product." for q in questions]


## 1. Process PDF document

### Load PDF document(s)

In [12]:

pages = []
for p in PDSs:
    loader = PyPDFLoader(PDS_folder + p)
    these_pages = loader.load()
    pages.extend(these_pages)

pages

[Document(metadata={'producer': '', 'creator': 'Quadient~Inspire Designer~16.0.635.4', 'creationdate': '2025-02-10T03:22:15+00:00', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/pds/Budget Direct car PDS A.pdf', 'total_pages': 40, 'page': 0, 'page_label': '1'}, page_content='Car Insurance\nProduct Disclosure Statement - Part A\nCar Insurance Policy\nGeneral Terms and Conditions\nThis document prepared on 21 September 2023'),
 Document(metadata={'producer': '', 'creator': 'Quadient~Inspire Designer~16.0.635.4', 'creationdate': '2025-02-10T03:22:15+00:00', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/pds/Budget Direct car PDS A.pdf', 'total_pages': 40, 'page': 1, 'page_label': '2'}, page_content='Product Issuer: Auto & General Insurance Company Limited\nABN 42 111 586 353\nAFS Licence No 285571\nRegistered Office: Level 13, 9 Sherwood Road, Toowong QLD 4066'),
 Document(metadata={'producer': '', 'creator': 'Quadient~Inspire Designer~16.0.635.4', 'creationdate':

### Split document

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)
chunks[0]

Document(metadata={'producer': '', 'creator': 'Quadient~Inspire Designer~16.0.635.4', 'creationdate': '2025-02-10T03:22:15+00:00', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/pds/Budget Direct car PDS A.pdf', 'total_pages': 40, 'page': 0, 'page_label': '1'}, page_content='Car Insurance\nProduct Disclosure Statement - Part A\nCar Insurance Policy\nGeneral Terms and Conditions\nThis document prepared on 21 September 2023')

### Create embeddings

In [14]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query("cat")

In [15]:
# from langchain.evaluation import load_evaluator

# evaluator = load_evaluator(evaluator="embedding_distance", 
#                             embeddings=embedding_function)

# evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

In [16]:
# evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

### Create vector database

In [17]:
import uuid

def create_vectorstore(chunks, embedding_function
                       #, vectorstore_path
                       ):

    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = [] 
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:       
            unique_ids.add(id)
            unique_chunks.append(chunk) 

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks, 
                                        ids=list(unique_ids),
                                        embedding=embedding_function, 
                                        #persist_directory = vectorstore_path
                                        )
    
    return vectorstore

In [18]:
# ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

# # Ensure that only unique docs with unique ids are kept
# unique_ids = set()
# unique_chunks = []

# unique_chunks = [] 
# for chunk, id in zip(chunks, ids):     
#     if id not in unique_ids:       
#         unique_ids.add(id)
#         unique_chunks.append(chunk) 

In [19]:
# unique_chunks

In [20]:
# Create vectorstore
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 #vectorstore_path="vectorstore_chroma"
                                 )

## 2. Query for relevant data

In [21]:
# Load vectorstore
#vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [22]:
# Create retriever 
retriever = vectorstore.as_retriever(search_type="similarity")

# relevant_chunks = retriever.invoke("Who underwrites this policy")
# relevant_chunks

In [23]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks. 
Your task is to answer the question about specific details of the insurance policy described in the retrieved context.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [24]:

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [25]:
# Concatenate context text
#context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# prompt = prompt_template.format(context=context_text, 
#                                 question="Who underwrites this policy?")
# print(prompt)

In [26]:
# llm.invoke(prompt)

### Using Langchain Expression Language

In [27]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [28]:

# rag_chain = (
#             {"context": retriever | format_docs, "question": RunnablePassthrough()}
#             | prompt_template
#             | llm
#         )
# rag_chain.invoke("What's the name of the insurance product?")

## 4. Generate structured responses

### Still exploring the structure that's most useful for our needs, and also using very early draft prompts.

In [29]:
# class AnswerWithSources(BaseModel):
#     """An answer to the question, with sources and reasoning."""
#     answer: str = Field(description="Answer to question")
#     sources: str = Field(description="Full direct text chunk from the context used to answer the question")
#     reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# class ExtractedInfoWithReasoning(BaseModel):
#     """Extracted information about the cover item"""
#     summary_details: AnswerWithSources
#     dollar_limit: AnswerWithSources
#     conditions: AnswerWithSources
#     exclusions: AnswerWithSources

# class JustInfo(BaseModel):
#     """Extracted information about the cover item"""
#     summary: str = Field(description="Answer to question")
#     dollar_limit: str = Field(description="Answer to question")
#     conditions: str = Field(description="Answer to question")
#     exclusions: str = Field(description="Answer to question")

class ExtractedInfo(BaseModel):
    """Extracted information about the cover item"""
    this_item_of_cover_is_included_as_standard: bool = Field("Cover is included as standard, not optional")
    this_item_of_cover_is_optional: bool = Field("Cover is optional, not included as standard")
    this_item_is_not_covered_at_all: bool = Field("This cover is not provided at all")

    #is_this_part_of_the_standard_cover_or_an_optional_cover_or_not_covered:  str = Field(description="Standard Cover or Optional Cover or Not Covered")
    summary: str = Field(description="Summary of answer to question, including all relevant numerical limits")
    #summary_2: str = Field(description="A concise summary of the cover provided")
    list_of_numerical_limits_related_to_this_item_of_cover: str = Field("List the numerical limits that apply to this specific item of cover including dollar limits, limits on the number of days the benefit can be paid, and limits to the number of years or number of kilometers used in determining whether this cover applies") 
    #numerical_limits_to_the_dollars_or_distance_or_timeframe_2: str = Field("Answer to question") 
    #dollar_limit: str = Field(description="Answer to question")
    #other_limits_to_cover: str = Field(description="What other limits apply to the cover, other than dollar amounts?")
    #other_limits_to_cover_2: str = Field(description="What distance, time period or other limits apply to the cover, other than dollar amounts?")
    conditions: str = Field(description="Answer to question")
    exclusions: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

In [30]:
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )



### Get structured responses and compile into a result format

In [31]:
# Get the structured responses for each question in the array

structured_responses = [rag_chain.invoke(q).model_dump() for q in questions]

In [32]:
structured_responses[0]

{'this_item_of_cover_is_included_as_standard': True,
 'this_item_of_cover_is_optional': False,
 'this_item_is_not_covered_at_all': False,
 'summary': 'Personal effects, including child seats or capsules, are covered as an additional benefit under the Comprehensive Gold Car Insurance policy; specific dollar limits apply as per the Additional Benefits section of the Product Disclosure Statement (PDS).',
 'list_of_numerical_limits_related_to_this_item_of_cover': 'Refer to the Additional Benefits section in PDS Part B for specific dollar limits on personal effects.',
 'conditions': 'For coverage of personal effects to apply, specific eligibility criteria must be met as stated in the PDS.',
 'exclusions': 'There are exclusions related to personal effects theft or damage if the theft is not reported to the police.',
 'sources': "For the event(s) and amount covered, please refer to the section entitled 'Additional Benefits' in PDS Part B.",
 'reasoning': 'The answer is derived from the docume

In [33]:
# Store in a df 

df = pd.DataFrame(structured_responses)

df.insert(0, "provider", provider)
df.insert(1, "PDS", ", ".join(PDSs))
df.insert(2, "data_point", data_points)
df.insert(3, "question", questions)
df 



Unnamed: 0,provider,PDS,data_point,question,this_item_of_cover_is_included_as_standard,this_item_of_cover_is_optional,this_item_is_not_covered_at_all,summary,list_of_numerical_limits_related_to_this_item_of_cover,conditions,exclusions,sources,reasoning
0,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",personal items,"Give me a summary, dollar limit, any other lim...",True,False,False,"Personal effects, including child seats or cap...",Refer to the Additional Benefits section in PD...,"For coverage of personal effects to apply, spe...",There are exclusions related to personal effec...,"For the event(s) and amount covered, please re...",The answer is derived from the document that i...
1,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",choice of repairer,"Can I choose my own repairer?, if I have the C...",False,True,False,If you have the Choice of Repairer optional co...,,If you have the Choice of Repairer optional co...,If your preferred repairer is not from the rep...,If you have the Choice of Repairer optional co...,The context specifies that the Choice of Repai...
2,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",agreed value,Can I have my vehicle insured for an agreed va...,False,False,False,"Yes, you can have your vehicle insured for an ...",,Eligibility criteria apply for the agreed valu...,None specified regarding the agreed value opti...,Summary of Cover states 'Choice of Market Valu...,The provided context mentions a choice between...
3,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",market value,Can I have my vehicle insured to cover its act...,True,False,False,"If you have Comprehensive Gold Car Insurance, ...",,Eligibility criteria apply when choosing the c...,,Your Insurance Certificate shows which of thes...,The context specifies that the Comprehensive c...
4,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",agreed value or market value,"If the car is a total loss, does this insuranc...",True,False,False,If the car is a total loss under the Comprehen...,,,,"If the car is a total loss, we will:\n- pay yo...","According to the provided context, the insuran..."
5,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",new car if written off,Will a new replacement car be provided if my c...,False,False,False,"If your car is a total loss, and you purchased...",New replacement car provided within 'the perio...,This benefit applies if the car was purchased ...,The benefit does not apply if the car had any ...,New Car Replacement ± If you purchased the car...,The answer is based on the conditions outlined...
6,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",windscreen cover,Give me the excess that applies to windscreen ...,False,True,False,"For windscreen claims, the excess amount is th...",The specific 'Window glass only' excess amount...,The 'Reduced Window Glass Excess' option is on...,No explicit exclusions were mentioned regardin...,This optional benefit is only available with ©...,The response summarizes the information from t...
7,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",windscreen cover 2,Does the insurance policy offer any windscreen...,False,True,False,The policy provides an optional benefit called...,The specific Window glass only excess amount i...,This benefit is only available if it is shown ...,No specific exclusions were mentioned for this...,This optional benefit is only available with ©...,The context indicates that the Reduced Window ...
8,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",windscreen cover 3,Does the policy excess apply to windscreen cla...,False,True,False,The policy excess applies to windscreen claims...,The specific excess amount for windscreen clai...,The reduced window glass excess benefit is opt...,Not applicable since the context does not ment...,3PRODUCT DISCLOSURE STATEMENT - PART B: Reduce...,The information indicates that the optional be...
9,Budget Direct,"Budget Direct car PDS A.pdf, Budget Direct Car...",emergency repairs without assessment,"Give me a summary, dollar limit, any other lim...",True,False,False,Emergency repairs cover is provided as part of...,,Emergency repairs are covered only to the exte...,There are no specific exclusions related to em...,Emergency Repairs\nEssential Repairs\nEmergenc...,The summary includes the understanding that em...


In [34]:
# save to csv file

timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")

df.to_csv(results_folder+"pds_output_"+provider+"_"+timestamp+".csv", index=False)
          