# Retrieval-augmented generative AI pipeline
### to provide structured answers to questions about documents

Specifically, insurance product disclosure statements (PDSs).

Author: AJ Duncanson, heavily borrowing from Thu Vu, see readme.

## 0. Setting up

In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb langchain_chroma
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [None]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma


from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [None]:
load_dotenv()

In [None]:
# Read the OpenAI API key from the process environment (load_dotenv() is called
# in the previous cell). The value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### Define our LLM and our set of questions

In [None]:
# Chat model that generates the answers in the RAG chain further down.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# llm.invoke("Tell me a joke about cats")  # optional manual smoke test

Define PDS set

In [None]:
# Input (PDS PDFs + question CSV) and output (result CSV) directories.
# File paths in this notebook are built by plain string concatenation
# (e.g. PDS_folder + file_name), so both values must end with a path separator.
# NOTE(review): these are absolute, machine-specific paths — adjust before
# running on another machine.
PDS_folder = "/Users/aj.duncanson/projects/structured-rag-pdf/pds/"
results_folder = "/Users/aj.duncanson/projects/structured-rag-pdf/results/"


In [None]:
# provider = "QBE"
# PDSs = [
#     "QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf"
# ]
# product_name = "Comprehensive Car Insurance"

In [None]:
# provider = "Rollin"
# PDSs = [
#     "1747889291-rollin-car-insurance-pds-v4-29-apr-2025.pdf"
# ]
# product_name = "Comprehensive Car Insurance"

In [None]:
# provider = "Youi"
# PDSs = [
#     "youi car-pds 20250702.pdf"
# ]
# product_name = "Comprehensive Car Insurance"

In [None]:
# Active provider configuration (alternatives are kept commented out above).
#   provider     - label written to the results table and the output file name
#   PDSs         - PDF file names, relative to PDS_folder, that are loaded together
#   product_name - appended to every question to name the product being asked about
provider = "Budget Direct"
PDSs = [
    "Budget Direct car PDS A.pdf",
    "Budget Direct Car Gold PDS B.pdf"
]
product_name = "Comprehensive Gold Car Insurance"

Define question set

In [None]:

# Question source: a CSV inside PDS_folder that holds a "data_point" column
# and the column of question text selected below.
question_file = "car_insurance_questions.csv"
question_column = 'questions 20250702-06'

question_table = pd.read_csv(PDS_folder + question_file)
data_points = question_table["data_point"]

# Scope every question to the product under review.
product_suffix = ", if I have the " + product_name + " product."
questions = [raw_question + product_suffix for raw_question in question_table[question_column]]


## 1. Process PDF document

### Load PDF document(s)

In [None]:

# Load every configured PDS file and pool the documents returned by the
# loader into a single flat list.
pages = []
for pdf_name in PDSs:
    pages.extend(PyPDFLoader(PDS_folder + pdf_name).load())

pages

### Split document

In [None]:
# Split the loaded pages into overlapping chunks for embedding.
# chunk_size and chunk_overlap are counted in characters because
# length_function is len; separators are tried in the order listed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)
chunks[0]  # display the first chunk as a quick visual check

### Create embeddings

In [None]:
def get_embedding_function():
    """Build the OpenAI embedding client used to vectorise text.

    Returns:
        An ``OpenAIEmbeddings`` instance configured with the
        ``text-embedding-ada-002`` model and the notebook-level API key.
    """
    return OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_key=OPENAI_API_KEY,
    )

embedding_function = get_embedding_function()
# Presumably a smoke test of the embedding client: test_vector is not used
# anywhere else in the visible notebook.
test_vector = embedding_function.embed_query("cat")

In [None]:
# from langchain.evaluation import load_evaluator

# evaluator = load_evaluator(evaluator="embedding_distance", 
#                             embeddings=embedding_function)

# evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

In [None]:
# evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

### Create vector database

In [None]:
import uuid

def create_vectorstore(chunks, embedding_function):
    """Create a Chroma vector store from de-duplicated document chunks.

    Every chunk is given a deterministic UUIDv5 derived from its text, so
    chunks with identical content collapse into a single entry (the first
    occurrence is kept).

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding object handed to Chroma as ``embedding``.

    Returns:
        The Chroma vector store built from the unique chunks.
    """
    # Map id -> chunk. A dict preserves insertion order, which keeps every id
    # paired with the chunk it was derived from. Building the id list from a
    # set instead would yield the ids in arbitrary order and store documents
    # under ids that belong to other chunks.
    chunks_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        chunks_by_id.setdefault(chunk_id, chunk)

    # persist_directory is deliberately not passed here; add it to the call
    # below if the store should be kept on disk between runs.
    return Chroma.from_documents(
        documents=list(chunks_by_id.values()),
        ids=list(chunks_by_id),
        embedding=embedding_function,
    )

In [None]:
# ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

# # Ensure that only unique docs with unique ids are kept
# unique_ids = set()
# unique_chunks = []

# unique_chunks = [] 
# for chunk, id in zip(chunks, ids):     
#     if id not in unique_ids:       
#         unique_ids.add(id)
#         unique_chunks.append(chunk) 

In [None]:
# unique_chunks

In [None]:
# Create vectorstore from the chunks produced by the text splitter, embedded
# with the OpenAI embedding function defined earlier.
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 #vectorstore_path="vectorstore_chroma"
                                 )

## 2. Query for relevant data

In [None]:
# Load vectorstore
#vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [None]:
# Create retriever: plain similarity search over the vector store. No
# search_kwargs are passed, so the number of chunks returned per query is
# whatever the library default is.
retriever = vectorstore.as_retriever(search_type="similarity")

# relevant_chunks = retriever.invoke("Who underwrites this policy")
# relevant_chunks

In [None]:
# Prompt template. The {context} placeholder receives the retrieved chunk text
# and {question} receives the question when the chain is invoked.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks. 
Your task is to answer the question about specific details of the insurance policy described in the retrieved context.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [None]:

# Create prompt: wrap the raw template string in a ChatPromptTemplate so it
# can be composed into the chain with the | operator.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [None]:
# Concatenate context text
#context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# prompt = prompt_template.format(context=context_text, 
#                                 question="Who underwrites this policy?")
# print(prompt)

In [None]:
# llm.invoke(prompt)

### Using Langchain Expression Language

In [None]:
def format_docs(docs):
    """Join the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values separated by a blank line; an empty
        string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

In [None]:

# rag_chain = (
#             {"context": retriever | format_docs, "question": RunnablePassthrough()}
#             | prompt_template
#             | llm
#         )
# rag_chain.invoke("What's the name of the insurance product?")

## 4. Generate structured responses

### Still exploring the structure that's most useful for our needs, and also using very early draft prompts.

In [None]:
# class AnswerWithSources(BaseModel):
#     """An answer to the question, with sources and reasoning."""
#     answer: str = Field(description="Answer to question")
#     sources: str = Field(description="Full direct text chunk from the context used to answer the question")
#     reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# class ExtractedInfoWithReasoning(BaseModel):
#     """Extracted information about the cover item"""
#     summary_details: AnswerWithSources
#     dollar_limit: AnswerWithSources
#     conditions: AnswerWithSources
#     exclusions: AnswerWithSources

# class JustInfo(BaseModel):
#     """Extracted information about the cover item"""
#     summary: str = Field(description="Answer to question")
#     dollar_limit: str = Field(description="Answer to question")
#     conditions: str = Field(description="Answer to question")
#     exclusions: str = Field(description="Answer to question")

class ExtractedInfo(BaseModel):
    """Structured answer about one item of cover, extracted from the PDS context.

    Used as the output schema for ``llm.with_structured_output``. All guidance
    for the model is supplied through ``description=``: the first positional
    argument of ``Field`` is the field's *default value*, so passing the
    guidance positionally would make it a string default (even on ``bool``
    fields) and leave the field without any description.
    """

    # Cover status flags; the descriptions imply exactly one should be true.
    this_item_of_cover_is_included_as_standard: bool = Field(description="Cover is included as standard, not optional")
    this_item_of_cover_is_optional: bool = Field(description="Cover is optional, not included as standard")
    this_item_is_not_covered_at_all: bool = Field(description="This cover is not provided at all")

    #is_this_part_of_the_standard_cover_or_an_optional_cover_or_not_covered:  str = Field(description="Standard Cover or Optional Cover or Not Covered")
    summary: str = Field(description="Summary of answer to question, including all relevant numerical limits")
    #summary_2: str = Field(description="A concise summary of the cover provided")
    list_of_numerical_limits_related_to_this_item_of_cover: str = Field(description="List the numerical limits that apply to this specific item of cover including dollar limits, limits on the number of days the benefit can be paid, and limits to the number of years or number of kilometers used in determining whether this cover applies")
    #numerical_limits_to_the_dollars_or_distance_or_timeframe_2: str = Field("Answer to question") 
    #dollar_limit: str = Field(description="Answer to question")
    #other_limits_to_cover: str = Field(description="What other limits apply to the cover, other than dollar amounts?")
    #other_limits_to_cover_2: str = Field(description="What distance, time period or other limits apply to the cover, other than dollar amounts?")
    conditions: str = Field(description="Answer to question")
    exclusions: str = Field(description="Answer to question")
    # Supporting evidence and justification for the answer.
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

In [None]:
# RAG chain: the input question is sent to the retriever (whose documents are
# joined into one string by format_docs) and is also passed through unchanged.
# Both values fill the prompt, and the LLM is asked to return its answer in
# the ExtractedInfo structure.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )



### Get structured responses and compile into a result format

In [None]:
# Get the structured responses for each question in the array.
# Questions are run one at a time; model_dump() converts each result into a
# plain dict so the list can be loaded straight into a DataFrame.

structured_responses = [rag_chain.invoke(q).model_dump() for q in questions]

In [None]:
structured_responses[0]

In [None]:
# Store in a df: one row per question, one column per key of the response
# dicts, with identifying columns inserted at the front.

df = pd.DataFrame(structured_responses)

# provider and the joined PDS file names are scalars, so they repeat on every row.
df.insert(0, "provider", provider)
df.insert(1, "PDS", ", ".join(PDSs))
df.insert(2, "data_point", data_points)
df.insert(3, "question", questions)
df 



In [None]:
# Save to a csv file whose name carries the provider and a run timestamp

timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
output_name = "pds_output_" + provider + "_" + timestamp + ".csv"

df.to_csv(results_folder + output_name, index=False)
          