In [1]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb langchain_chroma
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

You should consider upgrading via the '/Users/aj.duncanson/.pyenv/versions/3.10.0/envs/env310/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/aj.duncanson/.pyenv/versions/3.10.0/envs/env310/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma


from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [3]:
# Load variables from a .env file into the process environment (the key is read from there below).
load_dotenv()

True

In [4]:
# Read the OpenAI key from the environment; os.environ.get yields None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Define our LLM

In [5]:
# Chat model client used for every generation step in this notebook.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# Smoke test: a single round trip to confirm the key and model name are accepted.
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why did the cat sit on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 13, 'total_tokens': 33, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-BabdL354Dk881m46XOiamKd7jOtuo', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--9a9a21fa-6a97-4c57-8ae2-210a8eb7a080-0', usage_metadata={'input_tokens': 13, 'output_tokens': 20, 'total_tokens': 33, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## 1. Process PDF document

### Load PDF document

In [6]:
# Location of the PDS document. Set the PDS_PDF_PATH environment variable to point at
# your own copy; the fallback is the original author's local path.
PDF_PATH = os.environ.get(
    "PDS_PDF_PATH",
    "/Users/aj.duncanson/projects/structured-rag-pdf/data/QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf",
)

loader = PyPDFLoader(PDF_PATH)
pages = loader.load()

# Preview only the first two pages instead of dumping the whole document into the output.
pages[:2]

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.4 (Windows)', 'creationdate': '2023-08-30T13:59:26+10:00', 'moddate': '2023-08-30T13:59:29+10:00', 'trapped': '/False', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/data/QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf', 'total_pages': 48, 'page': 0, 'page_label': 'cov1'}, page_content='Comprehensive \nCar Insurance\nProduct Disclosure Statement'),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.4 (Windows)', 'creationdate': '2023-08-30T13:59:26+10:00', 'moddate': '2023-08-30T13:59:29+10:00', 'trapped': '/False', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/data/QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf', 'total_pages': 48, 'page': 1, 'page_label': '1'}, page_content='This is an important document about insurance. It explains what is and what is not covered \nunder the insurance policy and your and our oblig

### Split document

In [7]:
# Chunking configuration: pieces of at most 1500 characters (measured with len),
# with 200 characters of overlap between neighbouring chunks. The candidate
# separators are paragraph breaks, line breaks and single spaces.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_overlap=200,
    chunk_size=1500,
)

# Apply the splitter to every loaded page.
chunks = text_splitter.split_documents(pages)

### Create embeddings

In [8]:
def get_embedding_function():
    """Build the OpenAI embeddings client used by the notebook.

    Returns:
        An ``OpenAIEmbeddings`` instance configured with the
        ``text-embedding-ada-002`` model and the module-level ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )

# Shared embeddings client, reused by the evaluator and the vector store below.
embedding_function = get_embedding_function()
# Sanity check: embed a single word to confirm the embeddings call succeeds.
test_vector = embedding_function.embed_query("cat")

In [9]:
from langchain.evaluation import load_evaluator

# Evaluator that scores a pair of strings by the distance between their embeddings,
# computed with the same embeddings client as the rest of the notebook.
# NOTE(review): presumably a lower score means the two texts are semantically closer — confirm in the LangChain docs.
evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

{'score': 0.1745443723078154}

In [10]:
# Same reference word with a different city, for comparison with the previous score.
evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

{'score': 0.22435473882320334}

### Create vector database

In [11]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a persisted Chroma vector store from de-duplicated document chunks.

    Every chunk gets a deterministic UUIDv5 derived from its ``page_content``, so
    chunks with identical text collapse into a single entry (the first occurrence
    is kept).

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embeddings object handed to Chroma as ``embedding``.
        vectorstore_path: Directory handed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # A dict keeps insertion order, so each id stays paired with its own chunk.
    # Taking the ids from a set would hand them to Chroma in arbitrary order and
    # attach them to the wrong documents.
    unique_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        # setdefault keeps the first chunk seen for a given id.
        unique_by_id.setdefault(chunk_id, chunk)

    # Create a new Chroma database from the de-duplicated documents
    vectorstore = Chroma.from_documents(documents=list(unique_by_id.values()),
                                        ids=list(unique_by_id.keys()),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    return vectorstore

In [12]:
# Deterministic id per chunk: a UUIDv5 derived from the chunk text, so identical
# text always maps to the same id.
ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

# Ensure that only unique docs with unique ids are kept (first occurrence wins).
unique_ids = set()
unique_chunks = []

# The loop variable is `chunk_id`, not `id`: at cell level a variable called `id`
# would replace the builtin id() for the rest of the kernel session.
for chunk, chunk_id in zip(chunks, ids):
    if chunk_id in unique_ids:
        continue
    unique_ids.add(chunk_id)
    unique_chunks.append(chunk)

In [13]:
# Inspect the de-duplicated chunks built in the previous cell.
unique_chunks

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.4 (Windows)', 'creationdate': '2023-08-30T13:59:26+10:00', 'moddate': '2023-08-30T13:59:29+10:00', 'trapped': '/False', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/data/QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf', 'total_pages': 48, 'page': 0, 'page_label': 'cov1'}, page_content='Comprehensive \nCar Insurance\nProduct Disclosure Statement'),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.4 (Windows)', 'creationdate': '2023-08-30T13:59:26+10:00', 'moddate': '2023-08-30T13:59:29+10:00', 'trapped': '/False', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/data/QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf', 'total_pages': 48, 'page': 1, 'page_label': '1'}, page_content='This is an important document about insurance. It explains what is and what is not covered \nunder the insurance policy and your and our oblig

In [14]:
# Create vectorstore: index the chunks with the shared embeddings client and persist
# the Chroma database in the relative directory "vectorstore_chroma".
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_chroma")

## 2. Query for relevant data

In [15]:
# Load vectorstore: open the database persisted under "vectorstore_chroma".
# This rebinds `vectorstore`, so later cells use this handle rather than the one returned at creation.
vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [16]:
# Create retriever (similarity search over the vector store) and get the chunks
# relevant to a sample question.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("Who underwrites this policy")
relevant_chunks

[Document(id='8c3b5b6f-6d5f-5097-8a6b-5fee92200e9b', metadata={'producer': 'Adobe PDF Library 17.0', 'trapped': '/False', 'creationdate': '2023-08-30T13:59:26+10:00', 'moddate': '2023-08-30T13:59:29+10:00', 'source': '/Users/aj.duncanson/projects/structured-rag-pdf/data/QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf', 'page_label': '2', 'page': 2, 'total_pages': 48, 'creator': 'Adobe InDesign 18.4 (Windows)'}, page_content='Preparation date: 31 July 2023\nThis PDS is issued and underwritten by QBE. \nThis PDS does not consider your objectives, financial situation or needs. You should take into account your \npersonal circumstances when considering the information provided to decide if the product is right for you.\nQBE in the community\nWe are committed to giving back to the communities that we operate in. \nThrough Premiums4Good, we invest a portion of customer premiums into \ninvestments that have additional social or environmental features. So, when \nyou choose us as you

In [17]:
# Prompt template with two placeholders: {context} receives the retrieved chunk text
# and {question} receives the user's question.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [18]:
# Wrap the raw template string in a chat prompt template
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Stitch the retrieved chunk texts into one context string, with a "---" rule between chunks
context_text = "\n\n---\n\n".join(chunk.page_content for chunk in relevant_chunks)

# Fill both placeholders and show the final prompt text
prompt = prompt_template.format(
    question="Who underwrites this policy?",
    context=context_text,
)
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

Preparation date: 31 July 2023
This PDS is issued and underwritten by QBE. 
This PDS does not consider your objectives, financial situation or needs. You should take into account your 
personal circumstances when considering the information provided to decide if the product is right for you.
QBE in the community
We are committed to giving back to the communities that we operate in. 
Through Premiums4Good, we invest a portion of customer premiums into 
investments that have additional social or environmental features. So, when 
you choose us as your insurer, your premium automatically does some good.
2

---

of the insureds. We may rely on a request from one insured to change or cancel your 
policy or tell us where a claim payment should be paid. Where a payment is made to one 
ins

In [19]:
# Send the manually assembled prompt straight to the chat model.
llm.invoke(prompt)

AIMessage(content='The policy is underwritten by QBE.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 730, 'total_tokens': 739, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BabdkYOgzCqwOfAnUrJAeTgoHUlY4', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--ea373244-071f-418d-b3e8-40ef6f66d531-0', usage_metadata={'input_tokens': 730, 'output_tokens': 9, 'total_tokens': 739, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### Using Langchain Expression Language

In [20]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with a blank line between them.
    """
    texts = [document.page_content for document in docs]
    return "\n\n".join(texts)

# LCEL pipeline: the input question is passed through unchanged as "question", while
# "context" is produced by running the retriever on it and joining the hits with
# format_docs. The filled prompt is then sent to the chat model.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
# Run retrieve -> prompt -> generate end to end for one question.
rag_chain.invoke("What's the name of the insurance product?")

AIMessage(content='The name of the insurance product is QBE Comprehensive Car Insurance.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 13, 'prompt_tokens': 587, 'total_tokens': 600, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-Babdl9SvqtuBiIBtejjSZyZTXXKgN', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--f7cb9581-760f-4e9e-91ed-780799e8c957-0', usage_metadata={'input_tokens': 587, 'output_tokens': 13, 'total_tokens': 600, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [21]:
# Ask about a limit; the query says "rental car" while the answer below says "hire car".
rag_chain.invoke("What is the cover limit in case of rental car after an accident?")

AIMessage(content='The cover limit for a hire car after a not-at-fault car accident is that it will be provided until repairs authorized by the insurer are completed, until the reasonable costs to repair the car are paid, or until the claim is paid after the car has been assessed as a total loss. There is no specific dollar amount mentioned as a limit; instead, it is based on the duration of the necessary coverage as outlined.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 84, 'prompt_tokens': 1138, 'total_tokens': 1222, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_92e0377081', 'id': 'chatcmpl-Babdn8CktFvV1DVCgtFB660yJJTWR', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--57d5492e-85e

In [22]:
# Two-part question: is the item covered, and is a dollar limit stated?
# (Query wording corrected: the original read "Is cover is provided ..." and had no question mark.)
rag_chain.invoke("Is cover provided for baby capsules, and is there a dollar limit?")

AIMessage(content='Yes, cover is provided for baby capsules and child seats that are damaged or stolen when they are inside your car. However, there is no dollar limit mentioned for the replacement of these items. The coverage applies as long as they are not stolen from your car when the car itself isn’t stolen.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 59, 'prompt_tokens': 686, 'total_tokens': 745, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-Babdqh5Q78rv3PIh2CQMqgnSv7e5s', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--4b20a9bf-8f09-484c-a41f-f205c6d03c6e-0', usage_metadata={'input_tokens': 686, 'output_tokens': 59, 'total_tokens': 745, 'input_toke

In [23]:
# Ask for a single numeric limit.
rag_chain.invoke("What is the maximum amount of cover for legal liability?")

AIMessage(content='The maximum amount of cover for legal liability claims arising from any one incident is $30,000,000.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 22, 'prompt_tokens': 835, 'total_tokens': 857, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-Babdscd4aQ0AE8pWDeEAczzoVQT4l', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--797d2a53-61bf-49f1-8f12-266a1f4dc51d-0', usage_metadata={'input_tokens': 835, 'output_tokens': 22, 'total_tokens': 857, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### Generate structured responses

In [24]:
# Schema for one answer together with its supporting evidence: the answer text, the
# context passage it was taken from, and the reasoning linking the two.
# NOTE(review): the docstring and Field descriptions are presumably forwarded to the
# model as part of the structured-output schema, so treat them as prompt text.
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# Structured result for one cover item: four aspects, each carried as an
# AnswerWithSources (answer + sources + reasoning). This is the schema passed to
# llm.with_structured_output further down.
class ExtractedInfoWithReasoning(BaseModel):
    """Extracted information about the cover item"""
    summary_details: AnswerWithSources  # overall summary of the cover item
    dollar_limit: AnswerWithSources  # monetary limit, if any
    conditions: AnswerWithSources  # conditions attached to the cover
    exclusions: AnswerWithSources  # what the cover excludes

# Flat variant of the extraction schema: plain strings, without sources or reasoning.
# Each field has its own description; previously all four shared the copy-pasted
# text "Answer to question", which told the model nothing about what each field holds.
class ExtractedInfo(BaseModel):
    """Extracted information about the cover item"""
    summary: str = Field(description="Summary of what the cover item provides")
    dollar_limit: str = Field(description="Dollar limit that applies to the cover item, if any")
    conditions: str = Field(description="Specific conditions that apply to the cover item")
    exclusions: str = Field(description="Specific exclusions that apply to the cover item")

In [25]:
# Same retrieval and prompt steps as the earlier chain, but the model output is
# returned as an ExtractedInfoWithReasoning instance instead of a free-text message.
# NOTE(review): this rebinds `rag_chain`, replacing the free-text chain defined earlier.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfoWithReasoning, strict=True)
        )

# Ask for all four schema aspects in one question.
rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to personal items cover.")

ExtractedInfoWithReasoning(summary_details=AnswerWithSources(answer='The policy covers personal items inside the car that are damaged in an incident or stolen with the car. This includes items like clothes and medical devices, up to a total of $1,000.', sources='Personal items pay up to $1,000 in total for personal items inside your car which are: • damaged in an incident; or • stolen along with your car.', reasoning='The summary emphasizes what the policy covers regarding personal items, mentioning both situations in which items could be covered.'), dollar_limit=AnswerWithSources(answer='$1,000 total for personal items inside the car.', sources='pay up to $1,000 in total for personal items inside your car which are: • damaged in an incident; or • stolen along with your car.', reasoning='The dollar limit is explicitly stated in the text, confirming the maximum payout.'), conditions=AnswerWithSources(answer='Personal items must be damaged in an incident or stolen along with the car; the

In [27]:
# Structured extraction for hire-car cover after an accident.
rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to the provision of a rental car after an accident.")

ExtractedInfoWithReasoning(summary_details=AnswerWithSources(answer='If you have a not-at-fault car accident, you will receive a hire car until repairs are completed or your claim is settled. This applies to theft or attempted theft cases as well, for up to 14 days.', sources='Standard Feature  We will...  But not...\nHire car after a  \nnot-at-fault car \naccident\narrange and pay the \nreasonable daily cost of a \nsuitable hire car, if your car \nis damaged in a not-at-fault \ncar accident. \nIt will be provided: \n • until repairs authorised \nby us are completed; or\n • until we pay the \nreasonable costs to \nrepair your car; or\n • until we pay your claim \nafter your car has been \nassessed as a total loss.\n\nHire car after theft  \nor attempted theft\narrange and pay the \nreasonable daily cost of a \nsuitable hire car, if your car \nis stolen or damaged in an \nattempted theft.\nIt will be provided for  \nup to 14 days: \n • until your car is found \nand doesn’t need repairs;

In [28]:
# Structured extraction for hire-car cover after theft.
rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to the provision of a rental car after my car is stolen.")

ExtractedInfoWithReasoning(summary_details=AnswerWithSources(answer='If your car is stolen or damaged in an attempted theft, we will arrange and pay the reasonable daily cost of a suitable hire car for up to 14 days under the standard features of your policy. This benefit is provided until your car is found and doesn’t need repairs, until repairs are completed, until we pay the reasonable costs to repair your car, or until we pay your claim after your car is assessed as a total loss.', sources='arrange and pay the reasonable daily cost of a suitable hire car, if your car is stolen or damaged in an attempted theft. It will be provided for up to 14 days: • until your car is found and doesn’t need repairs; or • until repairs authorised by us are completed; or • until we pay the reasonable costs to repair your car; or • until we pay your claim after your car has been assessed as a total loss.', reasoning='The summary is derived from the provision that clearly states coverage for hire car e

In [29]:
# Yes/no question put through the four-aspect schema: the model still has to fill
# dollar_limit, conditions and exclusions, so read those fields with care.
rag_chain.invoke("Does this policy provide roadside assistance?")

ExtractedInfoWithReasoning(summary_details=AnswerWithSources(answer='No, this policy does not provide roadside assistance.', sources='What is comprehensive cover?', reasoning='The retrieved context includes details about standard features like travel expenses, emergency accommodation costs, and towing/storage costs, but there is no mention of roadside assistance being included in the policy.'), dollar_limit=AnswerWithSources(answer='The policy reimburses up to $100 for travel expenses and up to $1,000 for emergency accommodation and transport costs.', sources='Standard Feature We will... But not...', reasoning='The specific reimbursement limits for travel expenses and emergency accommodation costs are clearly outlined in the context.'), conditions=AnswerWithSources(answer='Reimbursement is conditional on your car being safe to drive and applicable only if the incident occurs less than 100km from home for emergency accommodation.', sources='Standard Feature We will... But not...', reaso

### Transform response into a dataframe

In [30]:
structured_response = rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to personal items cover.")

# model_dump() yields {field_name: {"answer": ..., "sources": ..., "reasoning": ...}}
response_fields = structured_response.model_dump()

# One-row frame holding a dict per cell (kept because the name `df` may be used elsewhere).
df = pd.DataFrame([response_fields])

# Build the display table directly from the nested dict: one column per extracted
# field and three rows, 'answer', 'source' and 'reasoning'. This replaces the
# chained `df[col][0][...]` lookups and the manual row lists.
structured_response_df = (
    pd.DataFrame(response_fields)
    .rename(index={"sources": "source"})
    .loc[["answer", "source", "reasoning"], list(response_fields)]
)
structured_response_df

Unnamed: 0,summary_details,dollar_limit,conditions,exclusions
answer,The personal items cover allows for payment up...,The dollar limit for personal items cover is $...,Conditions state that personal items must be e...,"Exclusions include cash, cheques, credit cards..."
source,"Personal items pay up to $1,000 in total for p...","pay up to $1,000 in total for personal items i...",The following Standard Features apply when you...,"any of the following items: • cash, cheques, c..."
reasoning,This summary captures the essence of the perso...,"The dollar limit of $1,000 indicates the maxim...",These conditions specify that the coverage is ...,These exclusions clarify what items are not co...
