# Retrieval-augmented generative AI pipeline
### to provide structured answers to questions about documents

Specifically, insurance product disclosure statements (PDS).

Author: AJ Duncanson, heavily borrowing from Thu Vu, see readme.

## 0. Setting up

In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb langchain_chroma
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [None]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

#from langchain.vectorstores import Chroma
from langchain_chroma import Chroma


from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [None]:
# Load environment variables (e.g. OPENAI_API_KEY, read in the next cell) from a local .env file
load_dotenv()

In [None]:
# Read the OpenAI key from the environment; os.environ.get yields None when the variable is unset
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### Define our LLM

In [None]:
# Chat model used by the prompts and chains later in this notebook
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# Throwaway call — presumably a smoke test that the key and model are set up correctly
llm.invoke("Tell me a joke about cats")

## 1. Process PDF document

### Load PDF document

In [None]:
# Load the product disclosure statement PDF and display the loaded pages
# NOTE(review): absolute, machine-specific path — this will not resolve on another machine
loader = PyPDFLoader("/Users/aj.duncanson/projects/structured-rag-pdf/data/QM8505-1123 QBE Comprehensive Car Insurance PDS (web).pdf")
pages = loader.load()
pages

### Split document

In [None]:
# Split the pages into overlapping chunks: up to 1500 units long with 200 units of
# overlap, where length is measured with len() (i.e. characters), splitting on
# paragraph breaks, line breaks and spaces
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

### Create embeddings

In [None]:
def get_embedding_function():
    """Build the OpenAI embeddings client.

    Returns:
        An ``OpenAIEmbeddings`` instance configured with the
        ``text-embedding-ada-002`` model and the module-level
        ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )

# Instantiate the embedding client and embed a sample word as a quick check
embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query("cat")

In [None]:
from langchain.evaluation import load_evaluator

# Evaluator that compares two strings via their embeddings, using the embedding client above
evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

# Score one word pair
# NOTE(review): presumably a lower score means the embeddings are closer — confirm in the LangChain docs
evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

In [None]:
# Same reference word scored against a different city, for contrast
evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

### Create vector database

In [None]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a persisted Chroma vector store from de-duplicated chunks.

    Each chunk gets a deterministic UUIDv5 derived from its text, so chunks
    with identical ``page_content`` collapse to one entry (the first
    occurrence is kept).

    Args:
        chunks: Documents exposing a ``page_content`` string.
        embedding_function: Embedding object passed to Chroma as ``embedding``.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # A dict preserves insertion order, so ids and chunks stay paired
    # position-for-position. Passing ``list(a_set_of_ids)`` instead would
    # hand Chroma the ids in arbitrary order, storing documents under ids
    # derived from other chunks.
    chunks_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        # setdefault keeps the first chunk seen for a given id
        chunks_by_id.setdefault(chunk_id, chunk)

    return Chroma.from_documents(
        documents=list(chunks_by_id.values()),
        ids=list(chunks_by_id),
        embedding=embedding_function,
        persist_directory=vectorstore_path,
    )

In [None]:
# Notebook-level copy of the de-duplication logic in create_vectorstore:
# derive a deterministic UUIDv5 from each chunk's text
ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

# Keep only the first chunk seen for each id.
# The loop variable is deliberately not called ``id``: at notebook scope that
# would shadow the builtin ``id`` for every later cell.
unique_ids = set()
unique_chunks = []
for chunk, chunk_id in zip(chunks, ids):
    if chunk_id not in unique_ids:
        unique_ids.add(chunk_id)
        unique_chunks.append(chunk)

In [None]:
# Display the de-duplicated chunks for inspection
unique_chunks

In [None]:
# Build the Chroma vector store from all chunks (create_vectorstore drops
# duplicates itself) and persist it to the "vectorstore_chroma" directory
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_chroma")

## 2. Query for relevant data

In [None]:
# Re-open the vector store from its persist directory, with the same embedding
# function; this rebinds the `vectorstore` name
vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [None]:
# Create a similarity-search retriever over the vector store, fetch the chunks
# retrieved for a sample question, and display them
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("Who underwrites this policy")
relevant_chunks

In [None]:
# Prompt template for question answering over retrieved text.
# {context} is filled with the retrieved chunk text and {question} with the
# user's question; the wording tells the model to say so when it does not know.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [None]:
# Build the chat prompt template from the raw template string
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Merge the text of the retrieved chunks, separated by a "---" rule
context_text = "\n\n---\n\n".join(doc.page_content for doc in relevant_chunks)

# Fill in the template with the context and a sample question, then show the result
prompt = prompt_template.format(
    question="Who underwrites this policy?",
    context=context_text,
)
print(prompt)

In [None]:
# Send the filled-in prompt straight to the chat model
llm.invoke(prompt)

### Using Langchain Expression Language

In [None]:
def format_docs(docs):
    """Join the text of each document into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values separated by a blank line.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# LCEL pipeline: the input question is passed through unchanged as "question"
# and is also sent to the retriever, whose documents format_docs joins into
# "context"; the filled prompt template is then piped to the chat model.
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
# Try the chain on a sample question
rag_chain.invoke("What's the name of the insurance product?")

In [None]:
# Sample question about a specific cover limit
rag_chain.invoke("What is the cover limit in case of rental car after an accident?")

In [None]:
# Two-part sample question: is the item covered, and is there a monetary cap?
rag_chain.invoke("Is cover provided for baby capsules, and is there a dollar limit?")

In [None]:
# Sample question about a maximum cover amount
rag_chain.invoke("What is the maximum amount of cover for legal liability?")

## 4. Generate structured responses

### Still exploring the structure that's most useful for our needs, and also using very early draft prompts.

In [None]:
# Schema for one answered question: the answer itself, the context text it was
# drawn from, and the reasoning linking the two. All three fields are required strings.
# NOTE(review): the class docstring and Field descriptions presumably become part
# of the schema handed to the model for structured output — edit them with care.
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# Structured result for one cover item: four aspects (summary details, dollar
# limit, conditions, exclusions), each reported as an AnswerWithSources.
# This is the schema passed to llm.with_structured_output in the structured chain.
class ExtractedInfoWithReasoning(BaseModel):
    """Extracted information about the cover item"""
    summary_details: AnswerWithSources
    dollar_limit: AnswerWithSources
    conditions: AnswerWithSources
    exclusions: AnswerWithSources

# Flat variant of the cover-item schema: four aspects as plain required strings,
# without sources or reasoning.
# NOTE(review): every field carries the same generic description ("Answer to question");
# field-specific descriptions would presumably guide the model better.
class ExtractedInfo(BaseModel):
    """Extracted information about the cover item"""
    summary: str = Field(description="Answer to question")
    dollar_limit: str = Field(description="Answer to question")
    conditions: str = Field(description="Answer to question")
    exclusions: str = Field(description="Answer to question")

In [None]:
# Rebind rag_chain to a structured variant: same retrieval and prompt steps, but
# the model is wrapped with with_structured_output so it returns an
# ExtractedInfoWithReasoning instead of free text.
# NOTE(review): strict=True presumably asks OpenAI to enforce the schema exactly — confirm in the langchain-openai docs
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfoWithReasoning, strict=True)
        )

# Ask for all four schema aspects of one cover item
rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to personal items cover.")

In [None]:
# Same four-aspect request for a different cover item (rental car after an accident)
rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to the provision of a rental car after an accident.")

In [None]:
# Same four-aspect request for the theft scenario (rental car after the car is stolen)
rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to the provision of a rental car after my car is stolen.")

In [None]:
# A yes/no style question — presumably probing how the structured chain handles a question that does not name the four aspects
rag_chain.invoke("Does this policy provide roadside assistance?")

### Transform response into a dataframe

In [None]:
# Run the structured chain once and dump the result into a one-row frame:
# one column per top-level field, each cell holding that field's nested dict
structured_response = rag_chain.invoke("Give me a summary, dollar limit, specific conditions and specific exclusions relating to personal items cover.")
df = pd.DataFrame([structured_response.model_dump()])

# Pull each nested key out of every cell of the single row, in column order
answer_row = [cell['answer'] for cell in df.iloc[0]]
source_row = [cell['sources'] for cell in df.iloc[0]]
reasoning_row = [cell['reasoning'] for cell in df.iloc[0]]

# Re-assemble as a table with three rows ('answer', 'source', 'reasoning')
# and the original field names as columns
structured_response_df = pd.DataFrame(
    [answer_row, source_row, reasoning_row],
    index=['answer', 'source', 'reasoning'],
    columns=df.columns,
)
structured_response_df