# Problem statement

# Chunking a PDF document

In [1]:
import pdfplumber
from IPython.display import JSON

In [2]:
# Relative path of the policy PDF that the extraction cell opens with pdfplumber.
pdf_document_file_path = "../Life Insurance Policy Sample.pdf"

In [3]:
def is_useful_page(text):
    """Tell whether a page's extracted text ends with a footer line containing "Section".

    Args:
        text: Text extracted from one PDF page. May be None, empty, or
            whitespace-only for a blank page.

    Returns:
        True when the last line of the stripped text contains the
        (case-sensitive) word "Section"; False otherwise, including for
        blank pages.
    """
    if not text:
        return False
    lines = text.strip().splitlines()
    # A whitespace-only page strips down to no lines at all, so guard the index.
    return bool(lines) and "Section" in lines[-1]

def get_part(text):
    """Return the second-to-last line of the whitespace-stripped page text.

    Presumably this is the running "PART ..." heading printed above the
    section footer; confirm against the PDF layout. Raises IndexError when
    the stripped text has fewer than two lines.
    """
    page_lines = text.strip().splitlines()
    return page_lines[-2]

def get_section(text):
    """Extract the section label ("Section <title>") from a page's footer line.

    The footer is the last line of the stripped page text. Everything after
    the first "Section" is kept up to the first "Page" marker or comma,
    whichever comes first, and surrounding whitespace is removed so the label
    always has exactly one space after "Section".

    Args:
        text: Text extracted from one PDF page whose last line contains "Section".

    Returns:
        The label, e.g. "Section A - Contract".

    Raises:
        IndexError: if the text is blank or the last line has no "Section".
    """
    last_line = text.strip().splitlines()[-1]
    remainder = last_line.split("Section")[1]
    title = remainder.split("Page")[0].split(",")[0]
    # Strip the title before joining: the remainder normally starts with a
    # space, which would otherwise produce "Section  A" with a double space.
    return ("Section " + title.strip()).strip()

In [4]:
# Nested mapping: part heading -> section label -> all text lines of the pages in that section.
data = {}

with pdfplumber.open(pdf_document_file_path) as pdf:
    for page in pdf.pages:
        full_text = page.extract_text()
        # Pages whose footer does not mention a section are skipped.
        if not is_useful_page(full_text):
            continue

        part = get_part(full_text)
        section = get_section(full_text)

        # Create the part / section containers on first sight, then append this page's lines.
        data.setdefault(part, {}).setdefault(section, []).extend(full_text.splitlines())

JSON(data)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

<IPython.core.display.JSON object>

In [5]:
# Flat list of chunks that the loop below fills in.
documents = []

def is_not_same(left, right):
    """Return True when `left` does not occur inside `right`, ignoring spaces and letter case."""
    def squash(value):
        # Drop every space and lowercase so layout differences do not matter.
        return value.replace(" ", "").lower()

    return squash(left) not in squash(right)

# Split every section's lines into one chunk per article heading.
for part, part_detail in data.items():
    for section, section_detail in part_detail.items():
        article = ""
        content = ""
        for line in section_detail:
            # Skip running header/footer lines: the part heading, the section
            # footer and the "policy has been updated" banner.
            is_body_line = (
                is_not_same(part, line)
                and is_not_same(section, line)
                and is_not_same("This policy has been updated effective", line)
            )
            if not is_body_line:
                continue

            if "Article " in line and " - " in line:
                # A new article heading closes the previous article, if it has any text.
                if article and content:
                    documents.append({
                        "part": part,
                        "section": section,
                        "article": article,
                        "content": content
                    })
                # Text collected before the first heading of a section is discarded here.
                article = line
                content = ""
            else:
                content += line + " "

        # Append the last article and content
        if article and content:
            documents.append({
                "part": part,
                "section": section,
                "article": article,
                "content": content
            })


In [6]:
# Show the chunk list with IPython's JSON display object.
JSON(documents)

<IPython.core.display.JSON object>

# Persisting in Qdrant DB

In [7]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to turn chunk text and queries into vectors.
sentence_embedding_model_name = "all-MiniLM-L6-v2"
sentence_embedding_model = SentenceTransformer(sentence_embedding_model_name)

In [8]:
# Encode all chunk texts in one batched call instead of one model call per
# document; skip the call entirely when there is nothing to encode.
content_embeddings = (
    sentence_embedding_model.encode([document["content"] for document in documents])
    if documents else []
)

# One Qdrant point per chunk: sequential integer id, embedding, and the chunk
# fields (plus the content length) as payload.
points = [{
    "id": i,
    # Plain Python floats rather than a numpy array.
    "vector": embedding.tolist(),
    "payload": {
        "part": document["part"],
        "section": document["section"],
        "article": document["article"],
        "content": document["content"],
        "text_length": len(document["content"])
    }
} for i, (document, embedding) in enumerate(zip(documents, content_embeddings))]

In [9]:
from qdrant_client import QdrantClient
# Name of the collection that stores the policy chunks.
documents_collection_name = "life_insurance_policy_documents"

client = QdrantClient(host='localhost', port=6333)

# Drop any previous copy first so every run starts from an empty collection.
client.delete_collection(collection_name=documents_collection_name)

# The vector size must equal the width of the embeddings stored in `points`.
client.create_collection(
    collection_name=documents_collection_name,
    vectors_config={"size": 384, "distance": "Cosine"},
)

client.upsert(collection_name=documents_collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [31]:
# Query cache: when exactly the same query comes in again, the stored results
# are looked up and returned instead of running another semantic search.
cache_collection_name = "life_insurance_policy_documents_cache"

# Recreate the cache collection from scratch.
client.delete_collection(collection_name=cache_collection_name)
client.create_collection(
    collection_name=cache_collection_name,
    vectors_config={"size": 384, "distance": "Cosine"},
)

True

In [20]:
# what are the different searching options in qdrant db?
from hashlib import sha256
import json

def hash_query(query: str) -> str:
    """Return the hexadecimal SHA-256 digest of the query's default (UTF-8) encoding."""
    digest = sha256()
    digest.update(query.encode())
    return digest.hexdigest()

def save_to_cache(query: str, points: list) -> None:
    """Store the results of `query` in the cache collection.

    The point id is the first 16 hex digits of the query hash read as an
    integer, so saving the same query again overwrites its earlier entry.
    The payload holds the query text, its full hash and the result list.
    """
    query_hash = hash_query(query)
    cache_entry = {
        "id": int(query_hash[:16], 16),
        "vector": sentence_embedding_model.encode(query),
        "payload": {
            "query": query,
            "query_hash": query_hash,
            "points": points,
        },
    }
    client.upsert(
        collection_name="life_insurance_policy_documents_cache",
        points=[cache_entry],
    )

def query_cache_collection(query: str) -> list:
    """Return the cached result list for exactly this query, or [] when none is stored.

    The lookup is a payload filter on `query_hash`; no query vector is passed.
    """
    exact_match_filter = {
        "must": [
            {"key": "query_hash", "match": {"value": hash_query(query)}},
        ]
    }
    cached_entries = client.query_points(
        collection_name="life_insurance_policy_documents_cache",
        query_filter=exact_match_filter,
        limit=5,
        with_payload=True,
    ).points

    if len(cached_entries) > 0:
        return cached_entries[0].payload['points']
    return []

def query_collection(query: str, limit: int = 5) -> list:
    """Run a vector search for `query` against the policy chunk collection.

    Args:
        query: Natural-language question; it is embedded with the sentence model.
        limit: Maximum number of hits to return (defaults to 5, the previous
            hard-coded value).

    Returns:
        A list of plain dicts, one per hit, with the point id, version and
        score plus the payload fields part, section, article, content and
        text_length.
    """
    query_response = client.query_points(
        collection_name="life_insurance_policy_documents",
        query=sentence_embedding_model.encode(query),
        limit=limit,
        with_payload=True,
    )
    # Build the list directly instead of appending from a throwaway comprehension.
    return [
        {
            "id": point.id,
            "version": point.version,
            "score": point.score,
            "part": point.payload["part"],
            "section": point.payload["section"],
            "article": point.payload["article"],
            "content": point.payload["content"],
            "text_length": point.payload["text_length"],
        }
        for point in query_response.points
    ]

def search(query: str) -> list:
    """Answer `query` from the cache when possible, otherwise search and cache the result.

    Prints "Cache hit!" or "Cache miss!" to show which path was taken.
    """
    cached_points = query_cache_collection(query)
    if len(cached_points) != 0:
        print("Cache hit!")
        return cached_points

    # Nothing cached for this exact query: run the vector search and remember it.
    fresh_points = query_collection(query)
    save_to_cache(query, fresh_points)
    print("Cache miss!")
    return fresh_points

In [27]:
# Convert the list of result dicts returned by search() into a DataFrame

import pandas as pd

def get_df_from_points(vector_points):
    """Build a DataFrame with one row per search result.

    Args:
        vector_points: Iterable of dicts that each carry the keys id, version,
            score, part, section, article, content and text_length. Extra
            keys are ignored.

    Returns:
        A DataFrame with exactly those eight columns, in that order. The
        columns are present even when `vector_points` is empty, so later
        column access does not fail on an empty result.

    Raises:
        KeyError: if a result is missing one of the expected keys.
    """
    columns = ["id", "version", "score", "part", "section", "article", "content", "text_length"]
    rows = [{column: point[column] for column in columns} for point in vector_points]
    return pd.DataFrame(rows, columns=columns)

In [36]:
# Example question; search() answers from the exact-match cache when it can and
# otherwise runs the vector search and caches the result.
query = "who has the authority to change the policy?"
results_vector_points = search(query)
# Last expression of the cell: the frame is displayed here, not bound to a name.
get_df_from_points(results_vector_points)

Cache hit!


Unnamed: 0,id,version,score,part,section,article,content,text_length
0,1,0,0.593908,PART II - POLICY ADMINISTRATION,Section A - Contract,Article 2 - Policy Changes,Insurance under this Group Policy runs annuall...,1308
1,9,0,0.476504,PART II - POLICY ADMINISTRATION,Section A - Contract,Article 10 - Policy Interpretation,T he Principal has complete discretion to cons...,434
2,5,0,0.465559,PART II - POLICY ADMINISTRATION,Section A - Contract,Article 6 - Information to be Furnished,"The Policyholder must, upon request, give The ...",491
3,0,0,0.43676,PART II - POLICY ADMINISTRATION,Section A - Contract,Article 1 - Entire Contract,"This Group Policy, the current Certificate, th...",281
4,13,0,0.413617,PART II - POLICY ADMINISTRATION,Section B - Premiums,Article 3 - Premium Rate Changes,The Principal may change a premium rate: a. on...,1816


# Re-Ranking

In [37]:
from sentence_transformers import CrossEncoder

In [38]:
# Cross-encoder used below to re-score each (query, passage) pair.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [39]:
# Build the frame here from the search results: no earlier cell binds
# `results_df`, so on a fresh kernel this cell would otherwise raise NameError.
results_df = get_df_from_points(results_vector_points)

# Score every (query, passage) pair in one batched call so the column holds
# one scalar score per row instead of a one-element array per row.
query_passage_pairs = [[query, point["content"]] for point in results_vector_points]
results_df['re_ranking_scores'] = (
    cross_encoder.predict(query_passage_pairs) if query_passage_pairs else []
)

In [16]:
# Highest cross-encoder score first.
results_df = results_df.sort_values(by='re_ranking_scores', ascending=False)

# Keep the three best rows and only the columns handed to the generation step.
rag_columns = ['part', 'section', 'article', 'content']
top_3_RAG = results_df.head(3)[rag_columns]
top_3_RAG

Unnamed: 0,part,section,article,content
0,PART II - POLICY ADMINISTRATION,Section A - Contract,Article 2 - Policy Changes,Insurance under this Group Policy runs annuall...
4,PART II - POLICY ADMINISTRATION,Section B - Premiums,Article 3 - Premium Rate Changes,The Principal may change a premium rate: a. on...
2,PART II - POLICY ADMINISTRATION,Section A - Contract,Article 6 - Information to be Furnished,"The Policyholder must, upon request, give The ..."


# Generation

In [17]:
import openai

# Read the API key from a local file kept outside the notebook.
with open('../OPENAI_API_KEY.txt', 'r') as openai_key_file:
    # readline() keeps the line's trailing newline; strip it so the key is
    # not sent with stray whitespace.
    openai.api_key = openai_key_file.readline().strip()

In [18]:
def get_chat_model_completions(messages, model='gpt-4o-mini', temperature=0):
    """Send a chat conversation to the OpenAI chat-completions endpoint and return the reply text.

    Args:
        messages: List of {"role": ..., "content": ...} dicts forming the conversation.
        model: Chat model name; defaults to the previously hard-coded 'gpt-4o-mini'.
        temperature: Sampling temperature; defaults to the previously hard-coded 0.

    Returns:
        The message content of the first returned choice.
    """
    response_llm = openai.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=messages,
    )
    return response_llm.choices[0].message.content

In [19]:
 def get_insurance_answers(question, relevant_context):
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{question}' and you have some search results from a corpus of insurance documents in the dataframe '{relevant_context}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{relevant_context}' to answer the query '{question}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """},
              ]
    return get_chat_model_completions(messages)



In [20]:
# Generate the final answer from the question and the three top re-ranked chunks.
rag_response = get_insurance_answers(query, top_3_RAG)
print(rag_response)

The authority to change the policy typically lies with the Principal or the Policyholder, depending on the specific changes being made. According to the insurance documents:

1. **Policy Changes**: The Principal has the authority to make changes to the policy as outlined in Article 2 - Policy Changes.
2. **Premium Rate Changes**: The Principal can also change the premium rates as specified in Article 3 - Premium Rate Changes.
3. **Information Requirements**: The Policyholder is required to furnish information upon request, which may also influence policy changes as mentioned in Article 6 - Information to be Furnished.

Here is a summary of the relevant articles:

| Article                          | Authority                     | Description                                      |
|----------------------------------|-------------------------------|--------------------------------------------------|
| Article 2 - Policy Changes       | Principal                     | Authority to change