# Chunking a PDF document

In [1]:
import pdfplumber
from IPython.display import JSON

In [2]:
pdf_document_file_path = "../Life Insurance Policy Sample.pdf"

In [3]:
def is_useful_page(text):
    return "Section" in text.strip().splitlines()[-1]

def get_part(text):
    return text.strip().splitlines()[-2]

def get_section(text):
    last_line = text.strip().splitlines()[-1]
    section_with_page_number = "Section " + last_line.split("Section")[1]
    return section_with_page_number.split("Page")[0].strip().split(",")[0].strip()

In [None]:
data = {}

with pdfplumber.open(pdf_document_file_path) as pdf:
    for page in pdf.pages:
        full_text = page.extract_text()
        if is_useful_page(full_text):
            part                = get_part(full_text)
            section             = get_section(full_text)

            if part not in data:
                data[part] = {}
            data_part = data[part]

            if section not in data_part:
                data_part[section] = []
            data_part_section = data_part[section]

            data_part_section.extend(full_text.splitlines())



JSON(data)

In [24]:
documents = []

def is_not_same(left, right):
    return left.replace(" ", "").lower() not in right.replace(" ", "").lower()

for part,part_detail in data.items():
    for section, section_detail in part_detail.items():
        article = ""
        article.replace(" ", "").lower()
        content = ""
        for line in section_detail:
            if is_not_same(part, line) and is_not_same(section, line) and is_not_same("This policy has been updated effective", line):
                if "Article " in line and " - " in line:
                    if article and content:
                        documents.append({
                            "part": part,
                            "section": section,
                            "article": article,
                            "content": content
                        })
                    article = line
                    content = ""
                    continue
                else:
                    content += line + " "
            else:
                continue

        # Append the last article and content
        if article and content:
            documents.append({
                "part": part,
                "section": section,
                "article": article,
                "content": content
            })


In [25]:
JSON(documents)

<IPython.core.display.JSON object>

# Persisting in Qdrant DB

In [27]:
from sentence_transformers import SentenceTransformer

# now we need a model to generate embeddings
sentence_embedding_model_name = "all-MiniLM-L6-v2"
sentence_embedding_model = SentenceTransformer(sentence_embedding_model_name)

In [35]:
points = [{
    "id" : i,
    "vector": sentence_embedding_model.encode(document["content"]),
    "payload": {
        "part": document["part"],
        "section": document["section"],
        "article": document["article"],
        "content": document["content"],
        "text_length": len(document["content"])
    }
} for i,document in enumerate(documents)]

In [37]:
from qdrant_client import QdrantClient
client = QdrantClient(
    host = 'localhost',
    port = 6333
)
client.delete_collection(
    collection_name = "life_insurance_policy_documents"
)
client.create_collection(
    collection_name = "life_insurance_policy_documents",
    vectors_config = {
            "size": 384,
            "distance": "Cosine"
    }
)
client.upsert(
    collection_name = "life_insurance_policy_documents",
    points = points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [41]:
query_response = client.query_points(
    collection_name = "life_insurance_policy_documents",
    query = sentence_embedding_model.encode("who has the authority to change the policy?"),
    limit = 5,
    with_payload = True)

In [42]:
query_response

QueryResponse(points=[ScoredPoint(id=1, version=0, score=0.5939084, payload={'part': 'PART II - POLICY ADMINISTRATION', 'section': 'Section  A - Contract', 'article': 'Article 2 - Policy Changes', 'content': "Insurance under this Group Policy runs annually to the Policy Anniversary, unless sooner terminated. No agent, employee, or person other than an officer of The Principal has authority to change this Group Policy, and, to be effective, all such changes must be in Writing and Signed by an officer of The Principal. The Principal reserves the right to change this Group Policy as follows: a. Any or all provisions of this Group Policy may be amended or changed at any time, including retroactive changes, to the extent necessary to meet the requirements of any law or any regulation issued by any governmental agency to which this Group Policy is subject. b. Any or all provisions of this Group Policy may be amended or changed at any time when The Principal determines that such amendment is 