# Installation and Imports

In [None]:
!pip install -U -q pdfplumber tiktoken openai chromaDB sentence-transformers chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m35.9 MB/s[0m eta [36m

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-n

In [None]:
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
from sentence_transformers import CrossEncoder, util
import json
import tiktoken
import openai
import chromadb
from chromadb.config import Settings
import shutil
from google.colab import files

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
from sentence_transformers import SentenceTransformer, util
# Import the SentenceTransformer library
from sentence_transformers import SentenceTransformer

# Data Loading and Storing

In [None]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    """Tell whether a word's box lies strictly inside a table's box.

    Args:
        word: mapping with the keys 'x0', 'top', 'x1' and 'bottom'.
        table_bbox: indexable box; positions 0-3 are compared against the
            word's 'x0', 'top', 'x1' and 'bottom' respectively.

    Returns:
        True only when all four comparisons hold. The comparisons are strict,
        so a word that touches an edge of the box is not counted as inside.
    """
    # Read every coordinate up front so a missing key fails immediately.
    x0, top, x1, bottom = word['x0'], word['top'], word['x1'], word['bottom']

    if not (x0 > table_bbox[0]):
        return False
    if not (top > table_bbox[1]):
        return False
    if not (x1 < table_bbox[2]):
        return False
    return bottom < table_bbox[3]

In [None]:
# Function to extract text from a PDF file, one entry per page.
# 1. Use pdfplumber to open the PDF and walk its pages, numbering them from 1
# 2. Find the tables on the page and remember their bounding boxes
# 3. Extract each table's cells, keeping its 'top' coordinate for ordering
# 4. Keep only the words that check_bboxes() does not place inside a table
# 5. Use the cluster_objects utility to group words and tables by their 'top'
#    coordinate so they keep the same vertical order as in the original PDF
# 6. For each cluster, append the joined words (if any) and then every table
#    (serialized as JSON) to 'lines'
# 7. Append the path, page label and joined lines to 'full_text'
# 8. When all pages are processed, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    """Extract per-page text from a PDF, with tables embedded as JSON.

    Args:
        pdf_path: path handed to ``pdfplumber.open``; it is also stored
            unchanged as the first field of every returned entry.

    Returns:
        A list with one ``[pdf_path, "Page N", page_text]`` entry per page,
        where N starts at 1 and ``page_text`` is the page's lines joined by
        single spaces.
    """
    full_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_index, page in enumerate(pdf.pages, start=1):
            page_no = f"Page {page_index}"

            found_tables = page.find_tables()
            table_bboxes = [table.bbox for table in found_tables]
            tables = [{'table': table.extract(), 'top': table.bbox[1]} for table in found_tables]
            non_table_words = [
                word for word in page.extract_words()
                if not any(check_bboxes(word, bbox) for bbox in table_bboxes)
            ]

            lines = []
            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):
                # A cluster may mix words and tables when a table starts at
                # the same height as a line of text. Handle both kinds
                # explicitly so neither is silently dropped.
                words = [item['text'] for item in cluster if 'text' in item]
                if words:
                    lines.append(' '.join(words))
                lines.extend(json.dumps(item['table']) for item in cluster if 'table' in item)

            full_text.append([pdf_path, page_no, " ".join(lines)])

    return full_text

## Policy Data Loading, Cleaning and Chunking

In [None]:
path="inv-pol.pdf"

In [None]:
# Open the PDF file
with pdfplumber.open(path) as pdf:

    # Pick one page to examine: index 1, i.e. the second page of the PDF
    single_page = pdf.pages[1]

    # Extract the text of the selected page
    text = single_page.extract_text()

    # Extract the tables of the selected page (not used further in this cell)
    tables = single_page.extract_tables()

    # Print the extracted text
    print(text)

Clause 1. Objective
This document outlines the internal expectations for invoice accuracy, vendor verification,
and supporting documentation standards across all units engaged in procurement and
payment activities. The intent is to strengthen safeguards against payment fraud, ensure
compliance with financial reporting obligations, and enhance audit preparedness throughout
the procure-to-pay cycle.
Clause 2. Applicability
The policies herein apply to:
 All invoices received for payment, reimbursement, or accrual, irrespective of size or
origin.
 Vendors providing software, hardware, services, or general office supplies.
 Any department or individual involved in the submission, review, approval, or
reconciliation of invoices.
These provisions are binding across all subsidiaries, project offices, and regional operations.


In [None]:
path="inv-pol.pdf"
data=[]
#print(f"...Processing {path}")

# Call the function to extract the text from the PDF
extracted_text = extract_text_from_pdf(path)

# Convert the extracted list to a dataframe; the 'Invoice' column holds the source file path
extracted_text_df = pd.DataFrame(extracted_text, columns=['Invoice','Page No.', 'Page_Text'])

# Append this document's dataframe to the list
data.append(extracted_text_df)

# # Print a message to indicate progress
# print(f"Finished processing {path}")

# # Print a message to indicate all PDFs have been processed
# print("PDF have been processed.")

In [None]:
# Stack the per-document dataframes collected in `data` into one frame with a fresh index
policy_pdfs_data = pd.concat(data, ignore_index=True)
policy_pdfs_data

Unnamed: 0,Invoice,Page No.,Page_Text
0,inv-pol.pdf,Page 1,Internal Policy Document Title: Invoice Integr...
1,inv-pol.pdf,Page 2,Clause 1. Objective This document outlines the...
2,inv-pol.pdf,Page 3,Clause 3. Document Quality & Image Legibility ...
3,inv-pol.pdf,Page 4,Clause 5. Taxation & Regulatory Adherence All ...
4,inv-pol.pdf,Page 5,Clause 7. Handling of Irregular Submissions In...
5,inv-pol.pdf,Page 6,Clause 9. Review & Governance This policy is r...


In [None]:
policy_pdfs_data["Page_Text"][0]

'Internal Policy Document Title: Invoice Integrity, Vendor Validation & Financial Controls Issued By: Office of Financial Controls & Audit Oversight Effective Date: July 1, 2025 Applies To: Finance Operations, Accounts Payable, Procurement, and Vendor Management Version: 1.0'

In [None]:
len(policy_pdfs_data["Page_Text"][0])

274

In [None]:
policy_pdfs_data["Page_Text"][1]

'Clause 1. Objective This document outlines the internal expectations for invoice accuracy, vendor verification, and supporting documentation standards across all units engaged in procurement and payment activities. The intent is to strengthen safeguards against payment fraud, ensure compliance with financial reporting obligations, and enhance audit preparedness throughout the procure-to-pay cycle. Clause 2. Applicability The policies herein apply to: \uf0b7 All invoices received for payment, reimbursement, or accrual, irrespective of size or origin. \uf0b7 Vendors providing software, hardware, services, or general office supplies. \uf0b7 Any department or individual involved in the submission, review, approval, or reconciliation of invoices. These provisions are binding across all subsidiaries, project offices, and regional operations.'

In [None]:
policy_pdfs_data["Page_Text"][3]

'Clause 5. Taxation & Regulatory Adherence All invoices must present accurate and complete tax information, aligned with both jurisdictional regulations and internal compliance standards. \uf0b7 The tax amount should be clearly itemized or determinable from the percentage rate provided. \uf0b7 Invoices must include a valid tax identification number where required. The format of this identifier will vary depending on national or regional tax authorities and should correspond to the relevant statutory requirements. Invoices lacking adequate tax details may be referred to the Tax Governance Group for further review. Regional offices are advised to engage with local tax specialists when handling transactions with cross-border tax implications. Clause 6. Purchase Detail Integrity Clear and consistent itemization on invoices facilitates accurate reconciliation and supports the identification of atypical or potentially irregular activity. \uf0b7 All invoices should include a clearly labeled q

In [None]:
def normalize_bullets(text):
    """Rewrite the PDF bullet glyph (U+F0B7) as a "- " list item on a new line.

    Spaces and tabs directly around the glyph are consumed as well, so each
    item reads "\\n- item" with no doubled space after the dash and no
    trailing space left before the line break.

    Args:
        text: page text that may contain U+F0B7 bullet glyphs.

    Returns:
        The text with every bullet glyph (and its surrounding spaces/tabs)
        replaced by "\\n- ". Text without the glyph is returned unchanged.
    """
    # Local import keeps the function self-contained: at module level `re`
    # is only imported in a later cell of this notebook.
    import re

    return re.sub(r'[ \t]*\uf0b7[ \t]*', '\n- ', text)

In [None]:
# Run normalize_bullets over every page and overwrite the Page_Text column with the result
policy_pdfs_data["Page_Text"]=policy_pdfs_data["Page_Text"].apply(normalize_bullets)

In [None]:
policy_pdfs_data["Page_Text"][1]

'Clause 1. Objective This document outlines the internal expectations for invoice accuracy, vendor verification, and supporting documentation standards across all units engaged in procurement and payment activities. The intent is to strengthen safeguards against payment fraud, ensure compliance with financial reporting obligations, and enhance audit preparedness throughout the procure-to-pay cycle. Clause 2. Applicability The policies herein apply to: \n-  All invoices received for payment, reimbursement, or accrual, irrespective of size or origin. \n-  Vendors providing software, hardware, services, or general office supplies. \n-  Any department or individual involved in the submission, review, approval, or reconciliation of invoices. These provisions are binding across all subsidiaries, project offices, and regional operations.'

In [None]:
import re

def insert_clause_breaks(text):
    """Put a three-newline delimiter before every "Clause N." header.

    A header at the very start of the text is left alone. Any whitespace
    already sitting in front of a header (including a delimiter inserted by
    an earlier call) is replaced by exactly one delimiter, so applying the
    function repeatedly yields the same text and never produces the stacked
    delimiters that would later split into empty chunks.

    Args:
        text: page text containing zero or more "Clause N." headers.

    Returns:
        The text with "\\n\\n\\n" immediately before each non-leading header.
    """
    # `(?<!^)` skips a header at position 0; `\s*` swallows the whitespace
    # preceding the header so the delimiter is normalized, not accumulated.
    return re.sub(r'(?<!^)\s*(Clause \d+\.)', r'\n\n\n\1', text)


In [None]:
# Run insert_clause_breaks over every page and overwrite the Page_Text column with the result
policy_pdfs_data["Page_Text"]=policy_pdfs_data["Page_Text"].apply(insert_clause_breaks)

In [None]:
policy_pdfs_data["Page_Text"][1]

'Clause 1. Objective This document outlines the internal expectations for invoice accuracy, vendor verification, and supporting documentation standards across all units engaged in procurement and payment activities. The intent is to strengthen safeguards against payment fraud, ensure compliance with financial reporting obligations, and enhance audit preparedness throughout the procure-to-pay cycle. \n\n\nClause 2. Applicability The policies herein apply to: \n-  All invoices received for payment, reimbursement, or accrual, irrespective of size or origin. \n-  Vendors providing software, hardware, services, or general office supplies. \n-  Any department or individual involved in the submission, review, approval, or reconciliation of invoices. These provisions are binding across all subsidiaries, project offices, and regional operations.'

In [None]:
print(policy_pdfs_data["Page_Text"][1])

Clause 1. Objective This document outlines the internal expectations for invoice accuracy, vendor verification, and supporting documentation standards across all units engaged in procurement and payment activities. The intent is to strengthen safeguards against payment fraud, ensure compliance with financial reporting obligations, and enhance audit preparedness throughout the procure-to-pay cycle. 


Clause 2. Applicability The policies herein apply to: 
-  All invoices received for payment, reimbursement, or accrual, irrespective of size or origin. 
-  Vendors providing software, hardware, services, or general office supplies. 
-  Any department or individual involved in the submission, review, approval, or reconciliation of invoices. These provisions are binding across all subsidiaries, project offices, and regional operations.


In [None]:
print(policy_pdfs_data["Page_Text"][2])

Clause 3. Document Quality & Image Legibility Invoices must be submitted in a format that preserves clarity, completeness, and visibility of all essential elements. Clear documentation is foundational to payment approval, recordkeeping, and audit readiness. 
-  Invoice files—whether scanned or digitally generated—should display all content without distortion, cropping, or quality loss that could interfere with interpretation. 
-  Files where text is blurred, fields are obscured, or key sections appear cut off may be returned for clarification or reissuance. 
-  While minor imperfections are understood, repeated submissions with unreadable or truncated documents may trigger procedural review or intervention by Records Governance. Where document fidelity is in doubt, Finance Operations may consult Internal Audit or escalate to relevant oversight teams. 


Clause 4. Vendor Identity & Beneficiary Validation Accurate vendor identification is critical to ensure payments are made exclusively 

The documents were chunked by inserting a unique delimiter ("\n\n\n") before each clause header to separate the text into meaningful sections. Chunking was then performed page by page with a simple split on this delimiter, so each chunk corresponds to the part of a clause that appears on that page; a clause that continues across a page break is split into separate chunks. Metadata including page number, chunk sequence, and clause number was generated for each chunk to aid retrieval; when a chunk contains no clause header (for example, the title page), the clause number falls back to the chunk's position on the page. While more advanced chunking methods exist, this approach was chosen for its simplicity and alignment with the document’s natural structure.



In [None]:
def extract_clause_number(text):
    """Return the number of the first "Clause N." header in *text*.

    Returns:
        The clause number as an int, or None when no header is found.
    """
    match = re.search(r'Clause (\d+)\.', text)
    return int(match.group(1)) if match else None


def simple_chunk_page(row):
    """Split one page into clause chunks and attach metadata to each.

    Args:
        row: mapping (e.g. a DataFrame row) with the keys 'Page No.' and
            'Page_Text'; the text is expected to carry the "\\n\\n\\n"
            clause delimiter.

    Returns:
        A list of dicts, one per non-empty chunk, with the keys 'Page No.',
        'Page_Text' (the full page text), 'Chunked_Text' (the stripped
        chunk) and 'Metadata' ('Page_No', 'Chunk_No', 'Clause_No').
        'Chunk_No' counts the kept chunks from 1. 'Clause_No' is the number
        parsed from the chunk's header, or the chunk's position on the page
        when the chunk has no header.
    """
    page_no = row['Page No.']
    page_text = row['Page_Text']

    # Split on the delimiter; pieces that are empty once stripped (leading or
    # doubled delimiters, blank pages) carry no content and are dropped.
    stripped_pieces = (piece.strip() for piece in page_text.split('\n\n\n'))
    clauses = [piece for piece in stripped_pieces if piece]

    rows = []
    for i, clause in enumerate(clauses, start=1):
        clause_no = extract_clause_number(clause)
        metadata = {
            'Page_No': page_no,
            'Chunk_No': i,
            # Compare against None explicitly: a plain `or` would also
            # replace a legitimately parsed clause number 0.
            'Clause_No': i if clause_no is None else clause_no,
        }
        rows.append({'Page No.': page_no, 'Page_Text': page_text, 'Chunked_Text': clause, 'Metadata': metadata})
    return rows






In [None]:
# Chunk every page and collect the resulting rows in one flat list
all_chunks = []
for _, row in policy_pdfs_data.iterrows():
    all_chunks.extend(simple_chunk_page(row))

# One dataframe row per chunk
chunked_policy = pd.DataFrame(all_chunks)

In [None]:
chunked_policy

Unnamed: 0,Page No.,Page_Text,Chunked_Text,Metadata
0,Page 1,Internal Policy Document Title: Invoice Integr...,Internal Policy Document Title: Invoice Integr...,"{'Page_No': 'Page 1', 'Chunk_No': 1, 'Clause_N..."
1,Page 2,Clause 1. Objective This document outlines the...,Clause 1. Objective This document outlines the...,"{'Page_No': 'Page 2', 'Chunk_No': 1, 'Clause_N..."
2,Page 2,Clause 1. Objective This document outlines the...,Clause 2. Applicability The policies herein ap...,"{'Page_No': 'Page 2', 'Chunk_No': 2, 'Clause_N..."
3,Page 3,Clause 3. Document Quality & Image Legibility ...,Clause 3. Document Quality & Image Legibility ...,"{'Page_No': 'Page 3', 'Chunk_No': 1, 'Clause_N..."
4,Page 3,Clause 3. Document Quality & Image Legibility ...,Clause 4. Vendor Identity & Beneficiary Valida...,"{'Page_No': 'Page 3', 'Chunk_No': 2, 'Clause_N..."
5,Page 4,Clause 5. Taxation & Regulatory Adherence All ...,Clause 5. Taxation & Regulatory Adherence All ...,"{'Page_No': 'Page 4', 'Chunk_No': 1, 'Clause_N..."
6,Page 4,Clause 5. Taxation & Regulatory Adherence All ...,Clause 6. Purchase Detail Integrity Clear and ...,"{'Page_No': 'Page 4', 'Chunk_No': 2, 'Clause_N..."
7,Page 5,Clause 7. Handling of Irregular Submissions In...,Clause 7. Handling of Irregular Submissions In...,"{'Page_No': 'Page 5', 'Chunk_No': 1, 'Clause_N..."
8,Page 5,Clause 7. Handling of Irregular Submissions In...,Clause 8. Recommended Practices To reduce paym...,"{'Page_No': 'Page 5', 'Chunk_No': 2, 'Clause_N..."
9,Page 6,Clause 9. Review & Governance This policy is r...,Clause 9. Review & Governance This policy is r...,"{'Page_No': 'Page 6', 'Chunk_No': 1, 'Clause_N..."


In [None]:
chunked_policy['Metadata'][0]

{'Page_No': 'Page 1', 'Chunk_No': 1, 'Clause_No': 1}

In [None]:
chunked_policy['Metadata'][1]

{'Page_No': 'Page 2', 'Chunk_No': 1, 'Clause_No': 1}

In [None]:
chunked_policy['Metadata'][4]

{'Page_No': 'Page 3', 'Chunk_No': 2, 'Clause_No': 4}

In [None]:
chunked_policy['Metadata']

Unnamed: 0,Metadata
0,"{'Page_No': 'Page 1', 'Chunk_No': 1, 'Clause_N..."
1,"{'Page_No': 'Page 2', 'Chunk_No': 1, 'Clause_N..."
2,"{'Page_No': 'Page 2', 'Chunk_No': 2, 'Clause_N..."
3,"{'Page_No': 'Page 3', 'Chunk_No': 1, 'Clause_N..."
4,"{'Page_No': 'Page 3', 'Chunk_No': 2, 'Clause_N..."
5,"{'Page_No': 'Page 4', 'Chunk_No': 1, 'Clause_N..."
6,"{'Page_No': 'Page 4', 'Chunk_No': 2, 'Clause_N..."
7,"{'Page_No': 'Page 5', 'Chunk_No': 1, 'Clause_N..."
8,"{'Page_No': 'Page 5', 'Chunk_No': 2, 'Clause_N..."
9,"{'Page_No': 'Page 6', 'Chunk_No': 1, 'Clause_N..."


## Policy Data Embedding

In [None]:


# Embedding model used to vectorize the policy chunks
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): the recorded cell output echoes this line as the source of a
# warning (text truncated) — presumably a LangChain deprecation notice for
# this import path; confirm.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Directory handed to Chroma as its on-disk location
persist_directory = "./policy_store"

# Specify the collection name you want to create/use
collection_name = "policy_collection"

# Initialize Chroma with a specific collection name
# NOTE(review): the recorded cell output also echoes this call as the source
# of a warning (text truncated); confirm whether it is a deprecation notice.
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    collection_name=collection_name
)




  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  vectorstore = Chroma(


In [None]:
# Pull three parallel lists out of the chunk dataframe (same order, same length):
# the chunk texts, their metadata dicts, and the dataframe index values
chunked_texts = chunked_policy['Chunked_Text'].tolist()
metadata = chunked_policy['Metadata'].tolist()
ids = chunked_policy.index.tolist()

In [None]:
# Convert the index values to strings — presumably because the vector store expects string ids; confirm
ids = [str(i) for i in ids]

In [None]:
metadata

[{'Page_No': 'Page 1', 'Chunk_No': 1, 'Clause_No': 1},
 {'Page_No': 'Page 2', 'Chunk_No': 1, 'Clause_No': 1},
 {'Page_No': 'Page 2', 'Chunk_No': 2, 'Clause_No': 2},
 {'Page_No': 'Page 3', 'Chunk_No': 1, 'Clause_No': 3},
 {'Page_No': 'Page 3', 'Chunk_No': 2, 'Clause_No': 4},
 {'Page_No': 'Page 4', 'Chunk_No': 1, 'Clause_No': 5},
 {'Page_No': 'Page 4', 'Chunk_No': 2, 'Clause_No': 6},
 {'Page_No': 'Page 5', 'Chunk_No': 1, 'Clause_No': 7},
 {'Page_No': 'Page 5', 'Chunk_No': 2, 'Clause_No': 8},
 {'Page_No': 'Page 6', 'Chunk_No': 1, 'Clause_No': 9}]

In [None]:
# Add texts with metadata and ids to this specific collection
vectorstore.add_texts(
    texts=chunked_texts,
    metadatas=metadata,
    ids=ids
)

# Save to disk for persistence
# NOTE(review): the recorded cell output echoes this call as the source of a
# warning (text truncated) — presumably persist() is deprecated in the
# installed version; confirm before relying on it.
vectorstore.persist()

print(f"Data added to Chroma collection: {collection_name}")

Data added to Chroma collection: policy_collection


  vectorstore.persist()


In [None]:
# Zip the contents of the "policy_store" directory into "policy_store.zip"
# NOTE(review): the directory name is repeated here as a literal rather than
# taken from `persist_directory`; keep the two in sync.
shutil.make_archive("policy_store", 'zip', "policy_store")

# Download the zip file
files.download("policy_store.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!ls -l ./policy_store

total 292
drwxr-xr-x 2 root root   4096 Jul 27 08:25 8f819a0e-86ad-4122-92b7-8bf3a2c15fdd
-rw-r--r-- 1 root root 294912 Jul 27 08:25 chroma.sqlite3
