In [1]:
# imports
import numpy as np
import pandas as pd
import os
from ast import literal_eval

# load data
datafile_path = "../../data/raw"

# os.listdir() returns entries in arbitrary order; sort them so the resulting
# list of paths is stable across runs and machines.
data = [datafile_path + "/" + name for name in sorted(os.listdir(datafile_path))]



In [10]:
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from .env file
load_dotenv()

# Keep the OpenAI client under its own name: get_embedding resolves the client
# at call time, so relying on the generic global `client` breaks as soon as
# another cell rebinds `client` to a different SDK object.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
client = openai_client  # backward-compatible alias for code that still uses `client`


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for `text` from the OpenAI embeddings endpoint.

    Args:
        text: Input string; newlines are replaced by spaces before the request.
        model: Name of the embedding model to use.

    Returns:
        The `embedding` field of the first item in the response's `data` list.
    """
    text = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# matrix = np.vstack(df.embedding.values)
# matrix.shape

In [3]:
import openai
import time
import sys
sys.path.append('../')
from ocr.mistral import mistral_ocr
from mistralai import Mistral


# PDF that is OCR'd and later split into individual documents.
path = "/Users/amarkanaka/repos/pageStreamSegmentation/2024.02.14 RELEASE 24-013 RELEASE_2nd Interim Production_pages 401-500.pdf"

# API key is read from the process environment (None when the variable is unset).
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

# NOTE(review): this rebinds the module-level name `client`; any function that
# looks up `client` at call time will see this Mistral client from here on.
client = Mistral(api_key=MISTRAL_API_KEY)

# Run OCR over the PDF. Presumably returns one text string per page — TODO confirm
# against ocr.mistral.mistral_ocr.
pages = mistral_ocr(path, client)

In [5]:
import tiktoken

# Tokenizer used for all token counting in this notebook.
enc = tiktoken.encoding_for_model("gpt-4-turbo")


def num_tokens_by_tiktoken(text: str) -> int:
    """Return how many tokens `enc` produces when encoding `text`."""
    token_ids = enc.encode(text)
    return len(token_ids)

# Split every page whose token count exceeds the limit into consecutive chunks.
#
# The split is done on token ids (not characters), the oversized original is
# replaced by its chunks (not kept alongside them), and a new list is built
# instead of inserting into `pages` while iterating over it. Re-running this
# cell leaves `pages` unchanged once every entry is within the limit.
max_tokens_per_page = 8192  # pages above this many tokens are split
chunk_tokens = 8000         # size of each chunk, leaving headroom below the limit

chunked_pages = []
for page_text in pages:
    token_ids = enc.encode(page_text)
    if len(token_ids) <= max_tokens_per_page:
        chunked_pages.append(page_text)
        continue
    for chunk_start in range(0, len(token_ids), chunk_tokens):
        chunk_ids = token_ids[chunk_start:chunk_start + chunk_tokens]
        chunked_pages.append(enc.decode(chunk_ids))

# NOTE(review): when a split happens, len(pages) no longer equals the number of
# PDF pages, so anything that maps entries of `pages` 1:1 onto PDF pages must
# account for that.
pages[:] = chunked_pages  # update in place so existing references see the result

In [8]:
# Largest per-page token count across all pages (0 when there are no pages).
max_tokens = max((num_tokens_by_tiktoken(page_text) for page_text in pages), default=0)
print(max_tokens)

1868


In [11]:
# One embedding per entry of `pages`, in the same order.
embeddings = list(map(get_embedding, pages))

In [14]:

# ic_similarity = [0.50, 0.55, 0.60, 0.65, 0.70, 0.75]
# for sim in ic_similarity:

# Similarity between each page and the page that follows it
# (similarities[i] compares page i with page i+1).
# NOTE(review): a plain dot product equals cosine similarity only for
# unit-length vectors — confirm the embeddings are normalized.
similarities = [np.dot(embeddings[i], embeddings[i+1]) for i in range(len(embeddings)-1)]

# Mark segment boundaries where similarity drops below threshold:
# boundaries[i] == 1 means page i is the last page of its document.
threshold = 0.51  # This is just an example value; tune for your data
boundaries = [1 if sim < threshold else 0 for sim in similarities]

# The final page has no successor to compare against, but it always ends the
# last document. Flagging it with 1 guarantees the trailing pages are closed
# into a document instead of inheriting the previous page's flag, and avoids
# indexing an empty list when there is only one page.
if len(embeddings) > 0:
    boundaries.append(1)


from PyPDF2 import PdfReader, PdfWriter

# Path to your 100-page PDF
input_pdf_path = path

# One flag per PDF page; a flag of 1 means "this page is the last page of a document".
split_flags = boundaries

# Check length matches the PDF
reader = PdfReader(input_pdf_path)
num_pages = len(reader.pages)
assert len(split_flags) == num_pages, "Length of split_flags must equal number of PDF pages!"

out_dir = "/Users/amarkanaka/repos/pageStreamSegmentation/data/itemized"
os.makedirs(out_dir, exist_ok=True)


def _write_split(first_page, last_page, doc_index):
    """Write pages first_page..last_page (0-based, inclusive) to split_<doc_index>.pdf in out_dir."""
    writer = PdfWriter()
    for j in range(first_page, last_page + 1):
        writer.add_page(reader.pages[j])
    output_path = os.path.join(out_dir, f"split_{doc_index:03d}.pdf")
    with open(output_path, "wb") as out_f:
        writer.write(out_f)


start = 0    # index of the first page of the document currently being collected
doc_num = 1  # number used in the next output file name

for i, flag in enumerate(split_flags):
    # If flag is 1, end the current document at this page (inclusive)
    if flag == 1:
        _write_split(start, i, doc_num)
        doc_num += 1
        start = i + 1  # Start the next doc at the next page

# If the last flag was not 1, the pages after the final boundary have not been
# written yet; emit them as one last document so no page is silently dropped.
if start < num_pages:
    _write_split(start, num_pages - 1, doc_num)
    doc_num += 1

print(f"Done! Wrote {doc_num-1} split PDFs to {out_dir}/")


Done! Wrote 47 split PDFs to /Users/amarkanaka/repos/pageStreamSegmentation/data/itemized/


Fine-Tune the Inter-Cluster Similarity Threshold on the Validation Set and Benchmark Results on the Test Set

In [8]:
# Read val.txt line by line and group documents into clusters for
# hyperparameter tuning. A cluster is closed at the first document boundary
# after it has reached `min_pages_per_cluster` lines, so a document is never
# split across two clusters.
# Assumes every data line has the document path as its second
# whitespace-separated field and that header lines have "name" there — TODO confirm.
ic_similarity = [0.50, 0.55, 0.60, 0.65, 0.70, 0.75]
val_folders = set()
current_doc = ""
append_path = "/Users/amarkanaka/repos/pageStreamSegmentation/"
min_pages_per_cluster = 150

with open(append_path + "data/val.txt") as f:
    page_count = 0
    val_folder = set()
    is_within_doc = False
    curr_doc = ""
    for line in f:
        fields = line.split()
        if len(fields) < 2:
            # Blank or malformed line: nothing to group.
            continue
        next_doc = fields[1]
        if next_doc == "name":
            # Header row.
            continue

        is_within_doc = curr_doc == next_doc
        curr_doc = next_doc

        if page_count >= min_pages_per_cluster and not is_within_doc:
            # Cluster is full and a new document starts: close it and open a
            # fresh one. A new set is used rather than clear() so the closed
            # cluster's contents cannot be affected.
            val_folders.add(frozenset(val_folder))
            val_folder = set()
            page_count = 0

        # The current line always belongs to the open cluster, including the
        # line that triggered the rollover above.
        val_folder.add(curr_doc)
        page_count += 1

# Flush the last, possibly smaller, cluster so its documents are not dropped.
if val_folder:
    val_folders.add(frozenset(val_folder))

In [9]:
# Display the set of validation clusters (each a frozenset of document paths).
val_folders

{frozenset({'data/raw/ghwb_0786.pdf',
            'data/raw/neal_0278.pdf',
            'data/raw/neal_0712.pdf',
            'data/raw/pcast0002.pdf',
            'data/raw/pcast0007.pdf',
            'data/raw/pcast0052.pdf'}),
 frozenset({'data/raw/ghwb_0196.pdf',
            'data/raw/ghwb_0522.pdf',
            'data/raw/ghwb_0564.pdf',
            'data/raw/ghwb_0608.pdf',
            'data/raw/ghwb_0611.pdf',
            'data/raw/ghwb_0732.pdf',
            'data/raw/ghwb_0937.pdf',
            'data/raw/ghwb_1157.pdf',
            'data/raw/neal_0289.pdf',
            'data/raw/neal_0312.pdf',
            'data/raw/neal_0344.pdf',
            'data/raw/pcast0003.pdf',
            'data/raw/pcast0058.pdf'}),
 frozenset({'data/raw/ghwb_0165.pdf',
            'data/raw/ghwb_0189.pdf',
            'data/raw/ghwb_0248.pdf',
            'data/raw/ghwb_0376.pdf',
            'data/raw/ghwb_0383.pdf',
            'data/raw/ghwb_0408.pdf',
            'data/raw/ghwb_0425.pdf',
        