In [1]:
import poppler

In [2]:
# Location of the rulebook PDF; `doc` is the handle every later cell reads from.
pdf_path = "/data/Basic Rules (2014).pdf"
doc = poppler.load_from_file(pdf_path)

In [None]:
# Show the document-level info returned by poppler as this cell's output.
pdf_infos = doc.infos()
pdf_infos

In [None]:
# List the fonts poppler reports for each page: a header line per page,
# followed by one "- <name>" line per font.
font_iterator = doc.create_font_iterator()
for page, fonts in font_iterator:
    report_lines = [f"Fonts for page {page}"]
    report_lines.extend(f"- {font.name}" for font in fonts)
    print("\n".join(report_lines))

In [None]:
import pandas as pd

# One row per text box: (page index, position within the page, text, font name,
# font size). Font details are only available because the text list is
# requested with `text_list_include_font`.
data = [
    (p, i, b.text, b.get_font_name(), b.get_font_size())
    for p in range(doc.pages)
    # Single-element loop binds the page once, so each page is created one time
    # instead of twice (once for the option enum and once for the text list).
    for page in [doc.create_page(p)]
    for i, b in enumerate(page.text_list(page.TextListOption.text_list_include_font))
]
df = pd.DataFrame(data, columns=["page", "word_in_page", "text", "font", "size"])
df

In [None]:
# Split the PDF into per-section text blobs keyed by their headings.
#
# Each text box is classified purely by its font size (as reported by
# `get_font_size()`); all comparisons are strict, so a size exactly equal to a
# threshold falls into neither neighbouring band.
SUBSECTION_MIN_SIZE = 18  # (18, 20): subsection heading
SECTION_MIN_SIZE = 20     # (20, 30): section heading; below 20: body text
CHAPTER_MIN_SIZE = 30     # (30, 35): chapter heading
CHAPTER_MAX_SIZE = 35

# Words of the headings currently in effect, and the words collected for the
# entry that is being built.
chapter_heading = []
section_heading = []
subsection_heading = []
current_doc = []

last_font_size = 0

# Maps "<chapter>[ - <section>[ - <subsection>]]" to the text of that entry.
docs = {}


def stringify(thing):
    """Join an iterable of strings into one space-separated string."""
    return " ".join(thing)


def section_entry(chapter_words, section_words, body_words):
    """Return the (key, value) pair stored in `docs` when a section ends.

    The key omits the " - <section>" part when there is no section heading.
    """
    chapter_title = stringify(chapter_words)
    section_title = stringify(section_words)
    key = f"{chapter_title}{(' - ' + section_title) if section_words else ''}"
    value = f"{chapter_title}\n\n{section_title}\n{stringify(body_words)}"
    return key, value


def subsection_entry(chapter_words, section_words, subsection_words, body_words):
    """Return the (key, value) pair stored in `docs` when a subsection ends."""
    chapter_title = stringify(chapter_words)
    section_title = stringify(section_words)
    subsection_title = stringify(subsection_words)
    key = f"{chapter_title} - {section_title} - {subsection_title}"
    value = f"{chapter_title}\n\n{section_title}\n{subsection_title}\n{stringify(body_words)}"
    return key, value


for p in range(doc.pages):
    page = doc.create_page(p)
    for b in page.text_list(page.TextListOption.text_list_include_font):
        font_size = b.get_font_size()
        size_increased = font_size > last_font_size

        # A drop in font size is marked with a visual separator in the text.
        if font_size < last_font_size:
            current_doc.append("\n--------\n")

        # A jump up into a heading band closes the entry collected so far.
        if size_increased:
            if font_size > SECTION_MIN_SIZE:
                # end of section
                key, value = section_entry(chapter_heading, section_heading, current_doc)
                docs[key] = value
                current_doc = []
                section_heading = []
                # A new section also ends the current subsection; without this
                # the old subsection title would label the next section's intro.
                subsection_heading = []
                if font_size > CHAPTER_MIN_SIZE:
                    # end of chapter
                    chapter_heading = []
            elif font_size > SUBSECTION_MIN_SIZE:
                # end of sub-subsection
                key, value = subsection_entry(
                    chapter_heading, section_heading, subsection_heading, current_doc
                )
                docs[key] = value
                current_doc = []
                subsection_heading = []

        # Everything below the section band is kept as running text. This runs
        # AFTER the flush above so that the first word of a subsection heading
        # (which is also < 20) lands in the new entry, not the one just closed.
        if font_size < SECTION_MIN_SIZE:
            if size_increased:
                current_doc.append("\n")
            current_doc.append(b.text)

        if CHAPTER_MIN_SIZE < font_size < CHAPTER_MAX_SIZE:
            chapter_heading.append(b.text)

        if SECTION_MIN_SIZE < font_size < CHAPTER_MIN_SIZE:
            section_heading.append(b.text)
        elif SUBSECTION_MIN_SIZE < font_size < SECTION_MIN_SIZE:
            subsection_heading.append(b.text)

        last_font_size = font_size

# The loop only stores an entry when the NEXT heading shows up, so whatever
# follows the last heading would otherwise be dropped. Treat the end of the
# document as the end of the current section.
if current_doc:
    key, value = section_entry(chapter_heading, section_heading, current_doc)
    docs[key] = value

docs

In [None]:
from utils.elastic import elastic_request
import requests
# Dry-run the `clean_and_embed` ingest pipeline against a single chapter entry
# and show the simulated result as this cell's output.
simulate_payload = {
    "docs": [
        {"_source": {"content": docs["Chapter 1: Step- By-Step Characters"]}},
    ]
}
test = elastic_request(
    method=requests.post,
    url="_ingest/pipeline/clean_and_embed/_simulate",
    data=simulate_payload,
)
test.json()

In [None]:
def ingest(doc, title, index):
    """Store one piece of text in `index`, using its title as the document id.

    The title is placed in the URL path, so "?" is dropped and "/" becomes "-"
    first. The request goes through `elastic_request` with `requests.put` and
    the `clean_and_embed` pipeline query parameter.

    Args:
        doc: Text stored under the "content" field.
        title: Heading used (after cleaning) as the document id.
        index: Name of the target index.

    Returns:
        Whatever `elastic_request` returns (callers treat it as a response
        object -- presumably a `requests.Response`; confirm in utils.elastic).
    """
    # Single-pass equivalent of removing "?" and then turning "/" into "-".
    url_safe_title = title.translate(str.maketrans({"?": None, "/": "-"}))
    target_url = f"{index}/_doc/{url_safe_title}?pipeline=clean_and_embed"
    payload = {"content": doc}
    return elastic_request(method=requests.put, url=target_url, data=payload)


In [None]:
# Push every titled entry into the "basic-rules-2014" index, stopping at the
# first failure. Entries with an empty title are skipped.
for title, item in docs.items():
    if not title:
        continue
    # Reset per item so the error handler never reports a response left over
    # from a previous iteration (or hits a NameError on the very first one).
    rslt = None
    try:
        rslt = ingest(item, title, "basic-rules-2014")
        rslt.raise_for_status()
    except Exception as e:
        print("Error: ", e)
        # Only show a response body if the request actually produced one.
        if rslt is not None:
            try:
                print(rslt.json())
            except ValueError:
                # Body was not JSON; fall back to the raw text so decoding
                # problems do not hide the original error.
                # NOTE(review): assumes rslt is a requests.Response -- confirm.
                print(rslt.text)
        raise