In [14]:
import sys, os

sys.path.append("/home/funny/Documents/Projects/230701-Parsing-Tool/augustin-plugin/")
from dotenv import load_dotenv
import fitz


In [15]:
# Load environment variables (SAVE_PATH is expected in the environment or a .env file)
load_dotenv()
global_save_path = os.environ.get("SAVE_PATH")
if global_save_path is None:
    # Fail early with a clear message instead of an opaque TypeError when the
    # path is assembled below.
    raise RuntimeError(
        "SAVE_PATH is not set; define it in the environment or in a .env file."
    )

# os.path.join supplies the separator itself, so SAVE_PATH works with or
# without a trailing slash.
path_to_file = os.path.join(
    global_save_path,
    "570_augustin_fertig_kontrolle",
    "570_augustin_fertig_kontrolle.pdf",
)
# The document is kept open on purpose: later cells read pages from `src`.
src = fitz.open(path_to_file)

In [16]:
def extract_headlines(
    page, starting_characters=None, headlines=None, searching_for_end=False
):
    """Extract headlines, starting characters and ending symbols from a PDF page.

    Every text span of the page is classified on its whitespace-stripped text;
    spans that contain only whitespace are ignored.

    * Spans whose colour is neither black (#000000) nor dark brown (#2e2013):
      a non-numeric text shorter than 3 characters (other than "■") is a
      starting character; otherwise a text containing "■" is an ending symbol.
    * Black / dark-brown spans that are larger than 12 or use the font
      "AmasisMTStd-Bold" are headlines.

    Args:
        page: Object whose ``get_text("dict", sort=True)`` returns a dict with
            a "blocks" list (a PyMuPDF page in this notebook).
        starting_characters: Optional list to which found starting characters
            are appended in place. Created on the first match when None.
        headlines: Optional list to which found headlines are appended in
            place. Created on the first match when None.
        searching_for_end: When true, matches are still printed and ending
            symbols counted, but nothing is appended to either list.

    Returns:
        Tuple ``(headlines, starting_characters, ending_symbols)`` where
        ``ending_symbols`` is the number of ending symbols on this page. A
        list stays None if it was passed as None and nothing matched for it.
    """
    ending_symbols = 0
    blocks = page.get_text("dict", sort=True)["blocks"]

    if not blocks:
        return headlines, starting_characters, ending_symbols

    for block in blocks:
        # Blocks without a "lines" entry (e.g. image blocks) contain no text.
        # Skipping them explicitly replaces a blanket `except KeyError: pass`
        # that could also hide genuine errors in text blocks.
        for line in block.get("lines", ()):
            for span in line["spans"]:
                raw_text = span["text"]
                text = raw_text.strip()
                if not text:
                    # Whitespace-only spans would otherwise end up as empty
                    # starting characters or headlines.
                    continue

                colour_code = f"#{span['color']:06x}"

                # Starting characters and ending symbols use the version
                # colour, i.e. anything that is not black or dark brown.
                if colour_code not in ("#2e2013", "#000000"):
                    # ------ Starting characters check ------
                    # Not a number, fewer than 3 characters, not the ending symbol
                    if not text.isdigit() and len(text) < 3 and text != "■":
                        print(f"Extracting starting character: {raw_text}")
                        if starting_characters is None:
                            starting_characters = []
                        if not searching_for_end:
                            starting_characters.append(text)
                    # ------ Ending symbol check ------
                    elif "■" in text:
                        print(f"Extracting ending symbol: {raw_text}")
                        ending_symbols += 1

                # ------ Header check ------
                # Font size is larger than 12 OR the font is the bold face
                elif span["size"] > 12 or span["font"] == "AmasisMTStd-Bold":
                    print(f"Extracting headline: {text}")
                    if headlines is None:
                        headlines = []
                    if not searching_for_end:
                        headlines.append(text)

    return headlines, starting_characters, ending_symbols


In [19]:
def clean_text(raw_text, starting_characters):
    """Clean the text from unwanted newlines and hyphens.

    The article starts at the first line that is exactly equal to
    ``starting_characters[0]``; everything before that line is dropped.
    From there on, lines are glued together:

    * a line ending in ".", "!", "?" or ":" keeps its line break,
    * a line ending in "-" loses that trailing hyphen,
    * any other line is appended unchanged (no separator is inserted).

    Finally, runs of consecutive spaces are collapsed to a single space.

    Args:
        raw_text: Plain text of a page, with lines separated by "\\n".
        starting_characters: Sequence whose first element marks the start of
            the article. May be None or empty.

    Returns:
        The cleaned article text, or "" when no starting character is given
        or the starting line does not occur in ``raw_text``.
    """
    # Without a starting character there is nothing to anchor the article on.
    if not starting_characters:
        return ""

    first_character = starting_characters[0]
    found_starting_character = False
    parts = []

    for line in raw_text.split("\n"):
        if not found_starting_character:
            # Start extracting article text with the first starting character
            if line == first_character:
                parts.append(line)
                found_starting_character = True
            # The starting line is handled here only, so a later line equal to
            # the starting character is not appended twice.
            continue

        if line and line[-1] in (".", "!", "?", ":"):
            # Sentence-like ending: keep the line break
            parts.append(line + "\n")
        elif line and line[-1] == "-":
            # Line ends in a hyphen: drop that trailing hyphen
            parts.append(line[:-1])
        else:
            parts.append(line)

    # Collapse runs of spaces into one. Building a new list avoids deleting
    # from a list while iterating over it, which skipped characters and left
    # longer runs only partially collapsed.
    collapsed = []
    for char in "".join(parts):
        if char == " " and collapsed and collapsed[-1] == " ":
            continue
        collapsed.append(char)

    return "".join(collapsed)

In [24]:
# Zero-based index of the page to inspect.
PAGE_INDEX = 2

# Load the page directly instead of iterating over the whole document.
# If the document has fewer pages, nothing is processed.
if PAGE_INDEX < len(src):
    page = src[PAGE_INDEX]
    headlines, starting_characters, ending_symbols = extract_headlines(page)
    print(f"Headlines: {headlines}")
    print(f"Starting characters: {starting_characters}")
    print(f"Ending symbols: {ending_symbols}")
    # extract_headlines returns None for the starting characters when none
    # were found; there is no article to anchor in that case.
    article = (
        clean_text(page.get_text("text"), starting_characters)
        if starting_characters
        else ""
    )
    print(f"Article: {article}")

Extracting headline: 3
Extracting headline: Bernd Pegritz
Extracting headline: «Rätsel» lösen mit Bildern
Extracting starting character: A
Extracting headline: Mehr Spaß
Extracting headline: macht’s auf
Extracting headline: Papier
Extracting ending symbol: ■
Headlines: ['3', 'Bernd Pegritz', '«Rätsel» lösen mit Bildern', 'Mehr Spaß', 'macht’s auf', 'Papier']
Starting characters: ['A']
Ending symbols: 1
Article: Anstatt die Hausaufgaben zu machen, habe ich als Kind mal das Logo von Mozilla Firefox nachgezeichnet. Irgendwann habe ich aufgeschnappt, dass es Leute gibt, die dafür bezahlt werden. Die Entscheidung Grafikdesigner zu werden, fiel früh, lange bevor ich wusste, was das genau ist.
In Tirol geboren und aufgewachsen, zog ich für das Masterstudium in Kommunikationsdesign nach Saarbrücken in Deutschland. Im Studium habe ich immer mehr Illustration eingebaut. Nun geht die Entwicklung weiter mit Animation, weil ich es spannend finde, wenn die Zeichnungen zum Leben erwachen und man in e