In [1]:
# Import necessary libraries
import base64
import json
import sys
import os
import fitz
import requests
import re
from fitz import Document, Page, Rect
from dotenv import load_dotenv

load_dotenv()

# SAVE_PATH is read from the process environment after load_dotenv() has run.
# Fail with a clear message instead of the opaque
# "NoneType + str" TypeError when it is missing.
save_root = os.environ.get("SAVE_PATH")
if save_root is None:
    raise KeyError(
        "SAVE_PATH is not set; define it in the environment or in a .env file"
    )

# os.path.join works whether or not SAVE_PATH ends with a separator; the
# trailing "" keeps the final separator that later cells rely on when they
# build file names with f"{global_save_path}page-...".
global_save_path = os.path.join(save_root, "AUG_590_Straw_umgebaut", "")
print(global_save_path)

/home/funny/Documents/Projects/230701-Parsing-Tool/augustin-plugin/sample_data/AUG_590_Straw_umgebaut/


In [2]:
# Open the sample PDF once; `src` and `page` are notebook-level globals that
# later cells read directly (first-page export, page loop, clean_text).
src = fitz.open("../sample_data/AUG_590_Straw_umgebaut/AUG_590_Straw_umgebaut.pdf")
# Page index 0 is the first page of the document.
page = src.load_page(0)

## Extract metadata

In [14]:
def parse_strawanzerin_headline(page):
    """Collect every large-font span on ``page`` as headline data.

    The page's text dictionary is scanned span by span. Each span whose
    font size is greater than 30 contributes three consecutive entries to
    the result: its text, its bounding box and its size.

    Args:
        page: Object exposing ``get_text("dict", flags=...)`` that returns
            a dictionary with ``blocks`` -> ``lines`` -> ``spans``.

    Returns:
        A flat list ``[text, bbox, size, text, bbox, size, ...]``; empty
        when no span is larger than 30.
    """
    text_dict = page.get_text("dict", flags=11)
    found = []
    for text_block in text_dict["blocks"]:
        for text_line in text_block["lines"]:
            large_spans = (sp for sp in text_line["spans"] if sp["size"] > 30)
            for sp in large_spans:
                # Flat layout on purpose: three list entries per headline.
                found.extend((sp["text"], sp["bbox"], sp["size"]))
    return found


def parse_first_page(
    path_to_new_directory,
    source_pdf="../sample_data/AUG_590_Straw_umgebaut/AUG_590_Straw_umgebaut.pdf",
):
    """Extract the first page's text region by region.

    The page is split at the top edge of the "gratis" headline: one region
    covers everything above it, and four column regions cover everything
    from that edge down to the bottom of the page. The text of the regions
    is concatenated in that order.

    Args:
        path_to_new_directory: Currently unused; kept so existing callers
            keep working.
        source_pdf: Path of the PDF to read. Defaults to the sample file
            this notebook was written against.

    Returns:
        The concatenated text of the five clip regions.

    Raises:
        ValueError: If the page has no headline at all, or none of its
            headlines reads "gratis" (case-insensitive).
    """
    # The context manager closes the document even when parsing fails.
    with fitz.open(source_pdf) as src:
        page = src.load_page(0)

        headlines = parse_strawanzerin_headline(page)
        if not headlines:
            # Previously this was only printed and the code then crashed with
            # an IndexError a few lines later.
            raise ValueError("Error: No headline found!")

        # `headlines` is flat: text, bbox, size repeated per headline. Look at
        # every headline instead of assuming "gratis" is the second one.
        gratis_bbox = None
        for headline_text, headline_bbox in zip(headlines[0::3], headlines[1::3]):
            if headline_text.strip().lower() == "gratis":
                gratis_bbox = headline_bbox
                break
        if gratis_bbox is None:
            raise ValueError("Error: No headline gratis found!")

        # Only the top edge of the headline is needed as the split line.
        split_y = gratis_bbox[1]
        page_width = page.rect.width
        page_height = page.rect.height

        clip_regions = [
            (0, 0, page_width, split_y),  # everything above the headline
            (0, split_y, 180, page_height),
            (180, split_y, 320, page_height),
            (320, split_y, 460, page_height),
            (460, split_y, page_width, page_height),
        ]
        return "".join(
            page.get_text("text", clip=clip_region) for clip_region in clip_regions
        )

In [28]:
def flags_decomposer(flags):
    """Translate a font-flag bit field into a readable description.

    Bits 0, 1 and 4 add "superscript", "italic" and "bold" when set.
    Bit 2 selects "serifed" over "sans" and bit 3 selects "monospaced"
    over "proportional", so those two always contribute one label each.

    Args:
        flags: Integer bit field.

    Returns:
        The applicable labels joined with ", ", in ascending bit order.
    """
    # (bit mask, label when the bit is set, label when it is clear or None)
    bit_labels = (
        (1, "superscript", None),
        (2, "italic", None),
        (4, "serifed", "sans"),
        (8, "monospaced", "proportional"),
        (16, "bold", None),
    )
    labels = []
    for mask, when_set, when_clear in bit_labels:
        label = when_set if flags & mask else when_clear
        if label is not None:
            labels.append(label)
    return ", ".join(labels)

In [35]:
def clean_text(text):
    """Print the style of every span on the global ``page`` and return its headlines.

    NOTE(review): despite its name this function does not clean anything and
    never reads ``text``; it inspects the notebook-level ``page`` variable.

    For each span one line is printed with its text, size, colour as a
    six-digit hex value, the decoded font flags and the font name. Spans
    with a font size greater than 30 are additionally recorded.

    Args:
        text: Unused.

    Returns:
        A flat list ``[text, bbox, size, ...]`` with three entries per span
        whose size is greater than 30.
    """
    large_span_data = []
    page_dict = page.get_text("dict", flags=11)
    for text_block in page_dict["blocks"]:
        for text_line in text_block["lines"]:
            for span in text_line["spans"]:
                print(
                    f'{span["text"]}, {span["size"]}, #{span["color"]:06x}, {flags_decomposer(span["flags"])}, {span["font"]}'
                )
                if span["size"] <= 30:
                    continue
                large_span_data.extend((span["text"], span["bbox"], span["size"]))
    return large_span_data

In [36]:
# parse_first_page returns the text clipped from the first page's regions.
text = parse_first_page(global_save_path)
# NOTE(review): clean_text does not read its argument; it prints span details
# of the global `page` and returns a headline list, so `text` is rebound from
# a string to that list here.
text = clean_text(text)
# Bare last expression: display the result as the cell output.
text

Strawanzerin, 75.5, #221f1f, serifed, proportional, bold, MyriadPro-BoldCond
Veranstaltungs-,, 12.0, #221f1f, serifed, proportional, bold, MyriadPro-BoldCond
Radio- und TV-Tipps von, 12.0, #221f1f, serifed, proportional, bold, MyriadPro-BoldCond
05.02. bis 18.02.24, 12.0, #221f1f, serifed, proportional, bold, MyriadPro-BoldCond
Mo, 05.02. , 14.0, #ee4c8e, serifed, proportional, bold, MyriadPro-Bold
LeSung, 11.0, #221f1f, serifed, proportional, bold, MyriadPro-BoldCond
elias Hirschl liest «Content», 9.0, #221f1f, serifed, proportional, bold, MyriadPro-BoldCond
Die Ich-Erzählerin des Romans , 9.0, #221f1f, serifed, proportional, MyriadPro-Cond
arbeitet im Listicle-Department einer , 9.0, #221f1f, serifed, proportional, MyriadPro-Cond
Content-Farm, die sich direkt über einer , 9.0, #221f1f, serifed, proportional, MyriadPro-Cond
ehemaligen Kohlenzeche befindet. Da , 9.0, #221f1f, serifed, proportional, MyriadPro-Cond
prallen die schöne neue Arbeitswelt und , 9.0, #221f1f, serifed, proporti

['Strawanzerin',
 (112.17839813232422,
  71.88687896728516,
  468.08538818359375,
  163.09088134765625),
 75.5,
 'gratis',
 (32.5984001159668, 623.96923828125, 121.22139739990234, 675.9132690429688),
 43.0]

## Extracting first page

In [3]:
occurence = page.search_for("Gratis")

# Exactly one hit is required: its top edge (y0) is the line at which the
# first page is split. Stop here instead of printing and then continuing
# with an undefined (or stale) y0.
if len(occurence) != 1:
    raise ValueError(
        "Error: Found more than one or no occurence of 'Gratis' on the first page"
    )
x0, y0, x1, y1 = occurence[0]

page_width = page.rect.width
page_height = page.rect.height

# One region above the "Gratis" hit, then four column regions below it.
first_page_clips = [
    (0, 0, page_width, y0),
    (0, y0, 180, page_height),
    (180, y0, 320, page_height),
    (320, y0, 460, page_height),
    (460, y0, page_width, page_height),
]

# Save each region as <save path>page-<page number><part number>.png,
# with part numbers starting at 1.
for part_number, clip_region in enumerate(first_page_clips, start=1):
    pix = page.get_pixmap(clip=clip_region)
    pix.save(f"{global_save_path}page-{page.number}{part_number}.png")

## Extract following pages

In [4]:
# Split every page after the first into column images. Spans with a font size
# above 30 are treated as section headlines and decide how a page is cut.


def _save_page_clips(page, clip_regions, save_path):
    """Render each clip region of ``page`` to ``<save_path>page-<number><k>.png``.

    ``k`` is the 1-based position of the region in ``clip_regions``.
    """
    for part_number, clip_region in enumerate(clip_regions, start=1):
        pix = page.get_pixmap(clip=clip_region)
        pix.save(f"{save_path}page-{page.number}{part_number}.png")


# Read page text as a dictionary, suppressing extra spaces in CJK fonts

for index, page in enumerate(src):
    # Flat list: text, bbox, size for every headline span on this page.
    headlines = []
    if index == 0:
        # Index 0 is the first page; this loop only covers the pages after it.
        continue
    blocks = page.get_text("dict", flags=11)["blocks"]
    for block in blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                if span["size"] > 30:
                    print(span["text"], span["bbox"], span["size"])
                    print(f"Page number: {index}")
                    headlines += span["text"], span["bbox"], span["size"]

    page_height = page.rect.height
    if len(headlines) == 0:
        # No headline: three columns over the whole page height.
        _save_page_clips(
            page,
            [
                (0, 0, 180, page_height),
                (180, 0, 320, page_height),
                (320, 0, 460, page_height),
            ],
            global_save_path,
        )
    elif len(headlines) == 3:
        # Exactly one headline (three list entries): six parts in total.
        # Parts 1-3 are the columns above the headline's top edge, parts 4-6
        # the columns from that edge down to the bottom of the page.
        # NOTE(review): the column borders here (160/300/440) differ from the
        # no-headline case (180/320/460) - confirm that this is intended.
        x0, y0, x1, y1 = headlines[1]
        _save_page_clips(
            page,
            [
                (0, 0, 160, y0),
                (160, 0, 300, y0),
                (300, 0, 440, y0),
                (0, y0, 160, page_height),
                (160, y0, 300, page_height),
                (300, y0, 440, page_height),
            ],
            global_save_path,
        )
    elif len(headlines) == 6:
        print("There are two headlines on the page")
        # TODO implement the case with two headlines (six list entries) on the page
    else:
        # Previously such pages were skipped without any notice.
        print(
            f"Page number: {index} has {len(headlines) // 3} headlines; "
            "no images were saved for it"
        )

ab 7 euro (280.91339111328125, 234.12710571289062, 419.2013854980469, 286.07110595703125) 43.0
Page number: 2
Augenschmaus (33.7322998046875, 469.8849792480469, 244.6123046875, 518.2049560546875) 40.0
Page number: 3
Ohrwurm (33.7322998046875, 46.95587921142578, 166.8083038330078, 95.27587890625) 40.0
Page number: 3
There are six headlines on the page


In [5]:
# Hello

# Steps
1. Get image from first page, upload it and use it as featured_media image for post
2. Add category "strawanzerin" to post
## First page
1. Search for word "Gratis" and get its coordinates
2. Extract the text below the y-coordinate of "Gratis" (either with a single get_text() call or in four separate column parts, so that the text comes out in the correct order)
3. Set the word "Gratis" as ## header
4. Let the dates either have the same colour as they are given or let them have a smaller header like ###
5. No clue what to do with "Tipp aus der Redaktion" -> for now add this whole text to the end of the post

## Next pages
1. Check for headlines by font size > 30. If found, look for next headers "ab 7 Euro, bis 7 Euro, Ohrwurm, Augenschmaus"
2. Same as in first page: Get coordinates from found header
3. "Virtually split" page by parsing through three text fields above the header (as part of the header section before) and three text fields below
   - Same option if two headers in one page 
4. Extract text -> Remove headline quotes such as "bis 7 Euro", "Gratis" right after text extraction

## Rightmost text part
- Extract the fourth (rightmost) text part: combine the text of the fourth part of the first page with the text of the fourth part of the following pages in one variable, after parsing for common headers
- Add first header to Gratis section
- Add headers with ab 7 euro to "Ab 7 Euro" section
- Add last information to the end of the whole post