In [1]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox,LTRect

In [2]:
# Materialise every page of the PDF into a list so that individual pages can be
# indexed (e.g. pages[10]) by later cells.
pages = list(extract_pages("skaven.pdf"))


In [42]:
# Labels returned by page_type().
SPEARHEAD_WARSCROLL = "spearhead_warscroll"
WARSCROLL = "warscroll"
UNKNOWN = "unknown"


def page_type(page):
    """
    Classify a page from the first text box that mentions "WARSCROLL".

    Text boxes are inspected in iteration order; non-text elements are skipped.
    The first box whose text contains "WARSCROLL" decides the result:
    SPEARHEAD_WARSCROLL when that box contains "SPEARHEAD WARSCROLL",
    otherwise WARSCROLL. If no box matches, UNKNOWN is returned.
    """
    text_boxes = (item for item in page if isinstance(item, LTTextBox))

    for box in text_boxes:
        contents = box.get_text()
        if "WARSCROLL" not in contents:
            continue

        # "SPEARHEAD WARSCROLL" also contains "WARSCROLL", so test the longer
        # phrase to tell the two kinds apart.
        return SPEARHEAD_WARSCROLL if "SPEARHEAD WARSCROLL" in contents else WARSCROLL

    return UNKNOWN

In [54]:
def get_name(page):
    """
    Return the unit name printed on a warscroll page.

    The unit name is always at the top of the scroll, and offset to the right,
    so only text boxes whose horizontal midpoint is at least 120 and whose
    vertical midpoint lies within [380, 420] are considered. Boxes containing
    "WARSCROLL" (the page heading) are ignored.

    The name may be split over several text boxes; the pieces are joined with
    ", " from top to bottom, and left to right for pieces on the same line.
    Returns an empty string when no matching text box is found.
    """

    # Collected as a list rather than a dict keyed by mid_y: two boxes on the
    # same line share a midpoint and would otherwise overwrite each other.
    fragments = []

    for element in page:
        if not isinstance(element, LTTextBox):
            continue

        mid_x = (element.x0 + element.x1) / 2
        if mid_x < 120:
            continue

        mid_y = (element.y0 + element.y1) / 2
        if mid_y < 380 or mid_y > 420:
            continue

        text = element.get_text()
        if "WARSCROLL" in text:
            continue

        # Collapse line breaks and runs of spaces inside the box.
        cleaned = " ".join(text.split())
        if not cleaned:
            # A whitespace-only box would only add a dangling ", ".
            continue

        fragments.append((mid_y, element.x0, cleaned))

    # Larger y is higher on the page, so sort descending by mid_y; ties are
    # resolved by x0 so that pieces on the same line read left to right.
    fragments.sort(key=lambda fragment: (-fragment[0], fragment[1]))
    return ", ".join(fragment[2] for fragment in fragments)

In [74]:
def get_characteristics(page):
    """
    Returns a tuple of (Move, Health, Save, Control, Banishment)

    Only text boxes in the top-left corner of the page (y0 >= 380 and
    x1 <= 75) are inspected. Each value is returned as a string and is left as
    "" when no matching box is found. All whitespace inside a box is removed
    before matching, so labels whose letters are spaced out (e.g. "M OVE",
    "CONT R O L") are still recognised.
    """

    move, health, save, control, banishment = "", "", "", "", ""

    for element in page:
        if not isinstance(element, LTTextBox):
            continue

        # Everything is in the top left, so ignore everything else
        if element.y0 < 380 or element.x1 > 75:
            continue

        text = "".join(element.get_text().strip().split())
        if "MOVE" in text:
            move = text.removeprefix("MOVE")
            continue

        if "CONTROL" in text:
            control = text.removesuffix("CONTROL")
            continue

        if "NISH" in text:
            # The label can be extracted as its own box, without the value.
            # Only slice when a "+" is actually present; otherwise
            # text.index("+") would raise ValueError and abort the whole page.
            if "+" in text:
                banishment = text[:text.index("+") + 1]
            continue

        if "+" in text:
            save = text
            continue

        if text.isnumeric():
            health = text
            continue

    return move, health, save, control, banishment

In [31]:
def get_keywords(page):
    """
    Return the list of keywords printed on a page.

    Keywords always appear at the bottom of the page, with y coordinates less
    than 40, so we just grab the text boxes that match. A box containing the
    "KEYWORDS" label itself is skipped.

    Each box is split on commas. Whitespace inside a keyword (including a line
    break where a keyword list wraps) is collapsed to a single space, and
    empty entries (from trailing commas or whitespace-only boxes) are dropped.
    """

    keywords = []

    for element in page:
        if not isinstance(element, LTTextBox):
            continue

        if element.y0 > 40 or element.y1 > 40:
            continue

        text = element.get_text()
        if "KEYWORDS" in text:
            continue

        # Split on the bare comma rather than ", " so that "A,\nB" and "A,B"
        # are separated as well as "A, B".
        for chunk in text.split(","):
            keyword = " ".join(chunk.split())
            if keyword:
                keywords.append(keyword)

    return keywords

In [3]:
# Exploration cell: dump the bounding box of every rectangle on pages[10],
# followed by the bounding box and text of every text box on the same page.
print("Rectangles:")
for element in pages[10]:
    if not isinstance(element, LTRect):
        continue
    print(f"({element.x0},{element.y0}) -> ({element.x1}, {element.y1})")

print("\nContents:")
for element in pages[10]:
    if not isinstance(element, LTTextBox):
        continue
    print(f"({element.x0},{element.y0}) -> ({element.x1}, {element.y1}): {element.get_text()}")


Rectangles:
(157.802,149.59699999999998) -> (203.308, 159.628)
(157.552,159.87800000000001) -> (158.052, 299.547)
(293.163,159.87800000000001) -> (293.663, 299.547)
(157.552,299.547) -> (203.308, 300.047)
(203.309,299.547) -> (293.663, 300.047)
(9.434,282.787) -> (145.545, 283.287)
(9.434,211.788) -> (9.934, 282.788)
(145.045,211.788) -> (145.545, 282.788)

Contents:
(39.220200000000006,416.9619) -> (55.436329, 436.87175440000004): M OVE
10"

(21.3133974,403.8253428) -> (27.327472, 422.44721209999994): H
T
L
A
E

(27.674200000000006,402.4029) -> (38.90395000000001, 413.9029): 13

(22.7762478,400.12360390000003) -> (28.951172, 401.6449199): H

(67.4937747,414.9632188) -> (72.9683531, 416.4501572): S

(56.2287,402.4029) -> (68.7522, 413.9029): 4+

(68.1712728,402.840102) -> (73.4139026, 413.7721883): A
V
E

(32.8498109,381.3475192) -> (58.4438185, 398.4584): 5
CONT R O L

(17.3162,341.03090000000003) -> (90.4007, 359.5376): RANGED WEAPONS
Doomstar

(20.3289,318.3538) -> (87.3878999999999

In [88]:
def assemble_rectangles(page, verbose=True):
    """
    Group the rectangles on a page into sets of connected corner points.

    Every LTRect on the page is reduced to integer coordinates (via round())
    and represented by two corners: (x0, y0) and (x1, y1). Two rectangles are
    connected when they share one of those corner points, and connectivity is
    transitive: if A touches C and B touches C, then A, B and C end up in the
    same group even when A and B share no corner themselves.

    Parameters:
        page: iterable of layout elements; anything that is not an LTRect is
            ignored.
        verbose: when True (the default) print the sorted rectangles and each
            group as it is completed, for debugging.

    Returns:
        A list of sets of (x, y) corner tuples, one set per group. Groups are
        ordered by their first rectangle when rectangles are sorted by x0.
        Every rectangle belongs to exactly one group.
    """
    rectangles = sorted(
        (
            (round(element.x0), round(element.x1), round(element.y0), round(element.y1))
            for element in page
            if isinstance(element, LTRect)
        ),
        key=lambda r: r[0],
    )
    if verbose:
        for r in rectangles:
            print(r)

    # (lower-left, upper-right) corner pair for each rectangle.
    corners = [((r[0], r[2]), (r[1], r[3])) for r in rectangles]

    groups = []
    assigned = set()
    for i, (low, high) in enumerate(corners):
        if i in assigned:
            continue

        assigned.add(i)
        current_group = {low, high}

        # Keep sweeping until a full pass adds nothing. A single forward pass
        # misses rectangles that only become reachable through a corner added
        # later in the same pass.
        grew = True
        while grew:
            grew = False
            for j, (p_low, p_high) in enumerate(corners):
                if j in assigned:
                    # Already claimed; never put a rectangle in two groups.
                    continue
                if p_low in current_group or p_high in current_group:
                    assigned.add(j)
                    current_group.update((p_low, p_high))
                    grew = True

        if verbose:
            print("Adding group:", current_group)
        groups.append(current_group)

    return groups

In [79]:
# Exploration cell: dump the bounding box of every rectangle on pages[10].
print("Rectangles:")
for element in pages[10]:
    if not isinstance(element, LTRect):
        continue
    print(f"({element.x0},{element.y0}) -> ({element.x1}, {element.y1})")

Rectangles:
(157.967,180.337) -> (203.473, 197.368)
(157.717,295.618) -> (203.473, 296.118)
(157.717,197.618) -> (158.217, 295.618)
(203.474,295.618) -> (293.828, 296.118)
(293.328,197.618) -> (293.828, 295.618)
(9.434,273.487) -> (9.934, 311.048)
(145.045,273.487) -> (145.545, 311.048)
(9.434,311.048) -> (145.545, 311.548)
(9.434,255.205) -> (145.545, 255.705)
(9.434,193.205) -> (9.934, 255.205)
(145.045,193.205) -> (145.545, 255.205)
(157.554,49.902) -> (158.054, 162.571)
(293.165,49.902) -> (293.665, 162.571)
(157.554,162.571) -> (293.66499999999996, 163.071)
(9.682,55.49700000000001) -> (55.188, 65.528)
(9.432,65.77800000000002) -> (9.932, 175.116)
(145.043,65.77800000000002) -> (145.543, 175.116)
(9.432,175.116) -> (55.188, 175.616)
(55.189,175.116) -> (145.543, 175.616)


In [89]:
# Try the rectangle grouping on pages[6]; the returned list of corner-point
# sets is shown as the cell result.
assemble_rectangles(pages[6])

(9, 10, 273, 311)
(9, 146, 311, 312)
(9, 146, 255, 256)
(9, 10, 193, 255)
(9, 10, 66, 175)
(9, 55, 175, 176)
(10, 55, 55, 66)
(55, 146, 175, 176)
(145, 146, 273, 311)
(145, 146, 193, 255)
(145, 146, 66, 175)
(158, 203, 180, 197)
(158, 203, 296, 296)
(158, 158, 198, 296)
(158, 158, 50, 163)
(158, 294, 163, 163)
(203, 294, 296, 296)
(293, 294, 198, 296)
(293, 294, 50, 163)
Adding group: {(10, 311), (9, 273)}
Adding group: {(146, 312), (9, 311)}
Adding group: {(146, 256), (9, 255)}
Adding group: {(9, 193), (10, 255)}
Adding group: {(9, 66), (10, 175)}
Adding group: {(55, 176), (9, 175)}
Adding group: {(10, 55), (55, 66)}
Adding group: {(55, 175), (146, 176)}
Adding group: {(146, 311), (145, 273)}
Adding group: {(146, 255), (145, 193)}
Adding group: {(146, 175), (145, 66)}
Adding group: {(158, 180), (203, 197)}
Adding group: {(203, 296), (158, 296), (294, 296), (158, 198), (293, 198)}
Adding group: {(158, 163), (294, 163), (293, 50), (158, 50)}


[{(9, 273), (10, 311)},
 {(9, 311), (146, 312)},
 {(9, 255), (146, 256)},
 {(9, 193), (10, 255)},
 {(9, 66), (10, 175)},
 {(9, 175), (55, 176)},
 {(10, 55), (55, 66)},
 {(55, 175), (146, 176)},
 {(145, 273), (146, 311)},
 {(145, 193), (146, 255)},
 {(145, 66), (146, 175)},
 {(158, 180), (203, 197)},
 {(158, 198), (158, 296), (203, 296), (293, 198), (294, 296)},
 {(158, 50), (158, 163), (293, 50), (294, 163)}]