In [288]:
from collections import defaultdict
from pathlib import Path
import pickle
from typing import List, Dict, Optional

from PIL import Image, ImageDraw

In [220]:
# Root directory of the crawled dataset used by this notebook.
EXTENDED_DATA_PATH = Path("/tmp/knn/crawled-data-v2.5/extended_output_data")

# Outline/label colours, indexed by position while sections are being drawn.
COLORS = ["red", "blue", "green", "orange", "pink", "violet"]

# Per-website (x, y) correction that is added to every bounding box before it
# is drawn. Websites that are not listed fall back to (0, 0), i.e. no shift.
# NOTE(review): 'pravda' is the only site with a positive x shift — confirm
# that this is intentional and not a sign typo.
OFFSETS = defaultdict(
    lambda: (0, 0),
    aha=(-7, 0),
    auto=(-7, 0),
    avmania=(-7, 0),
    blesk=(-7, 0),
    connect=(-7, 0),
    doupe=(-7, 0),
    e15=(-6, 0),
    idnes=(-6, 0),
    isport=(-6, 0),
    lidovky=(-6, 0),
    lupa=(-6, 0),
    mobilmania=(-6, 0),
    pravda=(6, 0),
    sme=(-6, 0),
    vtm=(-7, 0),
    zive=(-6, 0),
)

In [62]:
def apply_offset(offset: List[float], bounding_box: List[float]):
    """Translate a bounding box by a two-component offset.

    Args:
        offset: Shift to apply; element 0 is added to the coordinates at
            indices 0 and 2, element 1 to the coordinates at indices 1 and 3.
        bounding_box: Sequence with at least four coordinates.

    Returns:
        A new four-element list with the shifted coordinates. The input
        ``bounding_box`` is not modified.
    """
    dx, dy = offset[0], offset[1]
    shifts = (dx, dy, dx, dy)
    return [bounding_box[index] + shifts[index] for index in range(4)]

In [291]:
def _box_contains(outer: List[float], inner: List[float], slack: float = 0) -> bool:
    """Return True when ``inner`` lies inside ``outer``.

    ``slack`` relaxes only the index-2 and index-3 edges of ``inner`` (it is
    subtracted from them before the comparison).
    """
    return (
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] - slack <= outer[2]
        and inner[3] - slack <= outer[3]
    )


def apply_higher_level_fixes(website_name: str, bounding_boxes_value: Dict[str, Optional[List[float]]]) -> Dict[str, Optional[List[float]]]:
    """Apply website-specific corrections to one set of section bounding boxes.

    Boxes are presumably ``[x0, y0, x1, y1]`` — confirm against the crawler output.

    Fixes applied:
      * Zive family ("zive", "doupe", "vtm", "mobilmania", "connect",
        "avmania"): if the author box lies inside the date box, index 0 of the
        date box is moved to index 2 of the author box; otherwise index 0 of
        the date box is moved to index 0 of the author box.
      * "pravda": if the author box lies inside the date box (with 1 unit of
        slack on the index-2/index-3 edges), index 1 of the date box is moved
        to index 3 of the author box.
      * "sme": 'parent_reference' is set to None.

    The geometric fixes are skipped when 'date_published' or 'author_name' is
    missing or None, instead of raising.

    Args:
        website_name: Key identifying the source website.
        bounding_boxes_value: Mapping of section name to bounding box (or None).

    Returns:
        The same ``bounding_boxes_value`` dict. Both the dict and the
        'date_published' list are modified IN PLACE. The Zive-family fix is
        not idempotent: a second call on the same data sees the already-moved
        edge and may change it again.
    """
    date_bounding_box = bounding_boxes_value.get('date_published')
    author_bounding_box = bounding_boxes_value.get('author_name')

    # The geometric fixes need both boxes; either may legitimately be None.
    if date_bounding_box is not None and author_bounding_box is not None:
        # Zive and child servers:
        if website_name in ("zive", "doupe", "vtm", "mobilmania", "connect", "avmania"):
            if _box_contains(date_bounding_box, author_bounding_box):
                # Fix published date bounding box, when there is author name bounding box in it
                date_bounding_box[0] = author_bounding_box[2]
            else:
                # Fix published date bounding box, when it is full-width sized, so part of the profile picture is in there
                date_bounding_box[0] = author_bounding_box[0]
        # Pravda: fix published date bounding box, when there is author name bounding box in it
        elif website_name == "pravda":
            if _box_contains(date_bounding_box, author_bounding_box, slack=1):
                date_bounding_box[1] = author_bounding_box[3]

    # Sme: remove parent references
    if website_name == "sme":
        bounding_boxes_value['parent_reference'] = None

    return bounding_boxes_value

In [292]:
def draw_bounding_boxes(website_name: str, article_id: int, page: int):
    """Show a page screenshot with its labelled section bounding boxes drawn on it.

    Reads ``bounding-boxes/{page}.pickle`` and ``screenshot/{page}.png`` from
    ``EXTENDED_DATA_PATH/{website_name}/{article_id}/``, applies the
    website-specific fixes and offset to every box, outlines each drawn
    section in its own colour with the section name above it, and opens the
    result with ``Image.show()``.

    Args:
        website_name: Directory name of the website; also the key used for
            ``OFFSETS`` and for the website-specific fixes.
        article_id: Directory name of the article below the website directory.
        page: Base name (without extension) of the pickle and screenshot files.

    Returns:
        None. The annotated image is only displayed, not saved or returned.
    """
    drawn_sections = ('wrapper', "date_published", "text", "author_name", "parent_reference")
    article_dir = Path(EXTENDED_DATA_PATH) / website_name / str(article_id)

    # SECURITY: pickle.load can execute arbitrary code — only use this on
    # pickles produced by a trusted pipeline.
    with open(article_dir / "bounding-boxes" / f"{page}.pickle", 'rb') as f:
        bounding_boxes_data = pickle.load(f)

    # Loop-invariant: the offset depends only on the website.
    offset = OFFSETS[website_name]

    # The context manager closes the underlying screenshot file when done.
    with Image.open(article_dir / "screenshot" / f"{page}.png") as image:
        canvas = ImageDraw.Draw(image)

        for bounding_boxes_value in bounding_boxes_data.values():
            bounding_boxes_value = apply_higher_level_fixes(website_name, bounding_boxes_value)

            # Colours restart for every entry and advance only for sections
            # that are actually drawn, so at most len(drawn_sections) are used.
            color_counter = 0
            for section_name, bounding_box in bounding_boxes_value.items():
                if section_name not in drawn_sections or bounding_box is None:
                    continue

                bounding_box = apply_offset(offset, bounding_box)
                # Place the label just above the top-left corner of the box.
                text_position = [bounding_box[0], bounding_box[1] - 11]
                color = COLORS[color_counter]

                canvas.rectangle(bounding_box, outline=color)
                # The former ``fontsize=5`` argument is not a documented
                # ImageDraw.text parameter, so it was dropped; pass ``font=``
                # to control the label size.
                canvas.text(text_position, section_name, fill=color)

                color_counter += 1

        image.show()

In [294]:
# Preview the annotated screenshot: page 1 of article 9 from the "vtm" site.
draw_bounding_boxes(website_name="vtm", article_id=9, page=1)