In [11]:
import fitz
import pandas as pd
import cv2
import json
# allow pandas to display everything in ipynb
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

doc_id = "a13245ab-97c7-4724-a7b5-2d3f403a561a"

metadata_path = f"/Users/bigo/Projects/timbergem/data/processed/{doc_id}/symbols/symbols_metadata.json"
project_data_path = f"/Users/bigo/Projects/timbergem/data/processed/{doc_id}/project_data.json"
document_path = f"/Users/bigo/Projects/timbergem/data/processed/{doc_id}/original.pdf"
legend_path = f"/Users/bigo/Projects/timbergem/data/processed/{doc_id}/clippings/page1/SymbolLegend_1_clipping.png"

"""
Debug the symbol coordinates in the legend by loading the legend image and applying the coordinates in metadata.symbols[].coordinates.legend_relative (utilizing left, top, width, height). Saves a clipping from the legend image in "symbol_debug"
"""
def debug_symbol_coordinates():
    with open(metadata_path, "r") as f:
        metadata_json = json.load(f)
    symbols = metadata_json["symbols"]
    metadata = pd.DataFrame(symbols)
    legend_image = cv2.imread(legend_path)

    for i, symbol in metadata.iterrows():
        coordinates = symbol['coordinates']['legend_relative']
        left = coordinates['left']
        top = coordinates['top']
        width = coordinates['width']
        height = coordinates['height']
        # extract the symbol clipping from the legend image
        symbol_clipping = legend_image[top:top + height, left:left + width]
        # save the symbol clipping as an image
        symbol_image_path = f"symbol_debug/image_symbol_{i}.png"
        cv2.imwrite(symbol_image_path, symbol_clipping)
        print(f"Saved symbol clipping to {symbol_image_path}")
debug_symbol_coordinates()


Saved symbol clipping to symbol_debug/image_symbol_0.png


In [12]:

"""
calculates the symbol dimensions in 300 dpi based on the contours of the symbol clippings
references each symbol in the metadata json "symbols" list. Utilizes "source_legend.page_number"(1-indexed) and "coordinates.pdf_absolute" to find the symbol clipping in the pdf.
prints the height and width of each symbol
saves the contour clippings in "symbol_debug" dir in root of the project
"""
def calculate_symbol_dimensions():
    with open(metadata_path, "r") as f:
        metadata_json = json.load(f)
    symbols = metadata_json["symbols"]
    metadata = pd.DataFrame(symbols)
    document = fitz.open(document_path)

    for i, symbol in metadata.iterrows():
        page_number = symbol['source_legend']['page_number'] - 1  # convert to 0-indexed
        coordinates = symbol['coordinates']['pdf_absolute']
        legend_id = symbol['source_legend']['legend_id']
        # find the coords/dims of the source legend in project_data.json
        with open(project_data_path, "r") as f:
            project_data = json.load(f)
        key_areas = project_data["projectData"]['keyAreas']
        for k, v in key_areas.items():
            for area in v:
                if area["id"] == legend_id:
                    area_left, area_top, area_width, area_height = area["left"], area["top"], area["width"], area["height"]
                    break
        if not area_left or not area_top or not area_width or not area_height:
            print(f"Legend ID {legend_id} not found in project_data.json")
            continue
        # pdf_absolute contains left, top, width, and height
        left = coordinates['left'] + area_left
        top = coordinates['top'] + area_top
        width = coordinates['width']
        height = coordinates['height']
        # calculate the dimensions in 300 dpi
        dpi = 300
        width_in_inches = width / dpi
        height_in_inches = height / dpi
        print(f"Symbol {i}: Width: {width_in_inches:.2f} inches, Height: {height_in_inches:.2f} inches")
        # calculate the dimensions in 300dpi pixels
        width_in_pixels = int(width * dpi / 72)
        height_in_pixels = int(height * dpi / 72)
        print(f"Symbol {i}: Width: {width_in_pixels} pixels, Height: {height_in_pixels} pixels")
        # extract the symbol clipping from the pdf
        page = document[page_number]
        symbol_rect = fitz.Rect(left, top, left + width, top + height)
        area_rect = fitz.Rect(area_left, area_top, area_left + area_width, area_top + area_height)
        # draw the rectangle on the page for debugging
        page.draw_rect(area_rect, color=(0, 1, 0), width=1)
        page.draw_rect(symbol_rect, color=(1, 0, 0), width=1)
        pix = page.get_pixmap(dpi=dpi)
        # save the symbol clipping as an image
        symbol_image_path = f"symbol_debug/pdf_symbol_{i}.png"
        pix.save(symbol_image_path)
        print(f"Saved symbol clipping to {symbol_image_path}")

calculate_symbol_dimensions()


Symbol 0: Width: 0.02 inches, Height: 0.02 inches
Symbol 0: Width: 19 pixels, Height: 18 pixels
Saved symbol clipping to symbol_debug/pdf_symbol_0.png


In [3]:
"""
"left": 275.5,
"top": 45.359375,
"width": 193,
"height": 179,
"""
import fitz

# load /Users/bigo/Projects/timbergem/data/Gasc_annot_debug.pdf and draw a rectangle at (275.5, 45.359375) with width 193 and height 179. Save the page as an image in "symbol_debug" dir
def draw_debug_rectangle():
    doc = fitz.open("/Users/bigo/Projects/timbergem/data/Gasc_annot_debug.pdf")
    page = doc[0]  # first page
    left = 275.5
    top = 45.359375
    width = 193
    height = 179
    rect = fitz.Rect(left, top, left + width, top + height)
    rect = fitz.Rect(595, 98, 1011, 484)
    # draw the rectangle on the page for debugging
    page.draw_rect(rect, color=(1, 0, 0), width=1)
    pix = page.get_pixmap(dpi=300)
    # save the page as an image
    debug_image_path = "symbol_debug/debug_rectangle.png"
    pix.save(debug_image_path)
    print(f"Saved debug rectangle image to {debug_image_path}")
draw_debug_rectangle()

Saved debug rectangle image to symbol_debug/debug_rectangle.png
