In [1]:
import json
import os
from PIL import Image
import pytesseract

In [2]:
# Number of equal-height letter cells each column image is sliced into
# (used as the default cell count when cropping a column for OCR).
VERTICAL_LETTERS = 32

In [3]:
def load_columns(directory="./out/shred_pairs"):
    """Load every ``.bmp`` column image found in ``directory``.

    File names are expected to look like ``<prefix>_<i>_<j>.bmp``: the part
    before the first dot is split on underscores and must yield exactly three
    pieces, the last two of which are parsed as integers.

    Files are visited in sorted (lexicographic) file-name order so that the
    returned list -- and anything derived from it -- is reproducible;
    ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A list of ``(indices, image)`` tuples where ``indices`` is the
        two-element list ``[i, j]`` parsed from the file name and ``image`` is
        the fully loaded PIL image.

    Raises:
        AssertionError: If a ``.bmp`` file name does not split into three
            underscore-separated parts.
        ValueError: If either index part is not an integer.
    """
    images_data = []
    # Sorted iteration makes the column order deterministic across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".bmp"):
            continue

        # Extract the underscore-separated parts that precede the extension.
        parts = filename.split('.')[0].split('_')
        assert len(parts) == 3, (
            f"unexpected file name {filename!r}: expected '<prefix>_<i>_<j>.bmp'"
        )
        indices = [int(part) for part in parts[1:]]

        image_path = os.path.join(directory, filename)
        image = Image.open(image_path)
        # Image.open is lazy and keeps the file handle open; load() reads the
        # pixel data now and lets Pillow release the handle, so scanning a big
        # directory does not exhaust file descriptors.
        image.load()
        images_data.append((indices, image))

    return images_data

In [4]:
# Load all column images from the default directory as (indices, image) tuples.
columns = load_columns()

In [5]:
#columns[0]

In [6]:
# for c in columns:
#     if c[0][1] == 189:
#         print(c)

In [7]:
def letter_at_idx_from_column(column, index, total_vertical_letters=VERTICAL_LETTERS):
    """OCR the ``index``-th letter cell of a column image.

    The image is divided into ``total_vertical_letters`` horizontal bands of
    equal (integer-divided) height; the last band is stretched to the bottom
    edge so leftover pixel rows from the division are not dropped.

    Args:
        column: Image object exposing ``.size`` and ``.crop`` (a PIL image).
        index: Zero-based band number, counted from the top.
        total_vertical_letters: How many bands the image is divided into.

    Returns:
        The recognized text, stripped and upper-cased, or a single space when
        Tesseract recognizes nothing.

    Raises:
        IndexError: If ``index`` lies outside ``[0, total_vertical_letters)``.
    """
    if index < 0 or total_vertical_letters <= index:
        raise IndexError("Index out of range")

    img_width, img_height = column.size
    band_height = img_height // total_vertical_letters

    # Vertical extent of the requested band.
    y_start = index * band_height
    if index < total_vertical_letters - 1:
        y_end = (index + 1) * band_height
    else:
        # Final band: run to the image bottom to absorb the division remainder.
        y_end = img_height

    cell = column.crop((0, y_start, img_width, y_end))

    # Single-character page segmentation (--psm 10) restricted to a whitelist.
    # NOTE(review): the trailing "\ " looks like an escaped space meant to put
    # a blank into the whitelist -- confirm against how pytesseract splits it.
    ocr_options = r'-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ.,\  --psm 10'
    recognized = pytesseract.image_to_string(cell, config=ocr_options).strip().upper()

    # An empty OCR result is reported as a blank cell.
    return recognized or " "

In [8]:
def column_to_letters(column):
    """Return the OCR result for every letter cell of one loaded column.

    Args:
        column: Indexable entry whose element ``1`` is the column image
            (the ``(indices, image)`` tuples produced by ``load_columns``).

    Returns:
        A list of ``VERTICAL_LETTERS`` strings, ordered top to bottom.
    """
    recognized = []
    for row in range(VERTICAL_LETTERS):
        recognized.append(letter_at_idx_from_column(column[1], row))
    return recognized

In [9]:
def columns_to_matrix_of_letters(columns):
    """Convert each loaded column into its list of recognized letters.

    Args:
        columns: Iterable of column entries accepted by ``column_to_letters``.

    Returns:
        A list with one inner list of letters per column, in input order.
    """
    return [column_to_letters(entry) for entry in columns]

In [10]:
# Run OCR over every loaded column. This issues one Tesseract call per letter
# cell (VERTICAL_LETTERS calls per column), so the cell may be slow to run.
letters_matrix = columns_to_matrix_of_letters(columns)

In [12]:
# Persist the recognized letters (one inner list per column) as indented JSON.
with open("./out/letters_matrix.json", 'w') as jf:
    json.dump(letters_matrix, jf, indent=4)