In [12]:
import os
import string
import numpy as np
from scipy.io import loadmat, savemat
from PIL import Image
from tqdm import tqdm    # progress bar

# Upper bound on the number of images written out; -1 disables the limit.
MAX_IMAGES = 20000

# Source dataset root and its ground-truth file.
SYNTH_ROOT = "datasets/SynthText"
ORIG_GT = os.path.join(SYNTH_ROOT, "gt.mat")

# Output root for the resized subset: an images/ folder plus a rewritten gt.mat.
RESIZED_ROOT = "datasets/SynthText_sub_resized_4yolo"
IMG_DIR = os.path.join(RESIZED_ROOT, "images")
RESIZED_GT = os.path.join(RESIZED_ROOT, "gt.mat")

# Characters a sample may contain: ASCII letters and digits only.
ALLOWED = set(string.ascii_letters) | set(string.digits)

# Size handed to PIL's resize, i.e. (width, height).
TARGET_SIZE = (448, 448)

# Create the image output directory (and any missing parents) up front.
os.makedirs(IMG_DIR, exist_ok=True)

In [10]:
def extract_chars(txt_list):
    """
    Function name: extract_chars
    Description: Flatten text segments into one ordered list of characters, dropping every whitespace character.
    Parameters:
        txt_list (iterable): Segments to scan; each one is converted with str() before iteration.
    Return Value:
        list[str]: The non-whitespace characters, in segment order and then in character order.
    """
    return [ch for segment in txt_list for ch in str(segment) if not ch.isspace()]

def is_allowed(chars, allowed_set=ALLOWED):
    """
    Function name: is_allowed
    Description: Report whether every character belongs to the permitted set.
    Parameters:
        chars (iterable[str]): Characters to validate.
        allowed_set (set[str]): Permitted characters; defaults to the module-level ALLOWED.
    Return Value:
        bool: False as soon as one character is outside allowed_set, True otherwise
              (including when chars is empty).
    """
    for ch in chars:
        if ch not in allowed_set:
            return False
    return True

def split_and_clean(substrings):
    """
    Function name: split_and_clean
    Description: Break each raw segment into whitespace-delimited words and collect them in order.
    Parameters:
        substrings (iterable): Raw segments; each one is converted with str() before splitting.
    Return Value:
        list[str]: Non-empty words without surrounding whitespace, in segment order.
    """
    # str.split() with no separator already discards empty pieces and
    # leading/trailing whitespace, so no extra strip or emptiness check is needed.
    return [word for segment in substrings for word in str(segment).split()]


In [3]:
# Read the original ground truth and unpack the four per-image fields into
# parallel Python lists (one entry per image):
#   imnames  - relative image path, e.g. "8/ballet_106_38.jpg"
#   charBBs  - character boxes, expected shape (2, 4, n_chars)
#   wordBBs  - word boxes, expected shape (2, 4, n_words)
#   raw_txts - text annotation, either an np.ndarray of strings or a single str
# NOTE(review): squeeze_me=True asks scipy to squeeze unit dimensions, so an
# entry holding a single box may arrive as (2, 4) -- confirm downstream handling.
data = loadmat(ORIG_GT, squeeze_me=True, struct_as_record=False)
imnames, charBBs, wordBBs, raw_txts = (
    list(data[field].flatten())
    for field in ('imnames', 'charBB', 'wordBB', 'txt')
)


In [15]:
# Build the resized subset: for every source image whose text is purely
# alphanumeric, resize it to TARGET_SIZE, rescale its character boxes, rebuild
# axis-aligned word boxes from them, save the image, and record the annotations.
# Outputs (parallel lists, one entry per kept image):
#   new_imnames - saved file name (basename only)
#   new_charBB  - float array (2, 4, n_chars) in resized-image pixels
#   new_wordBB  - float array (2, 4, n_words) in resized-image pixels
#   new_txt     - list of words
new_imnames, new_charBB, new_wordBB, new_txt = [], [], [], []

for idx in tqdm(range(len(imnames)), desc="Resizing..."):
    name    = imnames[idx]
    raw_txt = raw_txts[idx]

    # 1. turn raw_txt into a Python list of substrings
    #    (a lone annotation arrives as a plain str rather than an array)
    if isinstance(raw_txt, np.ndarray):
        substrs = [str(x) for x in raw_txt.flatten()]
    else:
        substrs = [str(raw_txt)]

    # 2. flatten all chars & filter
    #    An empty char list would pass is_allowed (all() of nothing is True)
    #    and later hit min()/max() on an empty array, so skip it explicitly.
    chars = extract_chars(substrs)
    if not chars or not is_allowed(chars):
        continue

    bb = np.asarray(charBBs[idx])
    # The gt file is loaded with squeeze_me=True, which collapses a
    # single-character box array from (2, 4, 1) to (2, 4); restore the
    # trailing axis so shape[2] is always valid.
    if bb.ndim == 2:
        bb = bb[:, :, np.newaxis]
    if bb.ndim != 3 or bb.shape[2] != len(chars):
        continue

    # 3. open + resize image
    src_path = os.path.join(SYNTH_ROOT, name)
    if not os.path.isfile(src_path):
        continue
    # Context manager releases the source file handle deterministically;
    # convert() returns an independent image that outlives the with-block.
    with Image.open(src_path) as src_img:
        img = src_img.convert("RGB")
    w0, h0    = img.size
    scale_x   = TARGET_SIZE[0] / w0
    scale_y   = TARGET_SIZE[1] / h0
    img_res   = img.resize(TARGET_SIZE, Image.BILINEAR)

    # 4. scale charBB (astype returns a copy, so the loaded array is untouched)
    bb2        = bb.astype(float)
    bb2[0, :, :] *= scale_x
    bb2[1, :, :] *= scale_y

    # 5. split substrings to cleaned words + per-word char counts
    words       = split_and_clean(substrs)
    char_counts = [len(w) for w in words]
    if sum(char_counts) != bb2.shape[2]:
        # fallback: treat entire line as one word
        words       = [''.join(chars)]
        char_counts = [len(chars)]

    # 6. rebuild wordBB by grouping charBB
    #    Each word covers a contiguous run of characters, so a slice suffices;
    #    the box is the axis-aligned extent of all corners in that run.
    word_boxes = []
    start = 0
    for cnt in char_counts:
        stop = start + cnt
        xs   = bb2[0, :, start:stop]
        ys   = bb2[1, :, start:stop]
        x0, x1 = xs.min(), xs.max()
        y0, y1 = ys.min(), ys.max()
        # four corners: (x0,y0),(x1,y0),(x1,y1),(x0,y1)
        word_boxes.append(np.array([[x0, x1, x1, x0],
                                    [y0, y0, y1, y1]], dtype=float))
        start = stop
    wb2 = np.stack(word_boxes, axis=2)  # shape (2,4,len(words))

    # 7. save resized image
    # NOTE(review): basename drops the source subfolder, so two source images
    # with the same file name in different folders would overwrite each other
    # -- confirm basenames are unique across the dataset.
    out_name = os.path.basename(name)
    dst_path = os.path.join(IMG_DIR, out_name)
    img_res.save(dst_path)

    # 8. record
    new_imnames.append(out_name)
    new_charBB.append(bb2)
    new_wordBB.append(wb2)
    new_txt.append(words)

    # 9. limit the dataset generation. May cause the progress bar to end abruptly.
    if MAX_IMAGES != -1 and len(new_imnames) >= MAX_IMAGES:
        break

#  end loop


Resizing...:  10%|█████▉                                                       | 82834/858750 [03:23<31:47, 406.80it/s]


In [14]:
# Pack the per-image lists into 1xN object arrays (one slot per kept image)
# and write them to the resized ground-truth file under the original field names.
N = len(new_imnames)
im_arr, cbb_arr, wbb_arr, txt_arr = (
    np.empty((1, N), dtype=object) for _ in range(4)
)

# Fill slot by slot: assigning element-wise stores each array/list as a single
# object instead of letting numpy broadcast it across the container.
for i, (fname, char_bb, word_bb, word_list) in enumerate(
        zip(new_imnames, new_charBB, new_wordBB, new_txt)):
    im_arr[0, i]  = fname
    cbb_arr[0, i] = char_bb
    wbb_arr[0, i] = word_bb
    txt_arr[0, i] = np.array(word_list, dtype=object)

gt_out = {
    'imnames': im_arr,
    'charBB':  cbb_arr,
    'wordBB':  wbb_arr,
    'txt':     txt_arr,
}
savemat(RESIZED_GT, gt_out, do_compression=True)

print(f"Wrote {N} entries to {RESIZED_GT}")


Wrote 20000 entries to datasets/SynthText_sub_resized_TESTIN\gt.mat
