# Setup

In [12]:
# =========================
# CLEAN + PINNED ENV (Colab)
# =========================
!pip -q uninstall -y tf-keras tensorflow-text tensorflow-decision-forests tensorflow==2.19.*  >/dev/null 2>&1

# Upgrade core tooling first
!pip -q install -U pip setuptools wheel >/dev/null

# Install a consistent stack:
# - numpy 2.x (required by OpenCV 4.12 and thinc)
# - tensorflow 2.20 (works with numpy 2.x; bundled tf.keras; no external tf-keras)
# - opencv-python-headless 4.12 (depends on numpy >=2)
!pip -q install "numpy>=2.0,<2.3" "tensorflow==2.20.0" "opencv-python-headless==4.12.0.88"

# Sanity check
import numpy, tensorflow as tf, cv2, sys
print("Python  :", sys.version)
print("NumPy   :", numpy.__version__)
print("TF      :", tf.__version__)
print("OpenCV  :", cv2.__version__)

# IMPORTANT: Colab may need a restart after binary upgrades.
print("\nIf you see old versions above, do: Runtime ▶ Restart runtime, then run all."

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m120.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m123.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf-keras 2.19.0 requires tensorflow<2.20,>=2.19, but you have tensorflow 2.20.0 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4

# Config

In [13]:
# =========================================
# SECTION 1 — Config
# =========================================
# --- Image canvas ---
# IMG_H: normalized height, MAX_W: maximum (padded) width, CHANNELS: grayscale.
IMG_H, MAX_W, CHANNELS = 32, 200, 1

# --- Model/time steps ---
# Two max-pools along the width leave width/4 time steps for the sequence model.
TIME_STEPS = MAX_W // 4

# --- Training ---
BATCH, EPOCHS = 64, 15
OVERFIT_TINY = 128  # set to 0 to skip tiny-overfit sanity test

# --- Alphabet — start simple; add only what you truly need ---
ALPHABET = string.digits + string.ascii_lowercase
char2idx = dict(zip(ALPHABET, range(len(ALPHABET))))  # character -> class id
idx2char = dict(enumerate(ALPHABET))                  # class id  -> character
N_CLASSES = len(ALPHABET) + 1                         # alphabet + CTC blank
BLANK = N_CLASSES - 1                                 # blank is the last class id

def text_to_ids(t: str) -> np.ndarray:
    """Encode `t` as a 1-D int32 array of class ids.

    Characters that are not keys of `char2idx` are skipped silently, so the
    result can be shorter than `t` (and empty).

    Note: a later cell defines `text_to_ids` again (returning a plain list);
    after a top-to-bottom run that later definition is the one in effect.
    """
    known_ids = (char2idx[ch] for ch in t if ch in char2idx)
    return np.fromiter(known_ids, dtype=np.int32)

def ids_to_text(ids: List[int]) -> str:
    """Decode class ids back into a string via `idx2char`.

    Raises KeyError for an id that is not in `idx2char` (for example the CTC
    blank), because the lookup is a plain dictionary index.

    Note: a later cell defines `ids_to_text` again with a lenient `.get`
    lookup; after a top-to-bottom run that later definition is in effect.
    """
    return "".join(map(idx2char.__getitem__, ids))


# Image Preprocessing

In [None]:
# =========================================
# SECTION 2 — Image Preprocessing
# (Keep non-inverted first; you can try inversion later)
# =========================================
def load_gray(path: str) -> np.ndarray:
    """Load the image at `path` as a single-channel grayscale array.

    The file is decoded directly in grayscale mode (`cv2.IMREAD_GRAYSCALE`);
    there is no separate colour-conversion step.

    Raises:
        FileNotFoundError: when `cv2.imread` returns None for `path`.
    """
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        return gray
    raise FileNotFoundError(path)

def prep_np_image(np_img_gray: np.ndarray) -> np.ndarray:
    """Resize to height IMG_H, fit to width MAX_W, and scale to [0, 1].

    The width follows the height scale factor but is clamped to
    [16, MAX_W] *before* resizing, so an over-wide image is squeezed to
    MAX_W by the resize rather than cropped. Narrower results are
    right-padded with white (255). No polarity inversion is applied.

    Returns a float32 array of shape (IMG_H, MAX_W, 1).

    Note: a later cell redefines `prep_np_image` with inverted polarity;
    after a top-to-bottom run that later definition is the one in effect.
    """
    src_h, src_w = np_img_gray.shape[:2]
    ratio = IMG_H / max(src_h, 1)            # guard against zero height
    target_w = max(16, int(math.ceil(src_w * ratio)))
    target_w = min(MAX_W, target_w)
    resized = cv2.resize(np_img_gray, (target_w, IMG_H), interpolation=cv2.INTER_CUBIC)

    if target_w < MAX_W:
        filler = np.full((IMG_H, MAX_W - target_w), 255, np.uint8)
        resized = np.hstack([resized, filler])
    else:
        resized = resized[:, :MAX_W]

    scaled = resized.astype(np.float32) / 255.0  # no inversion initially
    return scaled[..., None]                     # shape (H, W, 1)


# Load Synth90k

In [4]:
# Hugging Face mirror of Synth90k / MJSynth.
# Only the first 10,000 training and 2,000 validation rows are taken so the
# demo trains quickly.
train_ds_hf, val_ds_hf = (
    load_dataset("priyank-m/MJSynth_text_recognition", split=subset)
    for subset in ("train[:10000]", "val[:2000]")
)

print(train_ds_hf, val_ds_hf)  # sanity

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/25 [00:00<?, ?files/s]

data/train-00000-of-00025-e0800a94f785d3(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00001-of-00025-095e9ac2cb5f0f(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00002-of-00025-fb450bf0c15eb4(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00003-of-00025-3303678227724c(…):   0%|          | 0.00/392M [00:00<?, ?B/s]

data/train-00004-of-00025-93158a416467d8(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00005-of-00025-0b57df213bf403(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00006-of-00025-244aba71596889(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00007-of-00025-a422424296a3c5(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00008-of-00025-caf6f054c55c15(…):   0%|          | 0.00/392M [00:00<?, ?B/s]

data/train-00009-of-00025-6ea1326c567bf9(…):   0%|          | 0.00/392M [00:00<?, ?B/s]

data/train-00010-of-00025-2483680f50dd94(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00011-of-00025-852a265caa0bfa(…):   0%|          | 0.00/392M [00:00<?, ?B/s]

data/train-00012-of-00025-c994e1aa7532d3(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00013-of-00025-25c6c8c92221aa(…):   0%|          | 0.00/392M [00:00<?, ?B/s]

data/train-00014-of-00025-1cfb1a4e0fafa0(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00015-of-00025-ab994b71db0115(…):   0%|          | 0.00/391M [00:00<?, ?B/s]

data/train-00016-of-00025-0cd115e0a310a5(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00017-of-00025-6c1d9981987212(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00018-of-00025-9285594ebc5163(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00019-of-00025-2e61fcbe6c2d0b(…):   0%|          | 0.00/392M [00:00<?, ?B/s]

data/train-00020-of-00025-8189068fe77cf3(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00021-of-00025-9ce3e12866abb7(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00022-of-00025-a39519f2abb035(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00023-of-00025-013e7f60ef13cb(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/train-00024-of-00025-ab890d4fa257f4(…):   0%|          | 0.00/393M [00:00<?, ?B/s]

data/test-00000-of-00003-93cdf7f5b5af49e(…):   0%|          | 0.00/403M [00:00<?, ?B/s]

data/test-00001-of-00003-1004494b2249dc5(…):   0%|          | 0.00/404M [00:00<?, ?B/s]

data/test-00002-of-00003-e00f940e1c5af5b(…):   0%|          | 0.00/403M [00:00<?, ?B/s]

data/val-00000-of-00003-1e8934522df1dd51(…):   0%|          | 0.00/363M [00:00<?, ?B/s]

data/val-00001-of-00003-e0f0985559c5db2f(…):   0%|          | 0.00/364M [00:00<?, ?B/s]

data/val-00002-of-00003-eb1a40d5ce584445(…):   0%|          | 0.00/363M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7224600 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/891924 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/802733 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Dataset({
    features: ['image', 'label'],
    num_rows: 10000
}) Dataset({
    features: ['image', 'label'],
    num_rows: 2000
})


# Preprocess (grayscale → fixed height IMG_H=32, right-padded to width MAX_W=200)

In [5]:
def prep_np_image(np_img_gray):
    """Resize to height IMG_H, right-pad to MAX_W, normalize and invert.

    `np_img_gray` is expected to be an HxW uint8 grayscale array (per the
    original author's note). The width is clamped to [16, MAX_W] before the
    resize; narrower results are padded on the right with white (255).
    After normalization to [0, 1] the image is inverted (1 - x), so the
    white padding ends up as 0.

    Returns a float32 array of shape (IMG_H, MAX_W, 1).
    """
    height, width = np_img_gray.shape[:2]
    factor = IMG_H / max(height, 1)
    scaled_w = max(16, int(np.ceil(width * factor)))
    out_w = min(scaled_w, MAX_W)
    canvas = cv2.resize(np_img_gray, (out_w, IMG_H), interpolation=cv2.INTER_CUBIC)

    # right-pad to MAX_W (white)
    missing = MAX_W - canvas.shape[1]
    if missing > 0:
        canvas = np.hstack([canvas, np.full((IMG_H, missing), 255, np.uint8)])
    else:
        canvas = canvas[:, :MAX_W]

    # normalize to [0,1]; invert sometimes helps Synth90k-like renders
    inverted = 1.0 - (canvas.astype(np.float32) / 255.0)
    return inverted[..., None]  # (H, W, 1)

def prepare_numpy(split):
    """Convert a dataset split into a stacked image array and a label list.

    Each example's "image" is converted to 8-bit grayscale ("L" mode), run
    through `prep_np_image`, and paired with its "label". Returns
    `(images, labels)` where `images` is the `np.stack` of all preprocessed
    images and `labels` is a list in the same order.
    """
    pairs = [
        (prep_np_image(np.array(example["image"].convert("L"))), example["label"])
        for example in split
    ]
    images = [image for image, _ in pairs]
    labels = [label for _, label in pairs]
    return np.stack(images), labels

# Materialize both subsets as (image array, label list) pairs.
(X_train, y_train), (X_val, y_val) = (
    prepare_numpy(train_ds_hf),
    prepare_numpy(val_ds_hf),
)

print("Train:", X_train.shape, "Val:", X_val.shape)

Train: (10000, 32, 256, 1) Val: (2000, 32, 256, 1)


# Labels → integer ids (CTC blank = last; no padding with 0)

In [6]:
def text_to_ids(s: str):
    """Map each character of `s` to its class id, as a plain list.

    Characters missing from `char2idx` are dropped without error, so the
    returned list may be shorter than `s`.
    """
    ids = []
    for ch in s:
        if ch in char2idx:
            ids.append(char2idx[ch])
    return ids

def ids_to_text(seq):
    """Turn a sequence of class ids into text.

    Ids that are not keys of `idx2char` contribute an empty string, so they
    are skipped rather than raising.
    """
    chars = []
    for class_id in seq:
        chars.append(idx2char.get(class_id, ""))
    return "".join(chars)

# Labels stay variable-length here; fixed-length padding is not needed in
# memory because the tf.data pipeline pads each batch (with -1) later on.
train_labels_ids = [np.array(ids, dtype=np.int32) for ids in map(text_to_ids, y_train)]
val_labels_ids = [np.array(ids, dtype=np.int32) for ids in map(text_to_ids, y_val)]

# Sequence length seen by the RNN: two (2,2) pools along the width -> MAX_W // 4.
TIME_STEPS = MAX_W // 4


# tf.data pipelines (repeat + explicit steps to avoid “ran out of data”)

In [7]:
def make_ds(X, Y_ids, batch, shuffle=False):
    """Build an endlessly repeating, batched tf.data pipeline for CTC training.

    Args:
        X: indexable collection of preprocessed images, each (IMG_H, MAX_W, 1).
        Y_ids: per-sample integer label arrays (variable length).
        batch: batch size; incomplete final batches are dropped.
        shuffle: when True, shuffle with a buffer as large as the dataset and
            reshuffle on every pass.

    Returns:
        A dataset yielding `(features, dummy_target)` where `features` has
        the keys "image", "label", "input_length" and "label_length". Labels
        are padded to the longest label in the batch with -1. The dataset
        repeats forever, so callers must pass explicit step counts.
    """
    def example_stream():
        # One (features, dummy target) pair per sample; the target is unused
        # because the loss is computed inside the model graph.
        for image, label in zip(X, Y_ids):
            features = {
                "image": image.astype(np.float32),
                "label": label.astype(np.int32),
                "input_length": np.array([TIME_STEPS], np.int32),
                "label_length": np.array([len(label)], np.int32),
            }
            yield features, np.zeros((1,), np.float32)  # dummy target

    feature_spec = {
        "image":        tf.TensorSpec([IMG_H, MAX_W, 1], tf.float32),
        "label":        tf.TensorSpec([None],            tf.int32),
        "input_length": tf.TensorSpec([1],               tf.int32),
        "label_length": tf.TensorSpec([1],               tf.int32),
    }
    target_spec = tf.TensorSpec([1], tf.float32)

    ds = tf.data.Dataset.from_generator(
        example_stream,
        output_signature=(feature_spec, target_spec),
    )
    if shuffle:
        ds = ds.shuffle(buffer_size=len(Y_ids), reshuffle_each_iteration=True)

    batch_shapes = (
        {
            "image":        [IMG_H, MAX_W, 1],
            "label":        [None],      # ragged -> padded
            "input_length": [1],
            "label_length": [1],
        },
        [1],
    )
    # Labels are padded with -1 (not 0) so padding never collides with a
    # real class id.
    fill_values = (
        {
            "image":        tf.constant(0.0, tf.float32),
            "label":        tf.constant(-1,  tf.int32),
            "input_length": tf.constant(0,   tf.int32),
            "label_length": tf.constant(0,   tf.int32),
        },
        tf.constant(0.0, tf.float32),
    )
    ds = ds.padded_batch(
        batch,
        padded_shapes=batch_shapes,
        padding_values=fill_values,
        drop_remainder=True,
    )
    return ds.repeat().prefetch(tf.data.AUTOTUNE)

# Only the training stream is shuffled; validation keeps dataset order.
train_ds = make_ds(X_train, train_labels_ids, batch=BATCH, shuffle=True)
val_ds = make_ds(X_val, val_labels_ids, batch=BATCH)

# The pipelines repeat forever, so the number of full batches per pass is
# given to Keras explicitly.
steps_per_epoch, validation_steps = (
    len(label_set) // BATCH for label_set in (train_labels_ids, val_labels_ids)
)
steps_per_epoch, validation_steps

(156, 31)

# Model: CNN → BiLSTM → Dense (softmax). CTC loss inside graph

In [8]:
from tensorflow.keras import layers, models, backend as K

# Inputs: the image plus the three tensors the in-graph CTC loss needs.
image_in = layers.Input(shape=(IMG_H, MAX_W, 1), dtype="float32", name="image")
label_in = layers.Input(shape=(None,), dtype="int32", name="label")
input_len_in = layers.Input(shape=(1,), dtype="int32", name="input_length")
label_len_in = layers.Input(shape=(1,), dtype="int32", name="label_length")

# CNN backbone: two conv+pool stages (each pool halves height and width),
# followed by one more convolution.
x = layers.Conv2D(filters=64, kernel_size=3, padding="same", activation="relu")(image_in)
x = layers.MaxPool2D(pool_size=(2, 2))(x)      # W/2
x = layers.Conv2D(filters=128, kernel_size=3, padding="same", activation="relu")(x)
x = layers.MaxPool2D(pool_size=(2, 2))(x)      # W/4
x = layers.Conv2D(filters=256, kernel_size=3, padding="same", activation="relu")(x)

# Make width the leading (time) axis, then flatten height*channels per step.
x = layers.Permute(dims=(2, 1, 3))(x)          # (B, W', H', C)
x = layers.TimeDistributed(layers.Flatten())(x)

# Sequence modeling: two stacked bidirectional LSTMs.
x = layers.Bidirectional(layers.LSTM(units=128, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(units=128, return_sequences=True))(x)

# Per-time-step class scores, then softmax kept in float32 for numerical
# stability with CTC.
logits = layers.Dense(units=N_CLASSES, activation="linear")(x)
y_pred = layers.Activation(activation="softmax", dtype="float32", name="softmax")(logits)

# CTC loss layer
def ctc_loss_layer(args):
    """Compute the per-sample CTC cost from the four tensors packed in `args`.

    `args` is `[labels, softmax_probs, input_lengths, label_lengths]`, in
    that order; they are forwarded unchanged to `K.ctc_batch_cost`.

    NOTE(review): the data pipeline pads labels with -1. This presumably
    relies on `label_lengths` telling the backend how many entries per row
    are real — confirm against the Keras backend documentation.
    """
    labels, probs, input_lengths, label_lengths = args
    return K.ctc_batch_cost(labels, probs, input_lengths, label_lengths)

# Wrap the CTC computation in a layer so the loss is produced by the graph.
loss_out = layers.Lambda(function=ctc_loss_layer, name="ctc_loss")(
    [label_in, y_pred, input_len_in, label_len_in]
)

# Training model: takes all four inputs and outputs the loss value itself.
train_model = models.Model(
    [image_in, label_in, input_len_in, label_len_in],
    loss_out,
)
# Prediction model: shares the same layers but maps image -> softmax only.
pred_model = models.Model(image_in, y_pred)

# The model output already *is* the loss, so the Keras loss simply passes
# it through and ignores the dummy target.
train_model.compile(
    loss={"ctc_loss": lambda y_true, y_pred: y_pred},
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

train_model.summary()

# Train

In [9]:
# Both datasets repeat forever, so the per-epoch step counts bound each pass.
history = train_model.fit(
    train_ds,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_ds,
    validation_steps=validation_steps,
    verbose=1,
)


Epoch 1/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 64ms/step - loss: 51.5660 - val_loss: 30.8708
Epoch 2/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 54ms/step - loss: 30.5390 - val_loss: 30.4240
Epoch 3/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - loss: 30.1254 - val_loss: 30.2385
Epoch 4/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - loss: 30.0223 - val_loss: 30.0675
Epoch 5/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - loss: 30.1000 - val_loss: 30.0224
Epoch 6/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - loss: 29.9356 - val_loss: 30.0227
Epoch 7/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step - loss: 29.9608 - val_loss: 29.9551
Epoch 8/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 57ms/step - loss: 29.7881 - val_loss: 29.8486
Epoch 9/1

# Inference utils (greedy decode) + Evaluation (word accuracy)

In [10]:
def greedy_decode(probs):  # probs: (1, T, N_CLASSES)
    """Best-path CTC decoding of the first (only) sequence in `probs`.

    Takes the arg-max class at every time step, collapses runs of identical
    consecutive classes to one, drops the blank class, and converts the
    remaining ids to text with `ids_to_text`.
    """
    best = np.argmax(probs[0], axis=-1)

    # A step is kept if it starts a new run of identical classes ...
    starts_run = np.ones(len(best), dtype=bool)
    starts_run[1:] = best[1:] != best[:-1]
    # ... and that class is not the CTC blank.
    keep = starts_run & (best != BLANK)

    return ids_to_text(best[keep].tolist())

# Run the prediction model once over the whole validation array (batched)
# instead of once per sample, then decode every sequence greedily.
val_probs = pred_model.predict(X_val, batch_size=BATCH, verbose=0)  # (N, T, N_CLASSES)
val_pred_text = [greedy_decode(val_probs[i:i + 1]) for i in range(len(val_probs))]

# The training targets were built with text_to_ids, which drops characters
# outside ALPHABET. Score against the same normalized text; comparing with
# the raw label could never match for labels containing such characters.
val_true_text = [ids_to_text(text_to_ids(raw)) for raw in y_val]

# Quick sample predictions
for i in random.sample(range(len(X_val)), 5):
    print("GT :", y_val[i])
    print("PD :", val_pred_text[i], "\n")

# Word accuracy on the (small) validation subset
correct = sum(pred == true for pred, true in zip(val_pred_text, val_true_text))
acc = correct / len(X_val)
print(f"Validation word accuracy: {acc:.3f}")


GT : Dashingly
PD : C 

GT : SLAVEHOLDER
PD : C 

GT : arsonist
PD : C 

GT : GIGGED
PD : C 

GT : catt
PD : C 

Validation word accuracy: 0.000


# Save prediction model to .h5

In [11]:
# Persist just the image -> softmax model; the training model with its extra
# CTC inputs is not needed for inference in your project.
pred_model_path = "ocr_crnn_synth90k_pred.h5"
pred_model.save(pred_model_path)
print("Saved:", pred_model_path)




Saved: ocr_crnn_synth90k_pred.h5
