Extract the zip file

In [None]:
import zipfile  # import the standard library module for handling ZIP archives

# Archive that ships the dataset, and the folder its contents are unpacked into.
zip_path = 'ocr_dataset.zip'
tmp_dir = 'ocr_dataset'

# The context manager closes the archive once everything has been written to tmp_dir.
with zipfile.ZipFile(file=zip_path, mode='r') as archive:
    archive.extractall(path=tmp_dir)

In [None]:
import random  # random number utilities for reproducibility
from pathlib import Path  # object-oriented filesystem paths

import torch  # PyTorch main package
import torch.nn as nn  # neural network modules and layers
import torch.optim as optim  # optimization algorithms (e.g., SGD, Adam)
from torch.utils.data import DataLoader, random_split  # data loading and dataset splitting utilities
from torchvision import datasets, transforms  # vision datasets and transform helpers
from torchvision.transforms import InterpolationMode  # interpolation modes for Resize
from PIL import Image  # Pillow image loading/manipulation
import numpy as np  # numerical arrays and utilities

In [None]:
# Filesystem locations used by the rest of the notebook.
MODEL_PATH = "ocr_cnn.pth"             # checkpoint file written by training and read back for inference
DATA_DIR = "ocr_dataset/ocr_dataset"   # root passed to ImageFolder (one sub-folder per class)

In [None]:
# --- Reproducibility ---
seed = 42                # shared seed for the Python, NumPy and PyTorch RNGs

# --- Data ---
image_size = 128         # images are resized to image_size x image_size
validation_split = 0.2   # fraction of the dataset held out for validation
num_classes = 10         # one output class per digit 0-9

# --- Loading ---
batch_size = 32          # samples per batch
num_workers = 2          # DataLoader worker processes

# --- Optimisation ---
learning_rate = 8e-5     # Adam learning rate
num_epochs = 50          # number of passes over the training split

In [None]:
# Seed every RNG the notebook draws from with the same value.
random.seed(seed)        # Python's built-in RNG
torch.manual_seed(seed)  # PyTorch RNG
np.random.seed(seed)     # NumPy global RNG (kept last: it returns None, so the cell shows no output)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)  # echo the choice so the run log records it

Device: cuda


In [None]:
# Random augmentations. They are part of `transform` itself, so they run on every
# sample drawn through this pipeline, whichever dataset or function it is passed to.
augmentation_steps = [
    transforms.RandomRotation((-45, 45)),                       # rotate by a random angle in [-45, 45] degrees
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1),
                            scale=(0.9, 1.1), shear=10),        # random shift / zoom / shear, no extra rotation
    transforms.ColorJitter(brightness=0.2, contrast=0.2),       # random brightness and contrast change
]

# Deterministic preprocessing that turns a PIL image into a normalised 1-channel tensor.
preprocessing_steps = [
    transforms.Grayscale(num_output_channels=1),                # collapse to a single channel
    transforms.Resize((image_size, image_size),
                      interpolation=InterpolationMode.BILINEAR),  # fixed square input size
    transforms.ToTensor(),                                      # PIL image -> float tensor in [0, 1]
    transforms.Normalize((0.5,), (0.5,)),                       # (x - 0.5) / 0.5 -> roughly [-1, 1]
]

transform = transforms.Compose(augmentation_steps + preprocessing_steps)

In [None]:
# Build the dataset from the class-per-folder layout under DATA_DIR; every sample
# goes through `transform` when it is read.
dataset = datasets.ImageFolder(DATA_DIR, transform=transform)
num_total = len(dataset)

# Fail early with a readable message if the folder yielded nothing.
if not num_total:
    raise RuntimeError(f"No images found in {DATA_DIR}. Make sure dataset is structured correctly.")

# Summarise what was discovered (sample count plus the class names ImageFolder found).
print(f"Found {num_total} images across {len(dataset.classes)} classes: {dataset.classes}")

Found 5000 images across 10 classes: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [None]:
val_size = int(validation_split * num_total)  # samples held out for validation
train_size = num_total - val_size             # everything else is used for training

# Give the split its own seeded generator: re-running this cell then reproduces the
# same partition instead of reshuffling, which would move training images into the
# validation subset of an already-trained model.
split_generator = torch.Generator().manual_seed(seed)
train_dataset, val_split = random_split(dataset, [train_size, val_size],
                                        generator=split_generator)

# `dataset` applies the random augmentations in `transform` to every sample it
# returns. Validation must be measured on un-augmented images, so the validation
# indices are re-pointed at a second ImageFolder that only does the deterministic
# preprocessing (keep these steps in sync with the tail of `transform`).
eval_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])
eval_dataset = datasets.ImageFolder(root=DATA_DIR, transform=eval_transform)
# Both folders must enumerate the files identically for the indices to line up.
assert eval_dataset.samples == dataset.samples, "ImageFolder listings differ; split indices would not match"
val_dataset = torch.utils.data.Subset(eval_dataset, val_split.indices)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=num_workers)  # training batches, reshuffled every epoch
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        num_workers=num_workers)    # validation batches, fixed order

In [None]:
class OCR_CNN(nn.Module):
    """Small CNN classifier for single-channel images.

    Three convolutional stages (each ending in a 2x2 max-pool) feed an adaptive
    average pool that always emits a 256 x 4 x 4 map, followed by a two-layer
    fully connected head that returns raw class logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # (in_channels, out_channels) for each 3x3 conv, or None for a 2x2 max-pool.
        # The order fixes the Sequential indices and therefore the state-dict keys.
        stage_plan = [
            (1, 32), (32, 64), None,        # stage 1
            (64, 128), (128, 128), None,    # stage 2
            (128, 256), None,               # stage 3
        ]

        feature_layers = []
        for spec in stage_plan:
            if spec is None:
                # halve both spatial dimensions
                feature_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            in_channels, out_channels = spec
            # kernel 3 / stride 1 / padding 1 leaves height and width unchanged
            feature_layers.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            feature_layers.append(nn.ReLU(inplace=True))

        # Fixed 4x4 output regardless of the incoming spatial size.
        feature_layers.append(nn.AdaptiveAvgPool2d((4, 4)))
        self.features = nn.Sequential(*feature_layers)

        flat_features = 256 * 4 * 4  # channels x height x width after the adaptive pool
        head_layers = [
            nn.Flatten(),
            nn.Linear(flat_features, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            # No softmax: the head returns logits, which is what CrossEntropyLoss consumes.
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        return self.classifier(self.features(x))

In [None]:
# Instantiate the network, then move its parameters to the selected device.
model = OCR_CNN(num_classes=num_classes)
model = model.to(device)
print(model)  # show the layer-by-layer layout

OCR_CNN(
  (features): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (13): AdaptiveAvgPool2d(output_size=(4, 4))
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=4096, out_features=512, bias=True)
    (2

In [None]:
# Adam over all model parameters, with the learning rate from the config cell.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
# Classification loss computed directly on the model's raw logits.
criterion = nn.CrossEntropyLoss()

In [None]:
def compute_accuracy(model, data_loader, device, loss_fn=None):
    """Evaluate ``model`` on every batch of ``data_loader``.

    Args:
        model: network to evaluate; it is put in eval mode for the duration of
            the call and returned to whatever mode it was in beforehand.
        data_loader: iterable yielding ``(images, labels)`` batches.
        device: device the batches are moved to before the forward pass.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning the mean batch
            loss. When omitted, the notebook-level ``criterion`` is used.

    Returns:
        ``(avg_loss, acc)``: the per-sample average loss and the accuracy in percent.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    if loss_fn is None:
        loss_fn = criterion  # fall back to the loss defined alongside the optimizer

    was_training = model.training  # remember the caller's mode so it can be restored
    model.eval()  # disable dropout etc. while measuring

    correct = 0
    total = 0
    running_loss = 0.0
    try:
        with torch.no_grad():  # no gradients needed for evaluation
            for images, labels in data_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                batch_loss = loss_fn(outputs, labels)
                # The loss is a batch mean; weight it by batch size so a short
                # final batch does not skew the dataset-level average.
                running_loss += batch_loss.item() * images.size(0)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        # Restore the original mode instead of unconditionally forcing train mode.
        model.train(was_training)

    if total == 0:
        raise ValueError("data_loader yielded no samples; cannot compute loss/accuracy")

    avg_loss = running_loss / total
    acc = 100.0 * correct / total
    return avg_loss, acc

In [None]:
# Best validation result seen so far and the (1-based) epoch that produced it.
best_val_acc = 0.0
best_epoch = -1

for epoch in range(num_epochs):
    model.train()          # make sure dropout is active for the training pass
    running_loss = 0.0     # sum of per-batch losses this epoch
    total_batches = 0      # stays 0 if the loader yields nothing

    # enumerate(start=1) leaves total_batches equal to the number of batches processed
    for total_batches, (images, labels) in enumerate(train_loader, start=1):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()              # drop gradients left over from the previous step
        outputs = model(images)            # logits for this batch
        loss = criterion(outputs, labels)
        loss.backward()                    # compute gradients
        optimizer.step()                   # apply the update

        running_loss += loss.item()

    train_loss = running_loss / total_batches  # mean of the per-batch losses
    val_loss, val_acc = compute_accuracy(model, val_loader, device)

    print(f"Epoch [{epoch+1}/{num_epochs}]  Train Loss: {train_loss:.4f}  Val Loss: {val_loss:.4f}  Val Acc: {val_acc:.2f}%")

    # Checkpoint only on a strict improvement in validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_epoch = epoch + 1
        best_state = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'val_loss': val_loss,
        }
        torch.save(best_state, MODEL_PATH)
        print(f"Saved best model (val_acc={val_acc:.2f}%) to {MODEL_PATH}")

print(f"Training finished. Best val acc: {best_val_acc:.2f}% at epoch {best_epoch}")

Epoch [1/50]  Train Loss: 2.3033  Val Loss: 2.3036  Val Acc: 11.70%
Saved best model (val_acc=11.70%) to ocr_cnn.pth
Epoch [2/50]  Train Loss: 2.3015  Val Loss: 2.2990  Val Acc: 12.40%
Saved best model (val_acc=12.40%) to ocr_cnn.pth
Epoch [3/50]  Train Loss: 2.2571  Val Loss: 2.2165  Val Acc: 17.90%
Saved best model (val_acc=17.90%) to ocr_cnn.pth
Epoch [4/50]  Train Loss: 2.1891  Val Loss: 2.1391  Val Acc: 21.80%
Saved best model (val_acc=21.80%) to ocr_cnn.pth
Epoch [5/50]  Train Loss: 2.0602  Val Loss: 1.9145  Val Acc: 33.80%
Saved best model (val_acc=33.80%) to ocr_cnn.pth
Epoch [6/50]  Train Loss: 1.7671  Val Loss: 1.6040  Val Acc: 50.00%
Saved best model (val_acc=50.00%) to ocr_cnn.pth
Epoch [7/50]  Train Loss: 1.5185  Val Loss: 1.2815  Val Acc: 59.80%
Saved best model (val_acc=59.80%) to ocr_cnn.pth
Epoch [8/50]  Train Loss: 1.3125  Val Loss: 1.1469  Val Acc: 64.40%
Saved best model (val_acc=64.40%) to ocr_cnn.pth
Epoch [9/50]  Train Loss: 1.1959  Val Loss: 1.0154  Val Acc: 68.

In [None]:
# Read the checkpoint written during training and restore its weights into `model`.
checkpoint = torch.load(MODEL_PATH, map_location=device)
best_weights = checkpoint['model_state_dict']
model.load_state_dict(best_weights)
model.to(device)  # keep the restored model on the active device

# Re-measure the restored weights on the validation loader and report the result.
final_val_loss, final_val_acc = compute_accuracy(model, val_loader, device)
print(f"Final loaded model validation acc: {final_val_acc:.2f}%  loss: {final_val_loss:.4f}")

Final loaded model validation acc: 95.40%  loss: 0.1514


In [None]:
softmax = nn.Softmax(dim=1)  # turns a batch of logits into per-class probabilities


def predict_image(img_path_or_pil, model_path=MODEL_PATH, device=device, transform=None):
    """Classify a single image with the checkpoint stored at ``model_path``.

    Args:
        img_path_or_pil: path (``str`` or ``Path``) to an image file, or an
            already opened ``PIL.Image.Image``.
        model_path: checkpoint file containing a ``'model_state_dict'`` entry.
            It is re-read on every call.
        device: device used for the forward pass.
        transform: preprocessing applied to the image. When omitted, a
            deterministic grayscale / resize / normalise pipeline is used, so
            repeated calls on the same image give the same answer. Pass a
            transform explicitly to override it (e.g. for test-time augmentation).

    Returns:
        ``(pred, probs)``: the predicted class index and a NumPy array with one
        probability per class.

    Raises:
        ValueError: if ``img_path_or_pil`` is neither a path nor a PIL image.
    """
    if transform is None:
        # Inference must not go through the random rotation / affine / jitter
        # steps of the notebook-level `transform`; use only the deterministic
        # preprocessing (keep in sync with the tail of that pipeline).
        transform = transforms.Compose([
            transforms.Grayscale(num_output_channels=1),
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BILINEAR),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
        ])

    # Resolve the input first so a bad argument fails before the checkpoint is read.
    # ImageFolder's default loader hands RGB images to the transform; convert here
    # as well so inference sees the same kind of input as training did.
    if isinstance(img_path_or_pil, (str, Path)):
        # convert() returns a fully loaded copy, so the file can be closed right away
        with Image.open(img_path_or_pil) as opened:
            img = opened.convert("RGB")
    elif isinstance(img_path_or_pil, Image.Image):
        img = img_path_or_pil.convert("RGB")
    else:
        raise ValueError("img_path_or_pil should be path or PIL.Image.Image")

    # Build a fresh model and load the saved weights into it.
    chk = torch.load(model_path, map_location=device)
    model = OCR_CNN(num_classes=num_classes)
    model.load_state_dict(chk['model_state_dict'])
    model.to(device)
    model.eval()  # disable dropout for inference

    # Preprocess and add the batch dimension the model expects.
    x = transform(img).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(x)
        probs = softmax(logits).cpu().numpy()[0]  # probabilities of the single batch element
        pred = int(np.argmax(probs))

    return pred, probs

In [None]:
def quick_val_check(n=8):
    """Print the saved model's prediction for the first ``n`` validation samples.

    Each validation index is mapped back to its file in ``dataset.samples`` and
    the file is passed to ``predict_image``, which needs a path or PIL image
    rather than the tensor the dataset would return.

    Args:
        n: maximum number of samples to show; fewer are shown if the validation
            subset is smaller, none if ``n`` is zero or negative.
    """
    # Kept from the original cell: leaves the notebook-level model in eval mode.
    # predict_image loads its own copy of the model from the checkpoint.
    model.eval()
    # val_dataset.indices holds the positions of the validation samples in the
    # original dataset; no sample is fetched from val_dataset itself, which would
    # decode and transform an image only to discard it.
    for orig_idx in val_dataset.indices[:max(n, 0)]:
        img_path, true_label = dataset.samples[orig_idx]  # original file path and label
        pred, probs = predict_image(img_path)
        print(f"Image: {img_path}  True: {true_label}  Pred: {pred}  Prob: {probs[pred]:.3f}")
        

In [None]:
quick_val_check(n=8)  # spot-check the first 8 validation samples

Image: ocr_dataset/ocr_dataset/2/0003_637126272186778260_0480.jpg  True: 2  Pred: 2  Prob: 1.000
Image: ocr_dataset/ocr_dataset/5/0006_637126267907833540_0747.jpg  True: 5  Pred: 5  Prob: 0.997
Image: ocr_dataset/ocr_dataset/5/0006_637126271661867598_0332.jpg  True: 5  Pred: 5  Prob: 0.989
Image: ocr_dataset/ocr_dataset/6/0007_637126271856521117_0007.jpg  True: 6  Pred: 6  Prob: 1.000
Image: ocr_dataset/ocr_dataset/6/0007_637126272222403482_0633.jpg  True: 6  Pred: 6  Prob: 0.992
Image: ocr_dataset/ocr_dataset/1/0002_637126268996325608_0499.jpg  True: 1  Pred: 1  Prob: 0.969
Image: ocr_dataset/ocr_dataset/6/0007_637126266032878368_0705.jpg  True: 6  Pred: 6  Prob: 0.770
Image: ocr_dataset/ocr_dataset/7/0008_637126267531646744_0293.jpg  True: 7  Pred: 7  Prob: 1.000


In [None]:
# Example: classify one image of class 9. The path is built from DATA_DIR so the
# cell does not depend on an absolute, machine-specific location.
predict_image(Path(DATA_DIR) / "9" / "0010_637126265594351740_0713.jpg")

(9,
 array([4.6273966e-12, 1.6856518e-03, 3.6896169e-10, 2.4738334e-14,
        8.4351526e-10, 4.4085520e-15, 4.8951427e-17, 2.7166262e-11,
        3.7355444e-14, 9.9831438e-01], dtype=float32))

In [None]:
# predict_image('/content/bengali-numbers-1-design-set-vector.jpg')  # example of how to call predict_image with another image (kept commented)

(1,
 array([0.3723848 , 0.45614922, 0.02515879, 0.09946348, 0.00751282,
        0.01782199, 0.01123653, 0.00365526, 0.00191113, 0.00470609],
       dtype=float32))