In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import uuid
from ipynb.fs.defs.normalize_and_segment import normalize_and_segment
from utils import *
import torch, torchvision
import shutil

In [2]:
train_dir = os.getcwd() + "/../train"
train_data_dir = os.getcwd() + "/../train_chars"
val_dir = os.getcwd() + "/../test"
val_data_dir = "/../test_chars"
model_weights_path = os.path.join(os.getcwd(), '../model/char_cnn.pth')

In [3]:
def segment_captcha_into_characters(input_dir, output_dir):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    
    path_ext = "-0.png"
    files = [
        f for f in os.listdir(input_dir)
        if f.lower().endswith(path_ext)
    ]

    segment_fail_count = 0

    for filename in files:
        label = get_label(filename)
        img_path = os.path.join(input_dir, f"{label}-0.png")
        img = cv2.imread(img_path)
        if img is None:
            continue
        chars = normalize_and_segment(img)

        if len(chars) != len(label):
            segment_fail_count += 1
            continue
            
        for ch, ch_img in zip(label, chars):
            # Ensure grayscale uint8
            if ch_img.dtype == bool:
                ch_img = ch_img.astype("uint8") * 255
            elif ch_img.dtype in (np.float32, np.float64):
                ch_img = (ch_img * 255).clip(0, 255).astype("uint8")
            else:
                ch_img = ch_img.astype("uint8")
            # Ensure each output is 64x64 (just in case)
            ch_img = cv2.resize(ch_img, (64, 64), interpolation=cv2.INTER_AREA)

            unique_id = uuid.uuid4().hex[:8]
            out_name = f"{ch}/{unique_id}.png"
            os.makedirs(f"{output_dir}/{ch}", exist_ok=True)
            cv2.imwrite(os.path.join(output_dir, out_name), ch_img)

    failure_rate = segment_fail_count / len(files) * 100
    print(f"Finished creating dataset: {segment_fail_count} images wrongly segmented ({failure_rate}%)")



### Process CAPTCHAs into individual characters

In [4]:
# uncomment to run, takes a while
#segment_captcha_into_characters(train_dir, train_data_dir)
#segment_captcha_into_characters(val_dir, val_data_dir)

### Load Dataset

In [5]:
from torch.utils.data import Dataset

class CaptchaCharDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        """
        root_dir: directory with segmented images
        transform: torchvision transforms for data augmentation / tensor conversion
        """
        self.root_dir = root_dir
        self.transform = transform

        self.samples = []

        # Collect folder names (classes)
        classes = sorted(os.listdir(root_dir))
        self.char_to_idx = {c: i for i, c in enumerate(classes)}
        self.idx_to_char = {i: c for c, i in self.char_to_idx.items()}

        for c in classes:
            class_dir = os.path.join(root_dir, c)
            if not os.path.isdir(class_dir):
                continue

            for fname in os.listdir(class_dir):
                if fname.lower().endswith(".png"):
                    full_path = os.path.join(class_dir, fname)
                    self.samples.append((full_path, c))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, char_label = self.samples[idx]
        label = self.char_to_idx[char_label]

        img = cv2.imread(img_path, 0)  # grayscale
        if img is None:
            raise ValueError(f"Failed to load image: {img_path}")

        if self.transform:
            img = self.transform(img)
        else:
            img = torch.tensor(img, dtype=torch.float32).unsqueeze(0) / 255.

        return img, label

### Model

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharCNN(nn.Module):
    def __init__(self, num_classes):
        super(CharCNN, self).__init__()

        self.conv_block1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2)   # 64 -> 32
        )

        self.conv_block2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)   # 32 -> 16
        )

        self.conv_block3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2)   # 16 -> 8
        )

        self.fc1 = nn.Linear(128 * 8 * 8, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)

        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [8]:
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.optim as optim

transform = transforms.Compose([
    transforms.ToTensor(),
])

device = torch.device("cuda")

dataset = CaptchaCharDataset(train_data_dir, transform=transform)
train_loader = DataLoader(
    dataset,
    batch_size=512,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)

val_loader = DataLoader(dataset, batch_size=64, shuffle=False)


num_classes = len(dataset.char_to_idx)
model = CharCNN(num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# for epoch in range(10):
#     model.train()
#     total_loss = 0

#     for images, labels in train_loader:
#         images = images.to(device)
#         images = (images - 0.5) / 0.5  
#         labels = labels.to(device)

#         optimizer.zero_grad()
#         logits = model(images)
#         loss = criterion(logits, labels)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

# torch.save(model.state_dict(), model_weights_path)

Epoch 1: Loss = 0.8777
Epoch 2: Loss = 0.6935
Epoch 3: Loss = 0.6003
Epoch 4: Loss = 0.5353
Epoch 5: Loss = 0.4714
Epoch 6: Loss = 0.4247
Epoch 7: Loss = 0.3792
Epoch 8: Loss = 0.3377
Epoch 9: Loss = 0.3008
Epoch 10: Loss = 0.2550


### Evaluate

In [9]:
val_dataset = CaptchaCharDataset(val_data_dir, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [11]:
model.load_state_dict(torch.load(model_weights_path, map_location=device))  

<All keys matched successfully>

In [12]:
from collections import defaultdict

class_correct = defaultdict(int)
class_total = defaultdict(int)

with torch.no_grad():
    for imgs, labels in val_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        preds = outputs.argmax(dim=1)

        for label, pred in zip(labels, preds):
            label_char = val_dataset.idx_to_char[label.item()]
            class_total[label_char] += 1
            if pred.item() == label.item():
                class_correct[label_char] += 1

for c in val_dataset.char_to_idx.keys():
    acc = class_correct[c] / class_total[c] if class_total[c] > 0 else 0
    print(f"Class {c}: {acc*100:.2f}%")

Class 0: 14.55%
Class 1: 47.74%
Class 2: 82.50%
Class 3: 81.30%
Class 4: 80.41%
Class 5: 65.29%
Class 6: 82.68%
Class 7: 80.91%
Class 8: 74.06%
Class 9: 83.72%
Class a: 75.99%
Class b: 55.12%
Class c: 87.25%
Class d: 76.23%
Class e: 86.96%
Class f: 84.08%
Class g: 64.34%
Class h: 87.07%
Class i: 45.94%
Class j: 67.06%
Class k: 81.75%
Class l: 41.13%
Class m: 80.54%
Class n: 68.28%
Class o: 85.71%
Class p: 92.59%
Class q: 78.28%
Class r: 70.72%
Class s: 74.25%
Class t: 77.58%
Class u: 83.94%
Class v: 77.39%
Class w: 85.03%
Class x: 89.47%
Class y: 85.45%
Class z: 74.40%


In [None]:
def eval_img(img, label):
    chars = normalize_and_segment(img)

    predicted_text = ""
    for char_img in chars:
        tensor_img = transform(char_img).unsqueeze(0).to(device)  # add batch dim
        with torch.no_grad():
            output = model(tensor_img)
            pred_idx = output.argmax(dim=1).item()
            pred_char = val_dataset.idx_to_char[pred_idx]  # adjust if needed
            predicted_text += pred_char
    return predicted_text == label

In [18]:
test_dir = os.getcwd() + "/../test/"
test_files = os.listdir(test_dir)

total = 0
correct = 0

for img_path in test_files:
    label = get_label(img_path)
    img = get_img(label, folder="/../test/")
    if img is None: 
        continue
    if eval_img(img, label):
        correct += 1
    total += 1

print(correct, total, correct/total)


385 1999 0.19259629814907453
