In [50]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import uuid
from ipynb.fs.defs.normalize_and_segment import normalize_and_segment
from utils import *
import torch, torchvision
import shutil
from PIL import Image

In [14]:
# Dataset and checkpoint locations, all anchored at the notebook's working directory.
train_dir = os.getcwd() + "/../train"              # input folder for the training segmentation pass
train_data_dir = os.getcwd() + "/../train_chars"   # per-character crops, one sub-folder per class
val_dir = os.getcwd() + "/../test"                 # input folder for the validation segmentation pass
# Must carry the same os.getcwd() prefix as the other paths: the bare string
# "/../test_chars" resolves to the filesystem root, and
# segment_captcha_into_characters() calls shutil.rmtree() on this location.
val_data_dir = os.getcwd() + "/../test_chars"
model_weights_path = os.path.join(os.getcwd(), '../model/char_cnn.pth')

In [15]:
def segment_captcha_into_characters(input_dir, output_dir):
    """Split every ``*-0.png`` CAPTCHA in ``input_dir`` into per-character images.

    ``output_dir`` is deleted (if present) and recreated. Each character crop
    is written as a 64x64 uint8 PNG to ``output_dir/<char>/<random id>.png``,
    where ``<char>`` is the corresponding character of the label returned by
    ``get_label``.

    A CAPTCHA is counted as a segmentation failure, and skipped, when the
    number of segments returned by ``normalize_and_segment`` differs from the
    label length. Images that cannot be read are skipped silently.

    Args:
        input_dir: folder containing the CAPTCHA images.
        output_dir: folder to (re)create with one sub-folder per character.
    """
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    path_ext = "-0.png"
    files = [
        f for f in os.listdir(input_dir)
        if f.lower().endswith(path_ext)
    ]

    segment_fail_count = 0

    for filename in files:
        label = get_label(filename)
        # Read the file that was actually listed. Rebuilding the name from the
        # label can miss files that matched the case-insensitive filter above.
        img_path = os.path.join(input_dir, filename)
        img = cv2.imread(img_path)
        if img is None:
            continue
        chars = normalize_and_segment(img)

        if len(chars) != len(label):
            segment_fail_count += 1
            continue

        for ch, ch_img in zip(label, chars):
            # Ensure grayscale uint8
            if ch_img.dtype == bool:
                ch_img = ch_img.astype("uint8") * 255
            elif ch_img.dtype in (np.float32, np.float64):
                ch_img = (ch_img * 255).clip(0, 255).astype("uint8")
            else:
                ch_img = ch_img.astype("uint8")
            # Ensure each output is 64x64 (just in case)
            ch_img = cv2.resize(ch_img, (64, 64), interpolation=cv2.INTER_AREA)

            unique_id = uuid.uuid4().hex[:8]
            char_dir = os.path.join(output_dir, ch)
            os.makedirs(char_dir, exist_ok=True)
            cv2.imwrite(os.path.join(char_dir, f"{unique_id}.png"), ch_img)

    # Guard against an empty input folder, which would otherwise divide by zero.
    failure_rate = segment_fail_count / len(files) * 100 if files else 0.0
    print(f"Finished creating dataset: {segment_fail_count} images wrongly segmented ({failure_rate}%)")



### Process CAPTCHAs into individual characters

In [16]:
# Uncomment to regenerate the character datasets. This is slow, and it deletes and recreates the output folders.
#segment_captcha_into_characters(train_dir, train_data_dir)
#segment_captcha_into_characters(val_dir, val_data_dir)

### Load Dataset

In [51]:
from torch.utils.data import Dataset

class CaptchaCharDataset(Dataset):
    """Dataset of single-character images laid out as ``root_dir/<class>/<file>.png``.

    Each sub-directory of ``root_dir`` is one class; classes are indexed in
    sorted order of their directory names. ``char_to_idx`` and ``idx_to_char``
    expose the mapping between class names and integer labels.
    """

    def __init__(self, root_dir, transform=None):
        """
        root_dir: directory with segmented images
        transform: torchvision transforms for data augmentation / tensor conversion
        """
        self.root_dir = root_dir
        self.transform = transform

        self.samples = []

        # Only directories are classes. Stray files in root_dir (e.g. .DS_Store)
        # must not receive a class index, otherwise every label after them is
        # shifted and the class count is inflated.
        classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.char_to_idx = {c: i for i, c in enumerate(classes)}
        self.idx_to_char = {i: c for c, i in self.char_to_idx.items()}

        for c in classes:
            class_dir = os.path.join(root_dir, c)
            # Sorted so the sample order does not depend on filesystem listing order.
            for fname in sorted(os.listdir(class_dir)):
                if fname.lower().endswith(".png"):
                    full_path = os.path.join(class_dir, fname)
                    self.samples.append((full_path, c))

    def __len__(self):
        """Return the number of collected image files."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, class_index)``.

        With a transform, the image is handed to it as a PIL mode-'L' image.
        Without one, a float32 tensor of shape (1, H, W) scaled to [0, 1] is
        returned.

        Raises:
            ValueError: if the image file cannot be read.
        """
        img_path, char_label = self.samples[idx]
        label = self.char_to_idx[char_label]

        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise ValueError(f"Failed to load image: {img_path}")

        if self.transform:
            pil_img = Image.fromarray(img.astype('uint8'), mode='L')
            return self.transform(pil_img), label

        # Build the tensor from the NumPy array: the previous code converted to
        # a PIL image first and then passed that image to torch.tensor().
        tensor = torch.from_numpy(img.astype("float32")).unsqueeze(0) / 255.
        return tensor, label

### Model

In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharCNN(nn.Module):
    """Small CNN that maps a single-channel character image to class logits.

    ``features`` stacks three Conv(3x3) -> BatchNorm -> ReLU -> MaxPool(2)
    stages with 32, 64 and 128 output channels. ``classifier`` flattens the
    result and applies two 256-unit hidden layers (BatchNorm, ReLU, Dropout)
    followed by a linear layer with ``num_classes`` outputs.

    The first linear layer expects 128 * 8 * 8 inputs, i.e. a 64x64 input
    image after the three 2x poolings.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Build the convolutional stages in order so the layer indices inside
        # the Sequential stay 0..11.
        stages = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            stages.extend([
                nn.Conv2d(in_channels, out_channels, 3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ])
            in_channels = out_channels
        self.features = nn.Sequential(*stages)

        head = [
            nn.Flatten(),
            nn.Linear(128 * 8 * 8, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return unnormalised class scores for a batch of images ``x``."""
        return self.classifier(self.features(x))

In [73]:
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.optim as optim

# Convert each image to a float tensor; no augmentation is applied.
transform = transforms.Compose([
    transforms.ToTensor(),
])

# Fall back to the CPU when no CUDA device is available instead of failing
# on the first .to(device) call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset = CaptchaCharDataset(train_data_dir, transform=transform)
train_loader = DataLoader(
    dataset,
    batch_size=256,
    shuffle=True,
    num_workers=0,
    # Pinned host memory only helps host-to-GPU copies.
    pin_memory=(device.type == "cuda")
)

# One output unit per class folder found in the training set.
num_classes = len(dataset.char_to_idx)
model = CharCNN(num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [74]:
# Validation data uses the same tensor conversion as training and is read
# in a fixed order (no shuffling).
val_dataset = CaptchaCharDataset(root_dir=val_data_dir, transform=transform)
val_loader = DataLoader(dataset=val_dataset, batch_size=1024, shuffle=False)

In [75]:
# for epoch in range(50):
#     model.train()
#     total_loss = 0

#     for images, labels in train_loader:
#         images = images.to(device)
#         images = (images - 0.5) / 0.5  
#         labels = labels.to(device)

#         optimizer.zero_grad()
#         logits = model(images)
#         loss = criterion(logits, labels)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

# torch.save(model.state_dict(), model_weights_path)

from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Train and validation labels are integer indices into each dataset's own
# sorted class list; the reported validation metrics are only meaningful if
# both lists are identical.
assert val_dataset.char_to_idx == dataset.char_to_idx, \
    "train and validation sets must contain the same class folders"

# Size the output layer from the dataset rather than a hard-coded 36.
model = CharCNN(num_classes=num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()
# mode='max' because the scheduler is stepped with validation accuracy.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)
best_val_acc = 0.0
patience = 5      # epochs without improvement before early stopping
wait = 0          # epochs since the last improvement
num_epochs = 50

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(model_weights_path), exist_ok=True)

for epoch in range(1, num_epochs+1):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_correct = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size so the epoch
        # average is exact even when the last batch is smaller.
        train_loss += loss.item() * imgs.size(0)
        train_correct += (out.argmax(1) == labels).sum().item()

    train_loss /= len(dataset)
    train_acc = train_correct / len(dataset)

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    val_correct = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            out = model(imgs)
            loss = criterion(out, labels)
            val_loss += loss.item() * imgs.size(0)
            val_correct += (out.argmax(1) == labels).sum().item()
    val_loss /= len(val_dataset)
    val_acc = val_correct / len(val_dataset)

    # scheduler based on val accuracy
    scheduler.step(val_acc)

    print(f"Epoch {epoch}: train_loss={train_loss:.4f} train_acc={train_acc:.4f}  "
          f"val_loss={val_loss:.4f} val_acc={val_acc:.4f}")

    # early stopping & save best
    if val_acc > best_val_acc + 1e-4:
        best_val_acc = val_acc
        torch.save(model.state_dict(), model_weights_path)
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered.")
            break

Epoch 1: train_loss=2.6555 train_acc=0.4042  val_loss=1.9540 val_acc=0.6381
Epoch 2: train_loss=1.7931 train_acc=0.6332  val_loss=1.3879 val_acc=0.7196
Epoch 3: train_loss=1.3476 train_acc=0.7108  val_loss=1.0858 val_acc=0.7669
Epoch 4: train_loss=1.0899 train_acc=0.7521  val_loss=0.9021 val_acc=0.7846
Epoch 5: train_loss=0.9291 train_acc=0.7775  val_loss=0.8026 val_acc=0.7995
Epoch 6: train_loss=0.8144 train_acc=0.7980  val_loss=0.7246 val_acc=0.8094
Epoch 7: train_loss=0.7170 train_acc=0.8158  val_loss=0.6672 val_acc=0.8199
Epoch 8: train_loss=0.6409 train_acc=0.8354  val_loss=0.6470 val_acc=0.8181
Epoch 9: train_loss=0.5723 train_acc=0.8498  val_loss=0.6060 val_acc=0.8273
Epoch 10: train_loss=0.5161 train_acc=0.8636  val_loss=0.5896 val_acc=0.8280
Epoch 11: train_loss=0.4674 train_acc=0.8755  val_loss=0.5794 val_acc=0.8313
Epoch 12: train_loss=0.4152 train_acc=0.8882  val_loss=0.5591 val_acc=0.8306
Epoch 13: train_loss=0.3789 train_acc=0.8982  val_loss=0.5584 val_acc=0.8303
Epoch 14

### Evaluate

In [None]:
# Restore the checkpoint saved at model_weights_path, mapping tensors onto the active device.
checkpoint_state = torch.load(model_weights_path, map_location=device)
model.load_state_dict(checkpoint_state)

<All keys matched successfully>

In [76]:
from collections import defaultdict

class_correct = defaultdict(int)
class_total = defaultdict(int)

# Evaluation must run with dropout disabled and batch-norm using its running
# statistics. The model is in training mode by default when this cell is
# reached by loading weights on a fresh kernel without running the training loop.
model.eval()

with torch.no_grad():
    for imgs, labels in val_loader:
        preds = model(imgs.to(device)).argmax(dim=1).cpu()

        # Convert once per batch to plain ints rather than calling .item()
        # on every element.
        for label_idx, pred_idx in zip(labels.tolist(), preds.tolist()):
            label_char = val_dataset.idx_to_char[label_idx]
            class_total[label_char] += 1
            if pred_idx == label_idx:
                class_correct[label_char] += 1

# Report accuracy per class; classes with no validation samples print 0.
for c in val_dataset.char_to_idx.keys():
    acc = class_correct[c] / class_total[c] if class_total[c] > 0 else 0
    print(f"Class {c}: {acc*100:.2f}%")

Class 0: 64.93%
Class 1: 73.25%
Class 2: 88.75%
Class 3: 91.60%
Class 4: 81.22%
Class 5: 77.69%
Class 6: 90.48%
Class 7: 90.04%
Class 8: 85.71%
Class 9: 84.88%
Class a: 80.65%
Class b: 82.28%
Class c: 89.64%
Class d: 78.11%
Class e: 87.75%
Class f: 89.39%
Class g: 77.91%
Class h: 90.11%
Class i: 55.83%
Class j: 86.27%
Class k: 83.21%
Class l: 75.47%
Class m: 93.77%
Class n: 88.06%
Class o: 53.67%
Class p: 94.65%
Class q: 78.28%
Class r: 81.08%
Class s: 77.99%
Class t: 87.90%
Class u: 91.57%
Class v: 85.16%
Class w: 94.56%
Class x: 91.50%
Class y: 88.36%
Class z: 88.40%


In [77]:
def _prepare_char_for_model(ch_img):
    """Apply the same dtype/size normalisation used when the training crops were written."""
    if ch_img.dtype == bool:
        ch_img = ch_img.astype("uint8") * 255
    elif ch_img.dtype in (np.float32, np.float64):
        ch_img = (ch_img * 255).clip(0, 255).astype("uint8")
    else:
        ch_img = ch_img.astype("uint8")
    return cv2.resize(ch_img, (64, 64), interpolation=cv2.INTER_AREA)


def eval_img(img, label):
    """Return True when the model reads the whole CAPTCHA ``img`` as ``label``.

    The image is split with ``normalize_and_segment``; every segment is
    classified independently and the predicted characters are concatenated
    in segment order before being compared with ``label``.

    Side effect: switches the global ``model`` to evaluation mode.

    Args:
        img: CAPTCHA image as accepted by ``normalize_and_segment``.
        label: expected text.
    """
    # Single-sample batches require evaluation mode: BatchNorm1d cannot compute
    # batch statistics from one sample, and dropout must be disabled.
    model.eval()

    chars = normalize_and_segment(img)

    predicted_text = ""
    with torch.no_grad():
        for char_img in chars:
            # NOTE(review): mirrors the preprocessing in
            # segment_captcha_into_characters so inference sees the same kind
            # of input as training; assumes segments are 2-D arrays - confirm.
            char_img = _prepare_char_for_model(char_img)
            tensor_img = transform(char_img).unsqueeze(0).to(device)  # add batch dim
            output = model(tensor_img)
            pred_idx = output.argmax(dim=1).item()
            pred_char = val_dataset.idx_to_char[pred_idx]  # adjust if needed
            predicted_text += pred_char
    return predicted_text == label

In [78]:
# Whole-CAPTCHA accuracy: a CAPTCHA counts as correct only when every
# character is predicted correctly.
test_dir = os.getcwd() + "/../test/"
test_files = os.listdir(test_dir)

total = 0
correct = 0

for img_path in test_files:
    label = get_label(img_path)
    img = get_img(label, folder="/../test/")
    if img is None:
        # Report images that could not be loaded and leave them out of the totals.
        print(label)
        continue
    if eval_img(img, label):
        correct += 1
    total += 1

# Avoid dividing by zero when the folder is empty or nothing could be loaded.
if total:
    print(correct, total, correct/total)
else:
    print(correct, total, "no readable test images")

mkb7
698 1999 0.3491745872936468
