In [1]:
# IMPORT LIBRARIES
# general utilities
import sys
import numpy as np
import random
import os
import glob
import matplotlib.pyplot as plt
import PIL
from pathlib import Path
from collections import Counter
from datetime import datetime
from PIL import Image

# deep learning frameworks
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms


# data processing, loading and splitting
from torchvision.io import read_image
from torchvision.transforms import Compose, Resize, ToTensor
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split 

In [2]:
# Fix every random number generator used by the notebook so that weight
# initialisation and DataLoader shuffling are repeatable on Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device: {device}")

Device: mps


In [3]:
# CORRUPTED IMAGE DETECTION (to remove before training)
# Walks the training folder and collects the path of every image that Pillow
# fails to open or verify.
image_folder = "dataset/train"
corrupted_files = []

# Sorted so the report is stable between runs (os.listdir order is arbitrary).
for file in sorted(os.listdir(image_folder)):
    # Match extensions case-insensitively so files such as "X.JPG" are checked too.
    if file.lower().endswith(('jpg', 'png')):
        file_path = os.path.join(image_folder, file)
        try:
            # The context manager releases the file handle even when verify() raises.
            with Image.open(file_path) as img:
                img.verify()
        except Exception:
            # Best-effort scan: any failure to open/verify marks the file as corrupted.
            corrupted_files.append(file_path)

print(f"Corrupted images: {corrupted_files}")

Corrupted images: []


In [4]:
# DATASET CLASS
class CaptchaDataset(Dataset):
    """CAPTCHA images stored in one folder, labelled by their file name.

    Every file in ``image_folder`` whose name ends with ``jpg`` or ``png`` is a
    sample. The label is the file name without its extension, encoded one
    character at a time with :meth:`char_to_index`.

    Args:
        image_folder: Directory containing the image files.
        transform: Optional callable applied to the grayscale PIL image. When
            omitted, the image is converted with ``transforms.ToTensor()``.
    """

    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        # Sorted so that the index -> sample mapping is identical on every run;
        # os.listdir returns entries in arbitrary order.
        self.image_files = sorted(
            f for f in os.listdir(image_folder) if f.endswith(('jpg', 'png'))
        )
        self.transform = transform

    def __len__(self):
        """Return the number of image files found in the folder."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, label_tensor)`` for the sample at position ``idx``.

        Raises:
            ValueError: If the file name contains a character outside
                ``0-9``, ``A-Z``, ``a-z``.
        """
        file_name = self.image_files[idx]
        img_path = os.path.join(self.image_folder, file_name)

        # convert() produces an independent in-memory image, so the file handle
        # can be released as soon as the conversion to grayscale is done.
        with Image.open(img_path) as img:
            image = img.convert("L")
        label = os.path.splitext(file_name)[0]  # label = file name without extension

        # Apply the transform to the image (convert to tensor, normalize, etc.)
        if self.transform:
            image = self.transform(image)
        else:
            image = transforms.ToTensor()(image)  # Default conversion to tensor

        # One class index per character; explicit integer dtype so the tensor
        # stays usable as a class-index target even for an empty label.
        label_tensor = torch.tensor(
            [self.char_to_index(c) for c in label], dtype=torch.long
        )
        return image, label_tensor

    def char_to_index(self, char):
        """Map one label character to its class index (62 classes).

        ``'0'-'9'`` -> 0-9, ``'A'-'Z'`` -> 10-35, ``'a'-'z'`` -> 36-61.

        Raises:
            ValueError: If ``char`` is not an ASCII digit or ASCII letter.
        """
        # isascii() rejects non-ASCII digits (e.g. Arabic-Indic), which isdigit()
        # alone accepts and int() would silently map into the 0-9 classes.
        if char.isascii() and char.isdigit():  # '0'-'9' -> 0-9
            return int(char)
        elif 'A' <= char <= 'Z':  # 'A'-'Z' -> 10-35
            return ord(char) - ord('A') + 10
        elif 'a' <= char <= 'z':  # 'a'-'z' -> 36-61
            return ord(char) - ord('a') + 36
        else:
            raise ValueError(f"Invalid character: {char}")


# Preprocessing pipeline applied to every image before it reaches the network
transform = transforms.Compose(
    [
        # Bring all images to a common 64 x 128 (height x width) size
        transforms.Resize((64, 128)),
        # PIL image -> tensor
        transforms.ToTensor(),
        # Per-channel normalisation with mean 0.5 and std 0.5 (single channel)
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

In [5]:
# TRAINING and VALIDATION data
# Locations of the dataset splits on disk
train_dataset_path, val_dataset_path, test_dataset_path = (
    "dataset/train",
    "dataset/val",
    "dataset/test",
)

# Wrap the training and validation folders with the shared preprocessing
train_dataset = CaptchaDataset(image_folder=train_dataset_path, transform=transform)
val_dataset = CaptchaDataset(image_folder=val_dataset_path, transform=transform)

# Batch iterators of 32 samples; shuffling is enabled for the training split only
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)


In [6]:
# CNN DEFINITION
class CaptchaSolverCNN(nn.Module):
    """Convolutional network that classifies every character of a CAPTCHA at once.

    Three ``Conv2d -> ReLU -> MaxPool2d(2)`` stages are followed by two fully
    connected layers. The final layer emits ``num_chars * num_classes`` values
    that are reshaped to one score vector per character position.

    Args:
        num_chars: Number of character positions per CAPTCHA (default 5).
        num_classes: Number of classes per position (default 62: 0-9, A-Z, a-z).
        input_size: ``(height, width)`` of the single-channel input images
            (default ``(64, 128)``). Used to size the first linear layer.
    """

    def __init__(self, num_chars=5, num_classes=62, input_size=(64, 128)):
        super().__init__()
        self.num_chars = num_chars
        self.num_classes = num_classes

        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # The padded 3x3 convolutions keep the spatial size; each of the three
        # MaxPool2d(2) layers halves (floors) it, i.e. an overall division by 8.
        # For the default 64 x 128 input this gives 128 * 8 * 16 features.
        height, width = input_size
        flat_features = 128 * (height // 8) * (width // 8)

        self.fc_layers = nn.Sequential(
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_chars * num_classes),
        )

    def forward(self, x):
        """Return scores of shape ``(batch, num_chars, num_classes)``."""
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # Flatten everything except the batch dimension
        x = self.fc_layers(x)
        return x.view(x.size(0), self.num_chars, self.num_classes)

In [7]:
# MODEL, LOSS FUNCTION AND OPTIMIZER SETUP
# Build the network, then place its parameters on the selected device
model = CaptchaSolverCNN()
model = model.to(device)

# Cross-entropy loss; Adam optimizer with learning rate 1e-3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [8]:
# TRAINING FUNCTION
def train(model, train_loader, val_loader, epochs=20, loss_fn=None, opt=None, device=None):
    """Train ``model`` and print loss/accuracy after every epoch.

    The loss of a batch is the sum, over all character positions, of
    ``loss_fn`` applied to that position's scores and targets. Validation
    accuracy counts a sample as correct only when every position matches.

    Args:
        model: Module returning scores of shape ``(batch, positions, classes)``.
        train_loader: Iterable of ``(images, labels)`` training batches.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        loss_fn: Per-position loss. Defaults to the notebook-level ``criterion``.
        opt: Optimizer updating ``model``. Defaults to the notebook-level
            ``optimizer``.
        device: Device to train on. Defaults to CUDA, then MPS, then CPU,
            whichever is available first.

    Returns:
        None. Progress is reported through ``print``.
    """
    if device is None:
        if torch.cuda.is_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

    # Fall back to the objects created at notebook level so that existing
    # calls ``train(model, train_loader, val_loader)`` behave as before.
    if loss_fn is None:
        loss_fn = criterion
    if opt is None:
        opt = optimizer

    model.to(device)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Training loop
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            opt.zero_grad()
            outputs = model(images)
            # Number of character positions is read from the output, not hard-coded
            loss = sum(
                loss_fn(outputs[:, i], labels[:, i]) for i in range(outputs.size(1))
            )
            loss.backward()
            opt.step()
            running_loss += loss.item()

        # Validation step
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)
                batch_loss = sum(
                    loss_fn(outputs[:, i], labels[:, i]) for i in range(outputs.size(1))
                )
                # Single host sync per batch instead of one per character position
                val_loss += batch_loss.item()
                preds = outputs.argmax(dim=2)  # Most likely class per position
                # Compare on the device; a sample counts only if all positions match
                correct += (preds == labels).all(dim=1).sum().item()
                total += labels.size(0)

        # Empty loaders would otherwise raise ZeroDivisionError; report NaN instead
        nan = float("nan")
        train_batches = len(train_loader)
        val_batches = len(val_loader)
        avg_train_loss = running_loss / train_batches if train_batches else nan
        avg_val_loss = val_loss / val_batches if val_batches else nan
        accuracy = 100 * correct / total if total else nan

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_train_loss:.4f}, "
              f"Validation Loss: {avg_val_loss:.4f}, "
              f"Accuracy: {accuracy:.2f}%")



In [9]:
# Run the full training schedule (20 epochs) on the objects built above
train(model=model, train_loader=train_loader, val_loader=val_loader, epochs=20)

Epoch [1/20], Loss: 18.1815, Validation Loss: 14.5192, Accuracy: 0.15%
Epoch [2/20], Loss: 14.4477, Validation Loss: 11.9827, Accuracy: 0.91%
Epoch [3/20], Loss: 12.8517, Validation Loss: 10.5220, Accuracy: 2.10%
Epoch [4/20], Loss: 11.8544, Validation Loss: 9.6562, Accuracy: 3.25%
Epoch [5/20], Loss: 11.1618, Validation Loss: 9.0675, Accuracy: 4.22%
Epoch [6/20], Loss: 10.6364, Validation Loss: 8.7015, Accuracy: 5.12%
Epoch [7/20], Loss: 10.2191, Validation Loss: 8.4826, Accuracy: 5.97%
Epoch [8/20], Loss: 9.8609, Validation Loss: 8.0876, Accuracy: 6.79%
Epoch [9/20], Loss: 9.5729, Validation Loss: 7.9409, Accuracy: 7.60%
Epoch [10/20], Loss: 9.2912, Validation Loss: 7.7116, Accuracy: 8.30%
Epoch [11/20], Loss: 9.0658, Validation Loss: 7.8226, Accuracy: 8.28%
Epoch [12/20], Loss: 8.8762, Validation Loss: 7.5882, Accuracy: 8.75%
Epoch [13/20], Loss: 8.6776, Validation Loss: 7.3886, Accuracy: 10.01%
Epoch [14/20], Loss: 8.5135, Validation Loss: 7.4213, Accuracy: 9.71%
Epoch [15/20], Los