<a href="https://colab.research.google.com/github/adithyaac/Captcha-breaker/blob/main/Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [105]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

IMAGE_SIZE = (150, 50)
BATCH_SIZE = 32
NUM_EPOCHS = 30
LEARNING_RATE = 0.001
MAX_LENGTH = 10
HIDDEN_SIZE = 256

In [106]:
def load_dataset(easy_dir, hard_dir):

    image_paths = []
    words = []

    for directory in [easy_dir, hard_dir]:
        for filename in os.listdir(directory):
            if filename.endswith('.png'):

                image_paths.append(os.path.join(directory, filename))

                word = filename.split('_')[1].split('.')[0]
                words.append(word)

    return image_paths, words

In [107]:
!git clone https://github.com/adithyaac/Captcha-breaker.git

DATASET_PATH = './Captcha-breaker/captcha_dataset'
EASY_DIR = f'{DATASET_PATH}/easy'
HARD_DIR = f'{DATASET_PATH}/hard'



fatal: destination path 'Captcha-breaker' already exists and is not an empty directory.


In [108]:
class CAPTCHADataset(Dataset):

    def __init__(self, image_paths, words, transform=None):

        self.image_paths = image_paths
        self.words = words
        self.transform = transform

        all_chars = set(''.join(words))
        self.char_to_idx = {char: idx + 1 for idx, char in enumerate(sorted(all_chars))}
        self.char_to_idx['<PAD>'] = 0
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}

        print(f"Number of unique characters (including case): {len(all_chars)}")
        print("Character set:", sorted(all_chars))

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):

        image = Image.open(self.image_paths[idx]).convert('RGB')
        if self.transform:
            image = self.transform(image)

        word = self.words[idx]
        word_indices = [self.char_to_idx[c] for c in word]
        word_indices = word_indices + [0] * (MAX_LENGTH - len(word_indices))

        return image, torch.tensor(word_indices), len(word)

In [109]:
class CAPTCHACnnRnn(nn.Module):
    def __init__(self, num_chars, hidden_size=HIDDEN_SIZE):

        super().__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(32),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(64),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(128),
        )

        self.cnn_output_size = 128 * (IMAGE_SIZE[0] // 8) * (IMAGE_SIZE[1] // 8)

        self.reduce_dim = nn.Linear(self.cnn_output_size, hidden_size)

        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

        self.out = nn.Linear(hidden_size, num_chars)

    def forward(self, x):
        batch_size = x.size(0)
        x = self.conv_layers(x)
        x = x.view(batch_size, -1)
        x = self.reduce_dim(x)

        x = x.unsqueeze(1).repeat(1, MAX_LENGTH, 1)

        lstm_out, _ = self.lstm(x)

        output = self.out(lstm_out)
        return output

In [110]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for images, labels, lengths in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            batch_size, seq_len, num_chars = outputs.size()
            loss = criterion(outputs.view(-1, num_chars), labels.view(-1))

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)

        model.eval()
        val_loss = 0.0
        correct_chars = 0
        total_chars = 0

        with torch.no_grad():
            for images, labels, lengths in val_loader:
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)
                loss = criterion(outputs.view(-1, outputs.size(-1)),
                               labels.view(-1))

                val_loss += loss.item()

                _, predicted = outputs.max(2)
                for pred, label, length in zip(predicted, labels, lengths):
                    correct_chars += (pred[:length] == label[:length]).sum().item()
                    total_chars += length

        val_loss = val_loss / len(val_loader)
        val_losses.append(val_loss)
        accuracy = correct_chars / total_chars * 100

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {epoch_loss:.4f}')
        print(f'Val Loss: {val_loss:.4f}')
        print(f'Character Accuracy: {accuracy:.2f}%')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')

    return train_losses, val_losses

In [111]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [112]:
image_paths, words = load_dataset(EASY_DIR, HARD_DIR)

train_paths, test_paths, train_words, test_words = train_test_split(
    image_paths, words, test_size=0.2, random_state=42
)
train_paths, val_paths, train_words, val_words = train_test_split(
    train_paths, train_words, test_size=0.2, random_state=42
)

In [113]:
transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225])
])


In [114]:
train_dataset = CAPTCHADataset(train_paths, train_words, transform)
val_dataset = CAPTCHADataset(val_paths, val_words, transform)
test_dataset = CAPTCHADataset(test_paths, test_words, transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

Number of unique characters (including case): 46
Character set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
Number of unique characters (including case): 46
Character set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
Number of unique characters (including case): 46
Character set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']


In [115]:
num_chars = len(train_dataset.char_to_idx)
model = CAPTCHACnnRnn(num_chars).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

history = train_model(model, train_loader, val_loader, criterion,
                     optimizer, NUM_EPOCHS, device)

Epoch 1/30:
Train Loss: 2.7632
Val Loss: 2.4896
Character Accuracy: 22.86%
Epoch 2/30:
Train Loss: 2.3410
Val Loss: 2.2825
Character Accuracy: 26.40%
Epoch 3/30:
Train Loss: 2.1839
Val Loss: 2.1562
Character Accuracy: 30.08%
Epoch 4/30:
Train Loss: 2.0605
Val Loss: 2.0394
Character Accuracy: 32.17%
Epoch 5/30:
Train Loss: 1.9305
Val Loss: 1.9599
Character Accuracy: 33.93%
Epoch 6/30:
Train Loss: 1.7960
Val Loss: 1.8565
Character Accuracy: 37.29%
Epoch 7/30:
Train Loss: 1.6460
Val Loss: 1.7308
Character Accuracy: 41.34%
Epoch 8/30:
Train Loss: 1.4803
Val Loss: 1.5815
Character Accuracy: 47.02%
Epoch 9/30:
Train Loss: 1.2689
Val Loss: 1.3823
Character Accuracy: 55.13%
Epoch 10/30:
Train Loss: 1.0096
Val Loss: 1.1288
Character Accuracy: 64.74%
Epoch 11/30:
Train Loss: 0.7203
Val Loss: 0.9603
Character Accuracy: 70.20%
Epoch 12/30:
Train Loss: 0.4596
Val Loss: 0.7425
Character Accuracy: 78.67%
Epoch 13/30:
Train Loss: 0.2608
Val Loss: 0.6299
Character Accuracy: 82.23%
Epoch 14/30:
Train Lo

In [116]:
def compute_char_metrics(model, data_loader, device):

    correct_chars = 0
    total_chars = 0
    char_errors = []

    model.eval()
    with torch.no_grad():
        for images, labels, lengths in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = outputs.max(2)

            for pred, label, length in zip(predicted, labels, lengths):
                pred_chars = [train_dataset.idx_to_char[idx.item()] for idx in pred[:length]]
                true_chars = [train_dataset.idx_to_char[idx.item()] for idx in label[:length]]

                correct_chars += sum(p == t for p, t in zip(pred_chars, true_chars))
                total_chars += length

                if pred_chars != true_chars:
                    char_errors.append({
                        'predicted': ''.join(pred_chars),
                        'actual': ''.join(true_chars)
                    })

    accuracy = correct_chars / total_chars * 100
    return accuracy, char_errors

In [117]:
def compute_word_metrics(model, data_loader, device):

    correct_words = 0
    total_words = 0
    word_examples = []

    model.eval()
    with torch.no_grad():
        for images, labels, lengths in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = outputs.max(2)

            for pred, label, length in zip(predicted, labels, lengths):
                pred_word = ''.join([train_dataset.idx_to_char[idx.item()]
                                   for idx in pred[:length]])
                true_word = ''.join([train_dataset.idx_to_char[idx.item()]
                                   for idx in label[:length]])

                is_correct = pred_word == true_word
                if is_correct:
                    correct_words += 1

                word_examples.append({
                    'predicted': pred_word,
                    'actual': true_word,
                    'correct': is_correct
                })
                total_words += 1

    accuracy = correct_words / total_words * 100
    return accuracy, word_examples

In [119]:
def evaluate_model(model_path, test_loader, device):

    char_accuracy, char_errors = compute_char_metrics(model, test_loader, device)
    word_accuracy, word_examples = compute_word_metrics(model, test_loader, device)

    print(f"Character-level Accuracy: {char_accuracy:.2f}%")
    print(f"Word-level Accuracy: {word_accuracy:.2f}%")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
evaluate_model('best_model.pth', test_loader, device)

Character-level Accuracy: 87.47%
Word-level Accuracy: 68.58%


In [122]:
def test_single_image(model, image_path, transform, device):
    model.eval()

    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(image)
        _, predicted = output.max(2)

        pred_word = ''.join([train_dataset.idx_to_char[idx.item()]
                            for idx in predicted[0]])

        print(f"Predicted word: {pred_word}")


test_single_image(model, "0_CHaNCe.png", transform, device)

Predicted word: CHaNCeeeee
