<a href="https://colab.research.google.com/github/adithyaac/Captcha-breaker/blob/main/Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split

# --- Data -------------------------------------------------------------
# Target size handed to transforms.Resize; also used by the model to derive
# the flattened CNN feature size.
IMAGE_SIZE = (150, 50)
# Every label is padded to this many positions; the model emits this many
# time steps per image.
MAX_LENGTH = 10
# Number of samples per DataLoader batch.
BATCH_SIZE = 32

# --- Model / optimisation ---------------------------------------------
# Default width of the reduced CNN feature vector and of the LSTM state.
HIDDEN_SIZE = 256
# Number of passes over the training loader.
NUM_EPOCHS = 50
# Learning rate given to the Adam optimizer.
LEARNING_RATE = 1e-3

In [None]:
def load_dataset(easy_dir, hard_dir):
    """Collect CAPTCHA image paths and their ground-truth words.

    Both directories are scanned for files ending in ``.png``. The label is
    read from the file name: the second underscore-separated field, cut at
    its first dot (``<prefix>_<word>.png`` -> ``<word>``).

    Args:
        easy_dir: Directory containing the "easy" samples.
        hard_dir: Directory containing the "hard" samples.

    Returns:
        A ``(image_paths, words)`` tuple of two parallel lists. Entries of
        ``easy_dir`` come first, each directory in sorted file-name order.

    Raises:
        IndexError: If a ``.png`` file name contains no underscore.
        OSError: If either directory cannot be listed.
    """
    image_paths = []
    words = []

    for directory in (easy_dir, hard_dir):
        # os.listdir() yields entries in arbitrary, platform-dependent order.
        # Sorting keeps the sample order -- and therefore any seeded
        # train/val/test split built on top of it -- reproducible.
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith('.png'):
                continue

            image_paths.append(os.path.join(directory, filename))
            words.append(filename.split('_')[1].split('.')[0])

    return image_paths, words

In [None]:
!git clone https://github.com/adithyaac/Captcha-breaker.git

# Dataset root inside the cloned repository, with one sub-folder per
# difficulty level.
DATASET_PATH = './Captcha-breaker/captcha_dataset'
EASY_DIR = DATASET_PATH + '/easy'
HARD_DIR = DATASET_PATH + '/hard'



In [None]:
class CAPTCHADataset(Dataset):
    """CAPTCHA images paired with their zero-padded character-index labels.

    Args:
        image_paths: Sequence of image file paths.
        words: Sequence of label strings, parallel to ``image_paths``.
        transform: Optional callable applied to the RGB PIL image.
        char_to_idx: Optional existing character -> index mapping (for
            example another dataset's ``char_to_idx``). Passing the same
            mapping to every split guarantees that a given index means the
            same character everywhere. When omitted, the mapping is built
            from ``words`` (case-sensitive, sorted, starting at 1) with
            ``'<PAD>'`` at index 0.

    Raises:
        ValueError: If ``char_to_idx`` is given but lacks a character that
            occurs in ``words``.
    """

    def __init__(self, image_paths, words, transform=None, char_to_idx=None):

        self.image_paths = image_paths
        self.words = words
        self.transform = transform

        # Case is preserved: upper- and lower-case letters are distinct.
        all_chars = set(''.join(words))

        if char_to_idx is None:
            # Index 0 is reserved for padding, so real characters start at 1.
            self.char_to_idx = {char: idx + 1 for idx, char in enumerate(sorted(all_chars))}
            self.char_to_idx['<PAD>'] = 0  # Add padding token
        else:
            missing = sorted(all_chars - set(char_to_idx))
            if missing:
                raise ValueError(
                    f"char_to_idx is missing characters found in words: {missing}"
                )
            # Copy so later changes to the caller's dict do not leak in.
            self.char_to_idx = dict(char_to_idx)
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}

        print(f"Number of unique characters (including case): {len(all_chars)}")
        print("Character set:", sorted(all_chars))

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label_indices, word_length)`` for sample ``idx``.

        ``label_indices`` is a tensor of exactly ``MAX_LENGTH`` entries,
        right-padded with 0; ``word_length`` is the unpadded length.

        Raises:
            ValueError: If the word is longer than ``MAX_LENGTH``.
        """
        word = self.words[idx]
        if len(word) > MAX_LENGTH:
            # Without this check the label tensor would silently exceed
            # MAX_LENGTH and only fail later, during batching or in the loss.
            raise ValueError(
                f"Word {word!r} has {len(word)} characters, more than MAX_LENGTH={MAX_LENGTH}"
            )

        # The context manager releases the file handle as soon as the pixel
        # data has been converted.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Convert word to indices, preserving case, then pad with 0.
        word_indices = [self.char_to_idx[c] for c in word]
        word_indices = word_indices + [0] * (MAX_LENGTH - len(word_indices))

        return image, torch.tensor(word_indices), len(word)

In [None]:
class CAPTCHACnnRnn(nn.Module):
    """CNN feature extractor followed by an LSTM character decoder.

    The image is encoded into one ``hidden_size`` vector; that vector is fed
    to the LSTM at each of the ``MAX_LENGTH`` steps, and every step is
    projected to ``num_chars`` scores.

    Args:
        num_chars: Number of output classes per position.
        hidden_size: Size of the reduced feature vector and the LSTM state.
    """

    def __init__(self, num_chars, hidden_size=HIDDEN_SIZE):

        super().__init__()

        # Three stages of conv -> ReLU -> 2x2 max-pool -> batch norm, going
        # from 3 input channels to 128.
        stages = []
        for in_channels, out_channels in ((3, 32), (32, 64), (64, 128)):
            stages.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.BatchNorm2d(out_channels),
            ])
        self.conv_layers = nn.Sequential(*stages)

        # Three poolings by 2 shrink each spatial dimension by a factor of 8.
        pooled_dim0 = IMAGE_SIZE[0] // 8
        pooled_dim1 = IMAGE_SIZE[1] // 8
        self.cnn_output_size = 128 * pooled_dim0 * pooled_dim1

        # Project the flattened CNN features down to the LSTM input width.
        self.reduce_dim = nn.Linear(self.cnn_output_size, hidden_size)

        # Sequence model over the repeated image embedding.
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

        # Per-step character classifier.
        self.out = nn.Linear(hidden_size, num_chars)

    def forward(self, x):
        """Return scores shaped ``(batch, MAX_LENGTH, num_chars)`` for image batch ``x``."""
        n_samples = x.size(0)

        # Encode and flatten each image into a single vector.
        features = self.conv_layers(x).view(n_samples, -1)
        embedding = self.reduce_dim(features)

        # Present the same embedding at every time step.
        sequence = embedding.unsqueeze(1).repeat(1, MAX_LENGTH, 1)

        lstm_out, _ = self.lstm(sequence)
        return self.out(lstm_out)

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Args:
        model: Module mapping an image batch to scores shaped
            ``(batch, seq_len, num_chars)``.
        train_loader: Iterable of ``(images, labels, lengths)`` batches used
            for optimisation; must support ``len()``.
        val_loader: Iterable of ``(images, labels, lengths)`` batches used
            for evaluation; must support ``len()``.
        criterion: Loss taking ``(N, num_chars)`` scores and ``(N,)`` targets.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the image and label batches are moved to.

    Returns:
        ``(train_losses, val_losses)``: two lists with one mean loss per
        epoch. A mean over an empty loader is reported as ``nan``.

    Side effects:
        Prints per-epoch metrics and writes ``best_model.pth`` to the current
        working directory whenever the validation loss improves.
    """
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0

        for images, labels, _ in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            # Flatten the (batch, seq) axes so every position is one sample.
            loss = criterion(outputs.reshape(-1, outputs.size(-1)),
                             labels.reshape(-1))

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        num_train_batches = len(train_loader)
        epoch_loss = running_loss / num_train_batches if num_train_batches else float('nan')
        train_losses.append(epoch_loss)

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct_chars = 0
        total_chars = 0

        with torch.no_grad():
            for images, labels, lengths in val_loader:
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)
                loss = criterion(outputs.reshape(-1, outputs.size(-1)),
                                 labels.reshape(-1))

                val_loss += loss.item()

                # Character accuracy over the unpadded prefix of each label.
                predicted = outputs.argmax(dim=2)
                for pred, label, length in zip(predicted, labels, lengths):
                    # Lengths arrive as 0-dim tensors after default collation;
                    # keep the counters as plain Python ints.
                    length = int(length)
                    correct_chars += (pred[:length] == label[:length]).sum().item()
                    total_chars += length

        num_val_batches = len(val_loader)
        # nan never compares below best_val_loss, so an empty validation set
        # cannot trigger a checkpoint.
        val_loss = val_loss / num_val_batches if num_val_batches else float('nan')
        val_losses.append(val_loss)
        accuracy = correct_chars / total_chars * 100 if total_chars else 0.0

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {epoch_loss:.4f}')
        print(f'Val Loss: {val_loss:.4f}')
        print(f'Character Accuracy: {accuracy:.2f}%')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # torch.save() has no `weights_only` argument (that flag belongs
            # to torch.load); passing it raised TypeError on the first save.
            torch.save(model.state_dict(), 'best_model.pth')

    return train_losses, val_losses

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Gather every labelled sample from both difficulty folders.
image_paths, words = load_dataset(EASY_DIR, HARD_DIR)

# First hold out 20% of all samples for testing ...
trainval_paths, test_paths, trainval_words, test_words = train_test_split(
    image_paths,
    words,
    test_size=0.2,
    random_state=42,
)

# ... then hold out 20% of the remainder for validation. The fixed seed makes
# both splits repeatable for a given input order.
train_paths, val_paths, train_words, val_words = train_test_split(
    trainval_paths,
    trainval_words,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preprocessing applied to every image: resize, convert to a tensor, then
# normalise each of the three channels.
# NOTE(review): transforms.Resize reads a 2-item size as (height, width), so
# IMAGE_SIZE = (150, 50) produces 150-tall, 50-wide inputs -- confirm this
# orientation is what the CAPTCHA images need.
preprocessing_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    # These values match the commonly used ImageNet channel statistics.
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(preprocessing_steps)


In [None]:
# Create datasets
train_dataset = CAPTCHADataset(train_paths, train_words, transform)
val_dataset = CAPTCHADataset(val_paths, val_words, transform)
test_dataset = CAPTCHADataset(test_paths, test_words, transform)

# Each dataset builds its character table from its own words only. If a
# character is absent from one split, every later index shifts there and the
# same index then means different characters in train, val and test. Install
# one table, built from all three splits, on every dataset so labels, model
# outputs and metrics agree.
shared_chars = sorted({
    char
    for split_words in (train_words, val_words, test_words)
    for word in split_words
    for char in word
})
shared_char_to_idx = {char: idx + 1 for idx, char in enumerate(shared_chars)}
shared_char_to_idx['<PAD>'] = 0  # index 0 stays reserved for padding
shared_idx_to_char = {idx: char for char, idx in shared_char_to_idx.items()}
for dataset in (train_dataset, val_dataset, test_dataset):
    dataset.char_to_idx = shared_char_to_idx
    dataset.idx_to_char = shared_idx_to_char

# Create dataloaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Build the network with one output class per entry of the training
# dataset's character table (padding token included).
num_chars = len(train_dataset.char_to_idx)
model = CAPTCHACnnRnn(num_chars)
model = model.to(device)

# Targets equal to 0 (the padding index) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Run training; `history` holds the (train_losses, val_losses) pair.
history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=NUM_EPOCHS,
    device=device,
)