# Is reCAPTCHAv2 Safe? - TIMM Implementation

**Educational Research Project - UFABC Artificial Intelligence Course**

This notebook implements a deep learning classifier using **PyTorch Image Models (timm)** to analyze the viability of reCAPTCHAv2 as a CAPTCHA method.

## Model Options
You can experiment with different pre-trained models by changing `model_name` in CONFIG:
- `efficientnet_b0` - Efficient and fast (default)
- `resnet50` - Classic architecture
- `vit_small_patch16_224` - Vision Transformer
- `convnext_tiny` - Modern ConvNet
- `mobilenetv3_large_100` - Lightweight mobile model

## Features
- Transfer learning with timm pre-trained models
- Data augmentation for improved generalization
- Early stopping to prevent overfitting
- Learning rate scheduling (Cosine Annealing)
- MPS/CUDA/CPU support
- Training visualization and metrics

In [1]:
from utils.dataset_download import DatasetDownloader
from utils.load_datasets import load_datasets

dataset_downloader = DatasetDownloader("../datasets")
dataset_downloader.download_all()

df = load_datasets("../datasets")
display(df.head())
display(df['From'].value_counts())
display(f"Total images loaded: {len(df)}")

Dataset AdityaJain1030/recaptcha-dataset already exists. Skipping download.
Dataset nobodyPerfecZ/recaptchav2-29k already exists. Skipping download.
Dataset cry2003/google-recaptcha-v2-images already exists. Skipping download.
Dataset mikhailma/test-dataset already exists. Skipping download.


Unnamed: 0,Image,Filename,Label,From
0,../datasets/google-recaptcha-v2-images/images/...,0de9e212-2460-4b55-bf30-f4bd9c158c23.jpg,Hydrant,google-recaptcha-v2-images
1,../datasets/google-recaptcha-v2-images/images/...,5d15f820-c4b6-4403-8d21-2e4778072b03.jpg,Hydrant,google-recaptcha-v2-images
2,../datasets/google-recaptcha-v2-images/images/...,1e6a809a-8fc3-4961-9114-c996fdf0eb79.jpg,Hydrant,google-recaptcha-v2-images
3,../datasets/google-recaptcha-v2-images/images/...,Hydrant$bdac55d076bea3ba89bc9a1052331806.png,Hydrant,google-recaptcha-v2-images
4,../datasets/google-recaptcha-v2-images/images/...,77e34738-b086-4b45-bffe-f76032d90592.jpg,Hydrant,google-recaptcha-v2-images


From
google-recaptcha-v2-images    32265
recaptchav2-29k               29568
recaptcha-dataset             11774
test-dataset                    279
Name: count, dtype: int64

'Total images loaded: 73886'

In [2]:
from pathlib import Path
import os
import pandas as pd

# Configuration
max_images = 3000

labels = df['Label'].unique()

label_count = {label: 0 for label in labels}

updt_df_dict = {
    "Image": [],
    "Filename": [],
    "Label": [],
    "From": []
}

# Create shared images directory once (outside the loop)
shared_images_dir = Path("../timm_dataset_shared/images")
shared_images_dir.mkdir(parents=True, exist_ok=True)

# Create symlinks to all images once
print("Creating shared images directory...")
for idx, row in df.iterrows():
    if label_count[row['Label']] < max_images:
        src = Path(row['Image']).absolute()
        unique_filename = f"{row['From']}_{row['Filename']}"
        dst = shared_images_dir / unique_filename

        if not dst.exists():
            os.symlink(src, dst)
        label_count[row['Label']] += 1

        updt_df_dict["Image"].append(row["Image"])
        updt_df_dict["Filename"].append(row["Filename"])
        updt_df_dict["Label"].append(row["Label"])
        updt_df_dict["From"].append(row["From"])

print(f"Shared images directory created with {len(list(shared_images_dir.iterdir()))} images")

df = pd.DataFrame(updt_df_dict)
df.to_parquet("../datasets/datasets_reduced.parquet", index=False)

Creating shared images directory...
Shared images directory created with 26773 images


In [3]:
import pandas as pd
try:
    display(df['Label'].value_counts())
except NameError:
    df = pd.read_parquet('../datasets/datasets_reduced.parquet', engine='pyarrow')
    display(df['Label'].value_counts())

Label
Hydrant          3000
Car              3000
Traffic Light    3000
Other            3000
Bus              3000
Bicycle          3000
Crosswalk        3000
Palm             2580
Bridge           1831
Stair             644
Chimney           389
Motorcycle        297
Mountain           32
Name: count, dtype: int64

In [4]:
from pathlib import Path
import os
import shutil
from sklearn.model_selection import StratifiedKFold

shared_images_dir = Path("../timm_dataset_shared/images")

N_FOLDS = 5
# Setup k-fold
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
labels = df['Label'].unique()

# Store results from all folds
all_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['Label'])):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{N_FOLDS}")
    print(f"{'='*50}")

    # Split data for this fold
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

    print(f"Train: {len(train_df)}, Val: {len(val_df)}")
    print(f"Train label distribution:\n{train_df['Label'].value_counts()}")
    print(f"Val label distribution:\n{val_df['Label'].value_counts()}")

    # Create fold-specific dataset structure
    dataset_root = Path(f"../timm_dataset_fold{fold}")
    labels_file = dataset_root / "labels.txt"

    dataset_root.mkdir(parents=True, exist_ok=True)

    # Build list of images with their splits for this fold
    all_images = []

    # Add training images
    for idx, row in train_df.iterrows():
        unique_filename = f"{row['From']}_{row['Filename']}"
        all_images.append({
            'filename': unique_filename,
            'label': row['Label'],
            'split': 'train'
        })

    # Add validation images
    for idx, row in val_df.iterrows():
        unique_filename = f"{row['From']}_{row['Filename']}"
        all_images.append({
            'filename': unique_filename,
            'label': row['Label'],
            'split': 'val'
        })

    # Create labels file with format: filename label split
    with open(labels_file, 'w') as f:
        f.write("filename\tlabel\tsplit\n")
        for img_info in all_images:
            f.write(f"{img_info['filename']}\t{img_info['label']}\t{img_info['split']}\n")

    # For TIMM classification, create train/val split directories
    # with symlinks pointing to the shared images folder
    train_dir = dataset_root / "train"
    val_dir = dataset_root / "val"

    # Remove old train/val if they exist
    if train_dir.exists():
        shutil.rmtree(train_dir)
    if val_dir.exists():
        shutil.rmtree(val_dir)

    # Create label subdirectories
    for label in labels:
        (train_dir / label).mkdir(parents=True, exist_ok=True)
        (val_dir / label).mkdir(parents=True, exist_ok=True)

    # Create symlinks in train/val pointing to shared images folder
    for img_info in all_images:
        src = (shared_images_dir / img_info['filename']).absolute()
        if img_info['split'] == 'train':
            dst = train_dir / img_info['label'] / img_info['filename']
        else:
            dst = val_dir / img_info['label'] / img_info['filename']

        os.symlink(src, dst)

    print(f"Fold {fold + 1} dataset created!")
    print(f"  - Labels file: {labels_file}")


FOLD 1/5
Train: 21418, Val: 5355
Train label distribution:
Label
Hydrant          2400
Car              2400
Traffic Light    2400
Other            2400
Bus              2400
Bicycle          2400
Crosswalk        2400
Palm             2064
Bridge           1464
Stair             516
Chimney           311
Motorcycle        237
Mountain           26
Name: count, dtype: int64
Val label distribution:
Label
Hydrant          600
Car              600
Traffic Light    600
Other            600
Bus              600
Bicycle          600
Crosswalk        600
Palm             516
Bridge           367
Stair            128
Chimney           78
Motorcycle        60
Mountain           6
Name: count, dtype: int64
Fold 1 dataset created!
  - Labels file: ../timm_dataset_fold0/labels.txt

FOLD 2/5
Train: 21418, Val: 5355
Train label distribution:
Label
Hydrant          2400
Car              2400
Traffic Light    2400
Other            2400
Bus              2400
Bicycle          2400
Crosswalk        2400

In [5]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import timm
from timm.loss import LabelSmoothingCrossEntropy
import os
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd

# Configuration
CONFIG = {
    'model_name': 'efficientnet_b0',
    'img_size': 224,
    'batch_size': 32,
    'epochs': 100,
    'patience': 50,
    'lr': 0.001,
    'device': 'mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu',
    'num_workers': 4,
    'n_folds': 5,  # ADICIONAR
    'save_dir': 'is_recaptchav2_safe/timm_experiment'
}

print(f"Using device: {CONFIG['device']}")
print(f"Model: {CONFIG['model_name']}")

# Create save directory
Path(CONFIG['save_dir']).mkdir(parents=True, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps
Model: efficientnet_b0


In [None]:
# Store results from all folds
all_fold_results = []


# Training and validation functions (definir FORA do loop)
def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(train_loader, desc='Training')
    for inputs, labels in pbar:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        pbar.set_postfix({
            'loss': f'{running_loss / len(pbar):.4f}',
            'acc': f'{100. * correct / total:.2f}%'
        })

    return running_loss / len(train_loader), 100. * correct / total


def validate(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        pbar = tqdm(val_loader, desc='Validation')
        for inputs, labels in pbar:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            pbar.set_postfix({
                'loss': f'{running_loss / len(pbar):.4f}',
                'acc': f'{100. * correct / total:.2f}%'
            })

    return running_loss / len(val_loader), 100. * correct / total


# INÍCIO DO LOOP DE FOLDS
for fold in range(CONFIG['n_folds']):
    print(f"\n{'=' * 60}")
    print(f"PROCESSING FOLD {fold + 1}/{CONFIG['n_folds']}")
    print(f"{'=' * 60}")

    # Update data directory for current fold
    fold_data_dir = f'../timm_dataset_fold{fold}'
    fold_save_dir = os.path.join(CONFIG['save_dir'], f'fold{fold}')
    Path(fold_save_dir).mkdir(parents=True, exist_ok=True)

    # Data Transforms
    train_transform = transforms.Compose([
        transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    val_transform = transforms.Compose([
        transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Load datasets for this fold
    train_dataset = datasets.ImageFolder(
        root=os.path.join(fold_data_dir, 'train'),
        transform=train_transform
    )

    val_dataset = datasets.ImageFolder(
        root=os.path.join(fold_data_dir, 'val'),
        transform=val_transform
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        num_workers=CONFIG['num_workers'],
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers=CONFIG['num_workers'],
        pin_memory=True
    )

    num_classes = len(train_dataset.classes)
    class_names = train_dataset.classes

    print(f"\nDataset Statistics:")
    print(f"Number of classes: {num_classes}")
    print(f"Classes: {class_names}")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")

    # Create model using timm (NOVO modelo para cada fold)
    model = timm.create_model(
        CONFIG['model_name'],
        pretrained=True,
        num_classes=num_classes
    )

    model = model.to(CONFIG['device'])

    # Loss function and optimizer
    criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=0.01)

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=10,
        T_mult=2,
        eta_min=1e-6
    )

    print(f"\nModel created: {CONFIG['model_name']}")
    print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # Training loop with early stopping
    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': [],
        'lr': []
    }

    best_val_acc = 0.0
    patience_counter = 0
    best_model_path = os.path.join(fold_save_dir, 'best.pt')
    last_model_path = os.path.join(fold_save_dir, 'last.pt')

    print("\n" + "=" * 50)
    print(f"Starting Training - Fold {fold + 1}")
    print("=" * 50)

    for epoch in range(CONFIG['epochs']):
        print(f"\nEpoch {epoch + 1}/{CONFIG['epochs']}")
        print("-" * 50)

        # Train
        train_loss, train_acc = train_epoch(
            model, train_loader, criterion, optimizer, CONFIG['device']
        )

        # Validate
        val_loss, val_acc = validate(
            model, val_loader, criterion, CONFIG['device']
        )

        # Update learning rate
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']

        # Save history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['lr'].append(current_lr)

        print(f"\nEpoch Summary:")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
        print(f"Learning Rate: {current_lr:.6f}")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({
                'epoch': epoch,
                'fold': fold,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc,
                'val_loss': val_loss,
                'class_names': class_names
            }, best_model_path)
            print(f"✓ New best model saved! Val Acc: {val_acc:.2f}%")
            patience_counter = 0
        else:
            patience_counter += 1
            print(f"Patience: {patience_counter}/{CONFIG['patience']}")

        # Save last model
        torch.save({
            'epoch': epoch,
            'fold': fold,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_acc': val_acc,
            'val_loss': val_loss,
            'class_names': class_names
        }, last_model_path)

        # Early stopping
        if patience_counter >= CONFIG['patience']:
            print(f"\nEarly stopping triggered after {epoch + 1} epochs")
            break

    print("\n" + "=" * 50)
    print(f"Fold {fold + 1} Training Completed!")
    print("=" * 50)
    print(f"Best validation accuracy: {best_val_acc:.2f}%")
    print(f"Best model saved at: {best_model_path}")

    # Save fold results
    fold_results = {
        'fold': fold + 1,
        'best_val_acc': best_val_acc,
        'final_val_acc': val_acc,
        'final_train_acc': train_acc,
        'final_val_loss': val_loss,
        'final_train_loss': train_loss,
        'epochs_trained': epoch + 1
    }
    all_fold_results.append(fold_results)

    # Save fold history
    df_history = pd.DataFrame(history)
    df_history.to_csv(os.path.join(fold_save_dir, 'training_history.csv'), index=False)
    print(f"Training history saved to: {os.path.join(fold_save_dir, 'training_history.csv')}")

    # Plot training curves for this fold
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Plot loss
    axes[0].plot(history['train_loss'], label='Train Loss', marker='o')
    axes[0].plot(history['val_loss'], label='Val Loss', marker='s')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title(f'Fold {fold + 1} - Training and Validation Loss')
    axes[0].legend()
    axes[0].grid(True)

    # Plot accuracy
    axes[1].plot(history['train_acc'], label='Train Acc', marker='o')
    axes[1].plot(history['val_acc'], label='Val Acc', marker='s')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Accuracy (%)')
    axes[1].set_title(f'Fold {fold + 1} - Training and Validation Accuracy')
    axes[1].legend()
    axes[1].grid(True)

    # Plot learning rate
    axes[2].plot(history['lr'], label='Learning Rate', marker='o', color='green')
    axes[2].set_xlabel('Epoch')
    axes[2].set_ylabel('Learning Rate')
    axes[2].set_title(f'Fold {fold + 1} - Learning Rate Schedule')
    axes[2].set_yscale('log')
    axes[2].legend()
    axes[2].grid(True)

    plt.tight_layout()
    plt.savefig(os.path.join(fold_save_dir, 'training_plots.png'), dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Training plots saved to: {os.path.join(fold_save_dir, 'training_plots.png')}")

# FIM DO LOOP DE FOLDS


PROCESSING FOLD 1/5

Dataset Statistics:
Number of classes: 13
Classes: ['Bicycle', 'Bridge', 'Bus', 'Car', 'Chimney', 'Crosswalk', 'Hydrant', 'Motorcycle', 'Mountain', 'Other', 'Palm', 'Stair', 'Traffic Light']
Training samples: 21418
Validation samples: 5355

Model created: efficientnet_b0
Total parameters: 4,024,201
Trainable parameters: 4,024,201

Starting Training - Fold 1

Epoch 1/100
--------------------------------------------------


Training:  11%|█         | 71/670 [00:41<04:20,  2.30it/s, loss=0.1746, acc=64.13%] 

In [None]:
# Consolidate results from all folds
results_df = pd.DataFrame(all_fold_results)

print("\n" + "=" * 70)
print("CROSS-VALIDATION RESULTS SUMMARY")
print("=" * 70)
print(results_df.to_string(index=False))

print(f"\n{'=' * 70}")
print("STATISTICS")
print("=" * 70)
print(f"Mean Best Val Accuracy: {results_df['best_val_acc'].mean():.2f}% ± {results_df['best_val_acc'].std():.2f}%")
print(f"Mean Final Val Accuracy: {results_df['final_val_acc'].mean():.2f}% ± {results_df['final_val_acc'].std():.2f}%")
print(
    f"Mean Final Train Accuracy: {results_df['final_train_acc'].mean():.2f}% ± {results_df['final_train_acc'].std():.2f}%")
print(f"Mean Epochs Trained: {results_df['epochs_trained'].mean():.1f} ± {results_df['epochs_trained'].std():.1f}")

# Save consolidated results
results_df.to_csv(os.path.join(CONFIG['save_dir'], 'cross_validation_results.csv'), index=False)
print(f"\nResults saved to: {os.path.join(CONFIG['save_dir'], 'cross_validation_results.csv')}")

# Plot comparison across folds
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Best validation accuracy per fold
axes[0].bar(results_df['fold'], results_df['best_val_acc'])
axes[0].axhline(y=results_df['best_val_acc'].mean(), color='r', linestyle='--', label='Mean')
axes[0].set_xlabel('Fold')
axes[0].set_ylabel('Best Validation Accuracy (%)')
axes[0].set_title('Best Validation Accuracy per Fold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Epochs trained per fold
axes[1].bar(results_df['fold'], results_df['epochs_trained'])
axes[1].axhline(y=results_df['epochs_trained'].mean(), color='r', linestyle='--', label='Mean')
axes[1].set_xlabel('Fold')
axes[1].set_ylabel('Epochs Trained')
axes[1].set_title('Epochs Trained per Fold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(CONFIG['save_dir'], 'cross_validation_comparison.png'), dpi=300, bbox_inches='tight')
plt.show()

print(f"Comparison plots saved to: {os.path.join(CONFIG['save_dir'], 'cross_validation_comparison.png')}")