In [8]:
# Check Python version
!python3 --version

# Upgrade pip
!pip install --upgrade pip

# Install required libraries
!pip install torch torchvision torchaudio --quiet
!pip install numpy pillow matplotlib --quiet

Python 3.11.13
Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 22.0 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2


In [68]:
from pathlib import Path
import csv

# Folder that contains the part*/<split>/ dataset directories; edit to match your layout.
DATA_ROOT = Path("data")

# Lookup table from YOLO class index to character (digits first, then A-Z).
# Only consulted when labels are rebuilt from YOLO .txt files.
ALPHABET = "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Dataset parts to process; trim the list to work on a subset.
PARTS = ["part2", "part3", "part4"]

# Splits looked up inside each part; splits that are missing on disk are skipped.
SPLITS = ["train", "val", "test"]

In [69]:
def read_yolo_text_label_file(txt_path: Path, alphabet: str) -> str:
    """Rebuild a captcha string from a YOLO label file.

    Every usable line has at least five whitespace-separated fields,
    ``class x_center y_center w h ...``. Only the first two are read: the class
    index selects a character from ``alphabet`` and ``x_center`` fixes the
    left-to-right order of the characters.

    Args:
        txt_path: Path of the YOLO ``.txt`` file.
        alphabet: Characters indexed by class id.

    Returns:
        The characters ordered by ascending x-center. An empty string is
        returned when the file does not exist or has no usable line.

    Lines that are blank, have fewer than five fields, carry non-numeric
    values, or reference a class outside ``alphabet`` are skipped.
    """
    if not txt_path.exists():
        return ""
    items = []
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also strips a leading
    # byte-order mark. With plain "utf-8" the BOM stays glued to the first class id,
    # the numeric conversion fails, and the first character is silently lost.
    with txt_path.open("r", encoding="utf-8-sig") as f:
        for line in f:
            # split() with no argument discards surrounding whitespace, so a blank
            # line yields an empty list and is rejected by the length check.
            parts = line.split()
            if len(parts) < 5:
                continue
            try:
                # Go through float() so class ids written as "3.0" are accepted.
                cls = int(float(parts[0]))
                x_center = float(parts[1])
            except (ValueError, OverflowError):
                # ValueError: non-numeric text or NaN; OverflowError: int(inf).
                continue
            if 0 <= cls < len(alphabet):
                items.append((x_center, cls))
    # Stable sort: detections sharing an x-center keep their order in the file.
    items.sort(key=lambda t: t[0])
    return "".join(alphabet[cls] for _, cls in items)

In [70]:
def _existing_csv_has_labels(csv_path: Path) -> bool:
    """Return True if ``csv_path`` holds a non-empty ``label`` value or cannot be read."""
    try:
        with csv_path.open("r", newline="", encoding="utf-8") as f:
            return any((row.get("label") or "").strip() for row in csv.DictReader(f))
    except (OSError, UnicodeDecodeError, csv.Error):
        # Unreadable file: err on the side of not overwriting it.
        return True


def ensure_annotations_csv(part_dir: Path, split: str, alphabet: str):
    """Write ``<part_dir>/<split>/annotations.csv`` with ``filename,label`` rows.

    One row is produced per ``.png`` / ``.jpg`` / ``.jpeg`` file (suffix matched
    case-insensitively) found in ``<part_dir>/<split>/images``, in sorted order.

    * If ``<part_dir>/<split>/labels`` exists, each label is rebuilt from the
      YOLO file ``labels/<image stem>.txt`` via ``read_yolo_text_label_file``.
    * Otherwise every label is the empty string, and a warning is printed
      unless ``split`` is ``"test"``.

    Nothing is written when the ``images`` folder is missing. Nothing is
    written either when there is no ``labels`` folder but an ``annotations.csv``
    that already contains labels is present: regenerating it could only replace
    real labels with empty ones.

    Args:
        part_dir: Directory of one dataset part.
        split: Name of the split sub-directory.
        alphabet: Class-index to character table passed to the YOLO reader.

    Returns:
        None. Progress is reported with ``print``.
    """
    split_dir = part_dir / split
    img_dir = split_dir / "images"
    lab_dir = split_dir / "labels"

    if not img_dir.exists():
        print(f"[SKIP] {img_dir} does not exist.")
        return

    out_csv = split_dir / "annotations.csv"
    has_labels = lab_dir.exists()

    # Without YOLO labels the only thing we can write is empty placeholders, so
    # never let that clobber a CSV that already carries real labels.
    if not has_labels and out_csv.exists() and _existing_csv_has_labels(out_csv):
        print(f"[SKIP] {out_csv} already contains labels and {lab_dir} does not exist; "
              f"leaving the existing file untouched.")
        return

    image_suffixes = {".png", ".jpg", ".jpeg"}
    image_paths = [
        img_path
        for img_path in sorted(img_dir.iterdir())
        if img_path.suffix.lower() in image_suffixes
    ]

    if has_labels:
        # Build from YOLO txts
        rows = [
            (img_path.name,
             read_yolo_text_label_file(lab_dir / f"{img_path.stem}.txt", alphabet))
            for img_path in image_paths
        ]
    else:
        # No labels folder: create empty labels
        rows = [(img_path.name, "") for img_path in image_paths]

        if split != "test":
            print(f"[WARN] {split_dir} has no labels/ folder. Wrote empty labels for '{split}'. "
                  f"Train/val will only work if you switch your training to --label_mode yolo "
                  f"and actually provide labels, or you add a proper annotations.csv later.")

    # write CSV (filename,label) without pandas
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    with out_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["filename", "label"])
        w.writerows(rows)

    print(f"[OK] Wrote {out_csv} ({len(rows)} rows)  | labels_found={has_labels}")

In [None]:
# Generate annotations.csv for every configured part/split combination.
for p in PARTS:
    part_dir = DATA_ROOT / p
    if part_dir.exists():
        # Splits without an images/ folder are reported and skipped by
        # ensure_annotations_csv itself.
        for split in SPLITS:
            ensure_annotations_csv(part_dir, split, ALPHABET)
    else:
        print(f"[SKIP] {part_dir} not found.")

In [25]:
!python train.py \
  --data_root data/part2 \ 
  --annotations annotations.csv \
  --label_mode csv \
  --img_h 64 --img_w 320 \
  --batch_size 128 --epochs 20 \
  --lr 3e-4 --scheduler cosine \
  --rnn_hidden 320 \
  --use_external_aug \
  --aug_strength mild \
  --rotation_mode resize

[INFO] Using predefined train/val/test splits
[CHECK] train: images=60000  label_rows=60000  matched=60000
[CHECK] label histogram (top 10 of 270279): [('0', 17502), ('1', 13080), ('2', 11354), ('3', 9910), ('4', 9070), ('5', 8700), ('6', 8238), ('7', 8016), ('8', 7633), ('9', 7502)]
[CHECK] val: images=20000  label_rows=20000  matched=20000
[CHECK] label histogram (top 10 of 90223): [('0', 5791), ('1', 4405), ('2', 3716), ('3', 3326), ('4', 3056), ('5', 2953), ('6', 2850), ('7', 2596), ('8', 2476), ('9', 2434)]
[INFO] time_steps=10  max_label_len=7
[INFO] Starting training for 20 epochs on cuda
[DEBUG] batch shapes: images (128, 1, 64, 320) targets_len 563 sum(target_lengths) 563
[E001] train_loss=8.1566  val_loss=8.1568  acc=0.0000  cer=1.1363  time=284.5s  lr=0.000150
[PEEK] sample predictions:
  gt='6NXW'  pred='Y1'
  gt='OO2'  pred='Y1Y'
  gt='06ADQ2V'  pred='Y1'
  gt='23B'  pred='Y'
  gt='HR521'  pred='Y1Y1Y'
[INFO] Saved best checkpoint to /root/captcha_solver/models/vgg16_lstm_

In [55]:
# Run error_analysis.py on the val split of data/part2 with the epoch-20 checkpoint.
# NOTE(review): the checkpoint path here is relative to the working directory, while
# later cells pass absolute paths to a file of the same name; confirm they all point
# at the same checkpoint.
!python error_analysis.py \
  --data_root data/part2 \
  --ckpt vgg16_lstm_ctc_epoch020.pth \
  --split val \
  --batch_size 128 \
  --num_workers 2 \
  --device auto \
  --out_prefix . 



=== Overall ===
split: val
val_loss: 0.497411
seq_accuracy: 0.683600
cer_mean: 0.093740
num_samples: 20000

=== Top 15 most error-prone characters (by error_rate) ===
char  errors  support  error_rate  subs_from_char  deletions_of_char  correct
   O     939     2101    0.446930             925                 14     1162
   Q     605     2097    0.288507             586                 19     1492
   D     437     2407    0.181554             418                 19     1970
   R     223     2081    0.107160             213                 10     1858
   7     277     2596    0.106703             250                 27     2319
   B     256     2405    0.106445             239                 17     2149
   T     211     2134    0.098875             193                 18     1923
   1     426     4405    0.096708             383                 43     3979
   G     201     2247    0.089453             187                 14     2046
   A     189     2424    0.077970             174   

In [56]:
# Run visualize_predictions.py on the test split of data/part2, asking for 64 samples
# in 8 columns; outputs go to test_preds_grid.png and test_preds.csv.
# NOTE(review): the checkpoint path is a hardcoded absolute /kaggle/working location,
# which differs from the paths used by the other cells; confirm before re-running
# elsewhere.
!python visualize_predictions.py \
  --data_root data/part2 \
  --ckpt /kaggle/working/vgg16_lstm_ctc_epoch020.pth \
  --split test \
  --num_samples 64 \
  --cols 8 \
  --out_image test_preds_grid.png \
  --out_csv test_preds.csv \
  --device auto


Saved image grid to: test_preds_grid.png
Saved predictions CSV to: test_preds.csv


In [57]:
# Run export_test_predictions.py for data/part2, writing test_predictions.json.
# NOTE(review): --ckpt is an absolute path at the filesystem root, unlike the
# relative and /kaggle/working paths used by earlier cells; confirm it is intended.
# NOTE(review): this cell is repeated below for part3 and part4 with only
# --data_root and --out_json changed; a loop over the parts would remove the duplication.
!python export_test_predictions.py \
  --data_root data/part2 \
  --ckpt /vgg16_lstm_ctc_epoch020.pth \
  --out_json test_predictions.json \
  --batch_size 128 \
  --device auto


[OK] Wrote 20000 entries to test_predictions.json


In [64]:
# Run export_test_predictions.py for data/part3, writing test_predictions_part3.json.
# NOTE(review): --ckpt is an absolute path at the filesystem root; confirm it is
# intended. This reuses the epoch-20 checkpoint that the training cell produced
# from data/part2.
!python export_test_predictions.py \
  --data_root data/part3 \
  --ckpt /vgg16_lstm_ctc_epoch020.pth \
  --out_json test_predictions_part3.json \
  --batch_size 128 \
  --device auto


[OK] Wrote 20000 entries to test_predictions_part3.json


In [65]:
# Run export_test_predictions.py for data/part4, writing test_predictions_part4.json.
# NOTE(review): --ckpt is an absolute path at the filesystem root; confirm it is
# intended. This reuses the epoch-20 checkpoint that the training cell produced
# from data/part2.
!python export_test_predictions.py \
  --data_root data/part4 \
  --ckpt /vgg16_lstm_ctc_epoch020.pth \
  --out_json test_predictions_part4.json \
  --batch_size 128 \
  --device auto

[OK] Wrote 20000 entries to test_predictions_part4.json
