# Vision-Caption Projector Training

Train the projector on COCO Captions. Works on **Colab** and **Kaggle**.

**IMPORTANT:** Enable GPU + Internet first!
- **Colab:** Runtime → Change runtime type → T4 GPU
- **Kaggle:** Settings → Accelerator → GPU T4 x2 + Internet ON

**If Kaggle logs show `Accelerator: None`, GPU is not enabled.**
**If `git clone` fails with `Could not resolve host`, Internet is OFF.**

In [None]:
# Step 1: Clone and install (fails fast if Internet/GPU are off)
import os
import socket
import subprocess
import sys


def require_internet() -> None:
    """Abort the cell early when outbound networking is unavailable.

    Resolves the ``github.com`` hostname as a cheap connectivity probe.

    Raises:
        SystemExit: if the lookup fails with an ``OSError``; the original
            error is chained as the cause.
    """
    probe_host = "github.com"
    try:
        socket.gethostbyname(probe_host)
    except OSError as lookup_error:
        hint = "Internet is OFF. In Kaggle: Settings → Internet → ON."
        raise SystemExit(hint) from lookup_error


# Fail fast before attempting the clone or any pip download.
require_internet()

REPO_NAME = "vision-caption"

# os.chdir persists for the lifetime of the kernel, so re-running this cell
# starts *inside* the checkout. Only clone/enter the repository when the
# current directory is not already the checkout; otherwise a second, nested
# clone would be created and entered.
if os.path.basename(os.getcwd()) != REPO_NAME:
    if not os.path.isdir(REPO_NAME):
        subprocess.run(
            ["git", "clone", "https://github.com/asynced24/vision-caption.git"],
            check=True,
        )
    os.chdir(REPO_NAME)

# Install the project in editable mode, then pycocotools, using the kernel's
# own interpreter so the packages land in the environment the notebook uses.
for pip_target in (["-e", "."], ["pycocotools"]):
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *pip_target, "-q"],
        check=True,
    )

# Any failure while importing torch or querying CUDA is reported as a failed
# GPU check. The "no GPU" exit is raised outside the try so it is clearly
# separate from unexpected errors.
try:
    import torch

    cuda_ready = torch.cuda.is_available()
except Exception as exc:
    raise SystemExit(f"GPU check failed: {exc}") from exc

if not cuda_ready:
    raise SystemExit(
        "GPU not detected. Enable Accelerator (Kaggle) or T4 GPU (Colab)."
    )

print("✓ Setup complete")

In [None]:
# Step 2: Download COCO dataset (~19GB)
from pathlib import Path


data_dir = Path("coco_data")
data_dir.mkdir(exist_ok=True)

if not (data_dir / "train2017").exists():
    print("Downloading images...")
    !wget -q http://images.cocodataset.org/zips/train2017.zip -P coco_data
    !unzip -q coco_data/train2017.zip -d coco_data
    !rm coco_data/train2017.zip

if not (data_dir / "annotations").exists():
    print("Downloading annotations...")
    !wget -q http://images.cocodataset.org/annotations/annotations_trainval2017.zip -P coco_data
    !unzip -q coco_data/annotations_trainval2017.zip -d coco_data
    !rm coco_data/annotations_trainval2017.zip

if not (data_dir / "train2017").exists():
    raise SystemExit("train2017 not found. Check that Internet is ON.")

if not (data_dir / "annotations" / "captions_train2017.json").exists():
    raise SystemExit("captions_train2017.json not found. Check Internet is ON.")

print("✓ Dataset ready!")

In [None]:
# Step 3: Train projector (full COCO)
# Runs train.py from the current working directory (the vision-caption checkout
# after Step 1) via a shell escape, passing the image folder and captions file
# downloaded in Step 2.
# Flags passed: --output-dir checkpoints, --epochs 3, --batch-size 32, --lr 1e-3.
# NOTE(review): Step 4 loads checkpoints/projector_final.pt — presumably
# train.py writes that file into --output-dir; confirm against train.py.
!python train.py --images-dir coco_data/train2017 --annotations-file coco_data/annotations/captions_train2017.json --output-dir checkpoints --epochs 3 --batch-size 32 --lr 1e-3

In [None]:
# Step 4: Test trained model
from vision_caption import ModelConfig, load_model
from PIL import Image
import requests
from io import BytesIO

# Build a default configuration and point it at the projector checkpoint.
# NOTE(review): the checkpoint is presumably produced by Step 3 — confirm.
config = ModelConfig()
config.projector_path = "checkpoints/projector_final.pt"
model = load_model(config)

# Test image
url = "https://images.unsplash.com/photo-1518791841217-8f162f1e1131"
# A timeout stops the cell from hanging on a stalled connection.
# raise_for_status() reports an HTTP error directly, so an error page is not
# handed to PIL, where it would fail with an unrelated decode error.
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

print(f"Caption: {model.generate(image)}")
display(image)

In [None]:
# Step 5: Download trained weights
from pathlib import Path

# Verify the checkpoint exists before reporting anything as downloadable.
# Without this check the non-Colab branch prints a success mark even when
# training never produced the file.
checkpoint = Path("checkpoints/projector_final.pt")
if not checkpoint.is_file():
    raise SystemExit(f"{checkpoint} not found. Run Step 3 (training) first.")

# Only the import decides which platform we are on. Keeping the download out
# of the try block means an error from it is never mistaken for "not on Colab".
try:
    from google.colab import files
except ImportError:
    print("✓ On Kaggle: Click Output tab → Download projector_final.pt")
else:
    files.download('checkpoints/projector_final.pt')
    print("✓ Downloaded (Colab)")