# Flickr8k Image Captioning Training (Colab Pro)

This notebook facilitates training the `mini-transformer` model on Google Colab.

### Professional Workflow:
1. **Mount Drive:** Saves the trained model (`.pth`) and the dataset zips to Google Drive so progress is not lost and the ~1 GB dataset does not have to be re-downloaded every session.
2. **Clone Repo:** Pulls the latest code from GitHub into the Colab runtime. The setup cell expects the repository to be at `/content/ImageDescript`.
3. **Hybrid Data Load:** Automatically detects if the dataset is on Google Drive; if not, it downloads it once and saves a copy.
4. **Install & Run:** Sets up the environment and starts the training script.

In [2]:
import os
import shutil
import zipfile
from pathlib import Path

from google.colab import drive

# 1. Mount Google Drive (Essential for persistent storage)
# This will simply skip if already mounted.
drive.mount('/content/drive')

# 2. Setup Workspace (Align with GitHub repo structure)
# Ensure we are in the project folder
os.chdir('/content/ImageDescript')
print(f"Current working directory: {os.getcwd()}")

# 3. Create missing __init__.py files (Crucial for -m flag)
!touch src/__init__.py
!touch src/data/__init__.py
!touch src/model/__init__.py
!touch src/train/__init__.py

# 4. Setup Paths for Data Loading
GDRIVE_DATA_DIR = Path('/content/drive/MyDrive/trainingData')
LOCAL_DATA_DIR = Path('/content/ImageDescript/data')

# Create directories if they don't exist
GDRIVE_DATA_DIR.mkdir(parents=True, exist_ok=True)
LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)

print("\nWorkspace setup and Drive mount complete.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Repository already exists. Pulling latest changes...
/content/ImageDescript
fatal: not a git repository (or any of the parent directories): .git


In [None]:
# Move into the repository
os.chdir('/content/ImageDescript')
print(f"Current working directory: {os.getcwd()}")

# 3. Ensure __init__.py files exist (Crucial for -m flag)
# This creates empty __init__.py files if they are missing in your repo
!touch src/__init__.py
!touch src/data/__init__.py
!touch src/model/__init__.py
!touch src/train/__init__.py

# 4. Dynamic Data Loading
GDRIVE_DATA_DIR = Path('/content/drive/MyDrive/trainingData')
LOCAL_DATA_DIR = Path('/content/ImageDescript/data')

GDRIVE_DATA_DIR.mkdir(parents=True, exist_ok=True)
LOCAL_DATA_DIR.mkdir(parents=True, exist_ok=True)

def sync_data():
    zips = ['Flickr8k_Dataset.zip', 'Flickr8k_Text.zip']
    for zip_name in zips:
        gdrive_path = GDRIVE_DATA_DIR / zip_name
        local_zip_path = Path('/content') / zip_name
        if gdrive_path.exists():
            print(f"Found {zip_name} on Google Drive. Copying to local SSD...")
            !cp "{gdrive_path}" "{local_zip_path}"
        else:
            print(f"{zip_name} not found on Drive. Downloading from source...")
            url = f"https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/{zip_name}"
            !wget -O "{local_zip_path}" "{url}"
            print(f"Saving {zip_name} to Google Drive for future use...")
            !cp "{local_zip_path}" "{gdrive_path}"
        print(f"Extracting {zip_name} to local runtime disk...")
        !unzip -q -o "{local_zip_path}" -d "{LOCAL_DATA_DIR}"
        !rm "{local_zip_path}"

    # Handle the known typo in the original image zip folder name
    typo_dir = LOCAL_DATA_DIR / 'Flicker8k_Dataset'
    correct_dir = LOCAL_DATA_DIR / 'Flickr8k_Dataset'
    if typo_dir.exists() and not correct_dir.exists():
        typo_dir.rename(correct_dir)

sync_data()

In [None]:
# 5. Install Dependencies
!pip install spacy tqdm pandas Pillow torch torchvision
!python -m spacy download en_core_web_sm

In [None]:
# 6. Start Training
# Runs the training module as a package module (`-m`), which is why the
# __init__.py files are created in the setup cells.
# PYTHONPATH=. puts the current working directory on the import path so that
# 'src' is found; this relies on the earlier os.chdir into the repository root.
!PYTHONPATH=. python -m src.train.train

In [None]:
# 7. Save trained model back to Google Drive
# The checkpoint path is relative to the current working directory.
checkpoint_path = Path('captioning_model.pth')
backup_dir = Path('/content/drive/MyDrive/savedModels')

if checkpoint_path.exists():
    backup_dir.mkdir(parents=True, exist_ok=True)
    # shutil.copy raises on failure, so the success message is only printed
    # after the copy has actually completed.
    backup_path = shutil.copy(checkpoint_path, backup_dir / checkpoint_path.name)
    print(f"Model checkpoint successfully backed up to Google Drive at {backup_path}")
else:
    # Make a missing checkpoint visible instead of silently doing nothing.
    print(f"{checkpoint_path} not found in {os.getcwd()}; nothing was backed up to Google Drive.")