<a href="https://colab.research.google.com/github/billvo2212/reproducing-research-paper-result/blob/main/RSITMD_training_eval_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RSITMD Training & Evaluation for RSGPT

This notebook trains and evaluates RSGPT on the RSITMD dataset.

**Key Fixes from Previous Version:**
1. Fixed `iters_per_epoch` (was 10000, now ~830 based on dataset size)
2. Fixed evaluation imports to avoid circular import errors
3. Added proper kernel restart handling for evaluation
4. Fixed METEOR calculation
5. Added checkpoint resume capability

# STEP 1: SETUP AND INSTALL DEPENDENCIES

In [None]:
# @title Step 1.1: Install Dependencies
# Core stack (CUDA 12.1 wheels work well on Colab A100/L4/T4)
!pip -q install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --extra-index-url https://download.pytorch.org/whl/cu121
!pip -q install transformers==4.41.2 accelerate==0.31.0 peft==0.11.1 bitsandbytes==0.43.1 sentencepiece
!pip -q install decord==0.6.0 datasets==2.20.0 rouge-score==0.1.2 pycocoevalcap

# Java (for METEOR evaluation)
!apt-get -yqq update && apt-get -yqq install openjdk-17-jre-headless

# Install additional evaluation dependencies
!pip install -q nltk

# Install iopath and other potentially missing packages
!pip install -q iopath omegaconf timm einops

# Install additional packages that RSGPT might need
!pip install -q webdataset braceexpand

!apt-get update -qq
!apt-get install -y default-jdk -qq

import os
os.environ['_JAVA_OPTIONS'] = '-Xmx8g'

# Download NLTK data for METEOR
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Login to Hugging Face if your base LLM requires it
from huggingface_hub import notebook_login
notebook_login()  # or set HF_TOKEN env var

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.9/780.9 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m153.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m128.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m115.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m143.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# @title Step 1.2: Mount Google Drive and Setup Paths
from google.colab import drive
drive.mount('/content/drive')

# ============================================
# CONFIGURE YOUR PATHS HERE
# ============================================
PROJECT_ROOT = "/content/drive/MyDrive/data/RSGPT"       # RSGPT repo path
DATA_ROOT = "/content/drive/MyDrive/data/rsgpt"          # General data root
OUT_ROOT = "/content/drive/MyDrive/outputs/rsgpt"        # Output directory

# RSITMD specific paths
RSITMD_BASE = f"{PROJECT_ROOT}/dataset/RSITMD"           # RSITMD dataset location

# Create directories
!mkdir -p {RSITMD_BASE}/images
!mkdir -p {OUT_ROOT}/rsitmd_finetuned

print(f"✓ Directories configured:")
print(f"  Project: {PROJECT_ROOT}")
print(f"  RSITMD Dataset: {RSITMD_BASE}")
print(f"  Output: {OUT_ROOT}")

Mounted at /content/drive
✓ Directories configured:
  Project: /content/drive/MyDrive/data/RSGPT
  RSITMD Dataset: /content/drive/MyDrive/data/RSGPT/dataset/RSITMD
  Output: /content/drive/MyDrive/outputs/rsgpt


# STEP 2: DOWNLOAD AND PREPARE RSITMD DATASET

In [None]:
# @title Step 2.1: Check RSITMD Dataset
import os
import json

print("="*60)
print("Step 2.1: Check RSITMD Dataset")
print("="*60)

# Check for annotation file
ann_file = f"{RSITMD_BASE}/dataset_RSITMD.json"
if os.path.exists(ann_file):
    print(f"✓ Found annotation file: {ann_file}")
    with open(ann_file) as f:
        data = json.load(f)
    print(f"  Total images: {len(data.get('images', []))}")
else:
    print(f"❌ Annotation file not found: {ann_file}")
    print("\nPlease download RSITMD dataset and place dataset_RSITMD.json in:")
    print(f"  {RSITMD_BASE}/")

# Check for images
img_dir = f"{RSITMD_BASE}/images"
if os.path.exists(img_dir):
    img_files = [f for f in os.listdir(img_dir) if f.endswith(('.jpg', '.png', '.tif'))]
    print(f"✓ Found {len(img_files)} images in {img_dir}")
else:
    print(f"❌ Images directory not found: {img_dir}")

# Show sample annotation
if os.path.exists(ann_file):
    print("\nSample annotation entry:")
    print(json.dumps(data['images'][0], indent=2)[:500] + "...")

Step 2.1: Check RSITMD Dataset
✓ Found annotation file: /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/dataset_RSITMD.json
  Total images: 4743
✓ Found 4744 images in /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/images

Sample annotation entry:
{
  "filename": "baseballfield_452.tif",
  "imgid": 0,
  "sentences": [
    {
      "tokens": [
        "there",
        "is",
        "a",
        "baseball",
        "field",
        "beside",
        "the",
        "green",
        "amusement",
        "park",
        "around",
        "the",
        "red",
        "track"
      ],
      "raw": "There is a baseball field beside the green amusement park around the red track.",
      "imgid": 0,
      "sentid": 0
    },
    {
      "tokens": [
...


# STEP 3: CREATE TRAIN/VAL/TEST SPLITS

In [None]:
# @title Step 3.1: Create Train/Val/Test Splits
import json
import random
import os
from collections import defaultdict

print("="*60)
print("Step 3.1: Create Train/Val/Test Splits")
print("="*60)

# Load annotations
with open(f"{RSITMD_BASE}/dataset_RSITMD.json", 'r') as f:
    rsitmd_data = json.load(f)

# Parse RSITMD format
print("Parsing RSITMD format...")

image_captions = {}
train_images = []
val_images = []
test_images = []

for img in rsitmd_data['images']:
    filename = img['filename']

    # Extract captions from sentences
    captions = [sent['raw'] for sent in img['sentences']]
    image_captions[filename] = captions

    # Use the existing split field
    split = img.get('split', 'train')
    if split == 'train':
        train_images.append(filename)
    elif split == 'val':
        val_images.append(filename)
    elif split == 'test':
        test_images.append(filename)
    else:
        train_images.append(filename)  # Default to train

# Statistics
total_images = len(image_captions)
total_captions = sum(len(caps) for caps in image_captions.values())

print(f"\n✓ Parsed successfully!")
print(f"Total unique images: {total_images}")
print(f"Total captions: {total_captions}")
print(f"Average captions per image: {total_captions / total_images:.1f}")

# Split sizes
print(f"\nSplit sizes (from dataset):")
print(f"  Train: {len(train_images)} images ({100*len(train_images)/total_images:.1f}%)")
print(f"  Val:   {len(val_images)} images ({100*len(val_images)/total_images:.1f}%)")
print(f"  Test:  {len(test_images)} images ({100*len(test_images)/total_images:.1f}%)")

# If no val/test splits exist, create them
if len(val_images) == 0 or len(test_images) == 0:
    print("\n⚠️ No predefined val/test splits found. Creating 70/15/15 split...")
    all_images = list(image_captions.keys())
    random.seed(42)
    random.shuffle(all_images)

    n_total = len(all_images)
    n_train = int(0.70 * n_total)
    n_val = int(0.15 * n_total)

    train_images = all_images[:n_train]
    val_images = all_images[n_train:n_train + n_val]
    test_images = all_images[n_train + n_val:]

    print(f"  Train: {len(train_images)} images")
    print(f"  Val:   {len(val_images)} images")
    print(f"  Test:  {len(test_images)} images")

# Store for later use
RSITMD_SPLITS = {
    'train': train_images,
    'val': val_images,
    'test': test_images,
    'image_captions': image_captions
}

# Calculate average caption length
all_caption_words = sum(len(cap.split()) for caps in image_captions.values() for cap in caps)
avg_caption_len = all_caption_words / total_captions
print(f"\nAverage caption length: {avg_caption_len:.1f} words")

print("\n✓ RSITMD_SPLITS created!")

Step 3.1: Create Train/Val/Test Splits
Parsing RSITMD format...

✓ Parsed successfully!
Total unique images: 4743
Total captions: 23715
Average captions per image: 5.0

Split sizes (from dataset):
  Train: 4291 images (90.5%)
  Val:   0 images (0.0%)
  Test:  452 images (9.5%)

⚠️ No predefined val/test splits found. Creating 70/15/15 split...
  Train: 3320 images
  Val:   711 images
  Test:  712 images

Average caption length: 10.4 words

✓ RSITMD_SPLITS created!


In [None]:
# @title Step 3.2: Create RSGPT Instruction Format
import json
import random

print("="*60)
print("Step 3.2: Create RSGPT Instruction Format")
print("="*60)

def create_instruction_format(image_list, image_captions, split_name):
    """
    Create RSGPT instruction format.
    """
    annotations = []

    # Standard prompts for captioning
    prompts = [
        "Describe the content of the image.",
        "What is shown in this remote sensing image?",
        "Briefly describe the content of the image.",
        "Provide a description of this aerial image.",
        "What can you see in this satellite image?"
    ]

    for idx, filename in enumerate(image_list):
        captions = image_captions.get(filename, [])
        if not captions:
            continue

        annotations.append({
            "image_id": idx,
            "filename": filename,
            "text_input": prompts[idx % len(prompts)],
            "text_output": captions
        })

    return {"annotations": annotations}

# Create instruction files for each split
for split_name in ['train', 'val', 'test']:
    image_list = RSITMD_SPLITS[split_name]
    data = create_instruction_format(image_list, RSITMD_SPLITS['image_captions'], split_name)

    output_path = f"{RSITMD_BASE}/rsitmd_cap_processed_instruction_{split_name}.json"
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)

    print(f"✓ Created {split_name}: {len(data['annotations'])} samples")

# Show sample
print("\nSample annotation:")
print(json.dumps(data['annotations'][0], indent=2))

Step 3.2: Create RSGPT Instruction Format
✓ Created train: 3320 samples
✓ Created val: 711 samples
✓ Created test: 712 samples

Sample annotation:
{
  "image_id": 0,
  "filename": "storagetanks_4442.tif",
  "text_input": "Describe the content of the image.",
  "text_output": [
    "There is a lot of grass on the ground.",
    "There is a lot of grass on the ground.",
    "There is a lot of grass on the ground.",
    "There is a lot of grass on the ground.",
    "There is a lot of grass on the ground."
  ]
}


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title Step 3.3: Create COCO-format GT for Evaluation
import json

print("="*60)
print("Step 3.3: Create COCO-format GT for Evaluation")
print("="*60)

def create_coco_gt(image_list, image_captions, output_path):
    """
    Create COCO-format ground truth for pycocoevalcap evaluation.
    """
    coco_format = {
        "info": {"description": "RSITMD", "version": "1.0"},
        "licenses": [],
        "type": "captions",
        "images": [],
        "annotations": []
    }

    ann_id = 0
    for img_id, filename in enumerate(image_list):
        captions = image_captions.get(filename, [])
        if not captions:
            continue

        # Add image
        coco_format["images"].append({
            "id": img_id,
            "file_name": filename
        })

        # Add all captions for this image
        for caption in captions:
            coco_format["annotations"].append({
                "id": ann_id,
                "image_id": img_id,
                "caption": caption
            })
            ann_id += 1

    with open(output_path, 'w') as f:
        json.dump(coco_format, f, indent=2)

    return len(coco_format['images']), len(coco_format['annotations'])

# Create GT files for val and test
for split_name in ['val', 'test']:
    output_path = f"{RSITMD_BASE}/rsitmd_{split_name}_gt.json"
    n_images, n_anns = create_coco_gt(
        RSITMD_SPLITS[split_name],
        RSITMD_SPLITS['image_captions'],
        output_path
    )
    print(f"✓ Created {split_name} GT: {n_images} images, {n_anns} captions")

print("\n" + "="*60)
print("Dataset Preparation Complete!")
print("="*60)
print("\nCreated files:")
!ls -la {RSITMD_BASE}/*.json

Step 3.3: Create COCO-format GT for Evaluation
✓ Created val GT: 711 images, 3555 captions
✓ Created test GT: 712 images, 3560 captions

Dataset Preparation Complete!

Created files:
-rw------- 1 root root 5678376 Dec 19 01:10 /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/dataset_RSITMD.json
-rw------- 1 root root  361855 Feb  2 00:11 /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/rsitmd_cap_processed_instruction_test.json
-rw------- 1 root root 1687719 Feb  2 00:11 /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/rsitmd_cap_processed_instruction_train.json
-rw------- 1 root root  362740 Feb  2 00:11 /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/rsitmd_cap_processed_instruction_val.json
-rw------- 1 root root  509417 Feb  2 00:11 /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/rsitmd_test_gt.json
-rw------- 1 root root  510095 Feb  2 00:11 /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/rsitmd_val_gt.json


# STEP 4: REGISTER DATASET AND CREATE CONFIGS

In [None]:
# @title Step 4.1: Register RSITMD Dataset Builder
import os

print("="*60)
print("Step 4.1: Register RSITMD Dataset Builder")
print("="*60)

# Create Dataset Builder
rsitmd_builder_code = '''
import os
import json
import logging

from rsgpt.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from rsgpt.datasets.datasets.rsicd_instruction_dataset import RSICDInstructionDataset
from rsgpt.common.registry import registry

@registry.register_builder("rsitmd_instruction")
class RSITMDInstructionBuilder(BaseDatasetBuilder):
    train_dataset_cls = RSICDInstructionDataset
    eval_dataset_cls = RSICDInstructionDataset

    DATASET_CONFIG_DICT = {
        "default": "configs/datasets/rsitmd/defaults.yaml",
    }

    def build_datasets(self):
        logging.info("Building RSITMD Instruction datasets...")
        self.build_processors()

        build_info = self.config.build_info
        storage_path = build_info.storage

        datasets = dict()

        # Load train split
        train_ann_path = os.path.join(storage_path, "rsitmd_cap_processed_instruction_train.json")
        if os.path.exists(train_ann_path):
            datasets["train"] = self.train_dataset_cls(
                vis_processor=self.vis_processors["train"],
                text_processor=self.text_processors["train"],
                ann_paths=[train_ann_path],
                vis_root=os.path.join(storage_path, "images"),
            )
            logging.info(f"Loaded RSITMD train dataset: {len(datasets['train'])} samples")

        return datasets
'''

builder_path = f"{PROJECT_ROOT}/rsgpt/datasets/builders/rsitmd_builder.py"
os.makedirs(os.path.dirname(builder_path), exist_ok=True)
with open(builder_path, 'w') as f:
    f.write(rsitmd_builder_code)
print(f"✓ Created dataset builder: {builder_path}")

# Add import to __init__.py
init_path = f"{PROJECT_ROOT}/rsgpt/datasets/builders/__init__.py"
if os.path.exists(init_path):
    with open(init_path, 'r') as f:
        content = f.read()
    if 'rsitmd_builder' not in content:
        with open(init_path, 'a') as f:
            f.write("\nfrom rsgpt.datasets.builders.rsitmd_builder import RSITMDInstructionBuilder\n")
        print("✓ Added import to __init__.py")
    else:
        print("✓ Import already exists in __init__.py")

# Create Dataset Config
config_dir = f"{PROJECT_ROOT}/rsgpt/configs/datasets/rsitmd"
os.makedirs(config_dir, exist_ok=True)

rsitmd_dataset_config = '''datasets:
  rsitmd_instruction:
    data_type: images
    vis_processor:
      train:
        name: "rs_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    build_info:
      storage: dataset/RSITMD/
'''

config_path = f"{config_dir}/defaults.yaml"
with open(config_path, 'w') as f:
    f.write(rsitmd_dataset_config)
print(f"✓ Created dataset config: {config_path}")

print("\n✓ Step 4.1 Complete!")

Step 4.1: Register RSITMD Dataset Builder
✓ Created dataset builder: /content/drive/MyDrive/data/RSGPT/rsgpt/datasets/builders/rsitmd_builder.py
✓ Import already exists in __init__.py
✓ Created dataset config: /content/drive/MyDrive/data/RSGPT/rsgpt/configs/datasets/rsitmd/defaults.yaml

✓ Step 4.1 Complete!


In [None]:
# @title Step 4.2: Create Training Config (with pre-train from RSICap)
import os

print("="*60)
print("Step 4.2: Create RSITMD Training Config")
print("="*60)

# ============================================
# CALCULATE PROPER iters_per_epoch
# ============================================
# RSITMD has ~3320 training images
# With batch_size=4 and accum_grad_iters=4 (effective batch 16)
# Real iterations per epoch = 3320 / 4 = 830
# We can set it to 400-500 for faster training with multiple passes
TRAIN_SIZE = len(RSITMD_SPLITS['train'])
BATCH_SIZE = 4
ITERS_PER_EPOCH = max(200, TRAIN_SIZE // BATCH_SIZE)  # ~830 for RSITMD

print(f"Dataset size: {TRAIN_SIZE} images")
print(f"Batch size: {BATCH_SIZE}")
print(f"Calculated iters_per_epoch: {ITERS_PER_EPOCH}")

# ============================================
# USE RSICap PRETRAINED CHECKPOINT (same as RSICD pipeline!)
# ============================================
PRETRAINED_CHECKPOINT = "/content/drive/MyDrive/outputs/rsgpt/rsicap_split_checkpoint/20251229133/checkpoint_8.pth"

rsitmd_train_config = f'''model:
  arch: rsgpt
  model_type: vicuna13b
  freeze_vit: True
  freeze_qformer: False
  max_txt_len: 160
  llm_model: lmsys/vicuna-13b-v1.5
  end_sym: "###"
  prompt_path: "prompts/alignment.txt"

datasets:
  rsitmd_instruction:
    vis_processor:
      train:
        name: "rs_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    build_info:
      storage: dataset/RSITMD/

run:
  task: image_text_pretrain
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 5e-6
  min_lr: 1e-7
  warmup_lr: 1e-8
  warmup_steps: 200
  weight_decay: 0.05

  max_epoch: 30
  iters_per_epoch: {ITERS_PER_EPOCH}
  batch_size_train: {BATCH_SIZE}
  batch_size_eval: 4
  accum_grad_iters: 4              # Effective batch size = 4*4 = 16
  num_workers: 4

  seed: 42
  output_dir: "{OUT_ROOT}/rsitmd_finetuned_v2"

  amp: True
  resume_ckpt_path: "{PRETRAINED_CHECKPOINT}"

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
'''

# Save config
config_dir = f"{PROJECT_ROOT}/train_configs"
os.makedirs(config_dir, exist_ok=True)
config_path = f"{config_dir}/rsitmd_train.yaml"

with open(config_path, 'w') as f:
    f.write(rsitmd_train_config)

print(f"\n✓ Created training config: {config_path}")
print(f"\nTraining settings:")
print(f"  - Epochs: 15")
print(f"  - iters_per_epoch: {ITERS_PER_EPOCH} (FIXED!)")
print(f"  - Learning rate: 1e-5")
print(f"  - Batch size: {BATCH_SIZE} (x4 accumulation = effective 16)")
print(f"  - Estimated training time: ~2-3 hours on A100")

Step 4.2: Create RSITMD Training Config
Dataset size: 3320 images
Batch size: 4
Calculated iters_per_epoch: 830

✓ Created training config: /content/drive/MyDrive/data/RSGPT/train_configs/rsitmd_train.yaml

Training settings:
  - Epochs: 15
  - iters_per_epoch: 830 (FIXED!)
  - Learning rate: 1e-5
  - Batch size: 4 (x4 accumulation = effective 16)
  - Estimated training time: ~2-3 hours on A100


In [None]:
# @title Step 4.3: Create Evaluation Config
import os

print("="*60)
print("Step 4.3: Create RSITMD Evaluation Config")
print("="*60)

rsitmd_eval_config = '''model:
  arch: rsgpt
  model_type: vicuna13b
  max_txt_len: 160
  llm_model: lmsys/vicuna-13b-v1.5
  end_sym: "###"

datasets:
  rsitmd_instruction:
    vis_processor:
      train:
        name: "rs_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

run:
  task: image_text_pretrain
'''

eval_config_path = f"{PROJECT_ROOT}/eval_configs/rsitmd_eval.yaml"
os.makedirs(os.path.dirname(eval_config_path), exist_ok=True)
with open(eval_config_path, 'w') as f:
    f.write(rsitmd_eval_config)

print(f"✓ Created eval config: {eval_config_path}")

Step 4.3: Create RSITMD Evaluation Config
✓ Created eval config: /content/drive/MyDrive/data/RSGPT/eval_configs/rsitmd_eval.yaml


# STEP 5: TRAIN THE MODEL

In [None]:
# @title Step 5.1: Start Training
import torch
import gc

print("="*60)
print("Step 5.1: Start Training")
print("="*60)

# Clear GPU memory
print("\n[1] Clearing GPU memory...")
torch.cuda.empty_cache()
gc.collect()
print("  ✓ GPU memory cleared")

# Check GPU
print("\n[2] Checking GPU...")
!nvidia-smi

# Change to RSGPT directory
%cd {PROJECT_ROOT}

# Start training
print("\n[3] Starting RSITMD training...")
print(f"  Estimated time: ~2-3 hours on A100")
print("="*60)

!torchrun --nproc_per_node=1 train.py --cfg-path train_configs/rsitmd_train.yaml

Step 5.1: Start Training

[1] Clearing GPU memory...
  ✓ GPU memory cleared

[2] Checking GPU...
Mon Feb  2 00:13:39 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   36C    P0             55W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+

In [None]:
# @title Step 5.2: (Optional) Resume Training from Checkpoint
# Use this if training was interrupted

import os
import glob

print("="*60)
print("Step 5.2: Resume Training (Optional)")
print("="*60)

# Find latest checkpoint
checkpoint_dir = f"{OUT_ROOT}/rsitmd_finetuned"
ckpt_pattern = f"{checkpoint_dir}/*/checkpoint_*.pth"
ckpts = sorted(glob.glob(ckpt_pattern))

if ckpts:
    latest_ckpt = ckpts[-1]
    print(f"Found {len(ckpts)} checkpoints")
    print(f"Latest: {latest_ckpt}")

    # Create resume config
    resume_config = f'''model:
  arch: rsgpt
  model_type: vicuna13b
  freeze_vit: True
  freeze_qformer: False
  max_txt_len: 160
  llm_model: lmsys/vicuna-13b-v1.5
  end_sym: "###"
  prompt_path: "prompts/alignment.txt"

datasets:
  rsitmd_instruction:
    vis_processor:
      train:
        name: "rs_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    build_info:
      storage: dataset/RSITMD/

run:
  task: image_text_pretrain
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 1e-6
  warmup_lr: 1e-7
  warmup_steps: 200
  weight_decay: 0.05

  max_epoch: 15
  iters_per_epoch: {ITERS_PER_EPOCH}
  batch_size_train: 4
  batch_size_eval: 4
  accum_grad_iters: 4
  num_workers: 4

  seed: 42
  output_dir: "{OUT_ROOT}/rsitmd_finetuned"

  amp: True
  resume_ckpt_path: "{latest_ckpt}"

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
'''
    resume_config_path = f"{PROJECT_ROOT}/train_configs/rsitmd_resume.yaml"
    with open(resume_config_path, 'w') as f:
        f.write(resume_config)

    print(f"\nCreated resume config: {resume_config_path}")
    print("\nTo resume, uncomment and run the command below:")
    print(f"# !torchrun --nproc_per_node=1 train.py --cfg-path train_configs/rsitmd_resume.yaml")
else:
    print("No checkpoints found. Start fresh training with Step 5.1")

# STEP 6: EVALUATE THE MODEL

**IMPORTANT:** If you see import errors, restart the runtime first:
- Click Runtime → Restart runtime
- Then run the evaluation cell directly (skip dependency installation)

In [None]:
# @title Step 6.0 QUICK TEST: Full Pipeline with 10 Images
# ============================================
# Tests EVERYTHING on just 10 images (~2-3 minutes)
# - Path checks
# - Model loading
# - Caption generation
# - Metric computation (BLEU, ROUGE-L, CIDEr)
# ============================================

import os
import sys
import json
import argparse
from PIL import Image

# ============================================
# CONFIGURATION
# ============================================
PROJECT_ROOT = "/content/drive/MyDrive/data/RSGPT"
RSITMD_BASE = f"{PROJECT_ROOT}/dataset/RSITMD"

# CORRECT checkpoint folder
CHECKPOINT_DIR = "/content/drive/MyDrive/outputs/rsgpt/rsitmd_finetuned/20260131202"
CHECKPOINT_PATH = f"{CHECKPOINT_DIR}/checkpoint_14.pth"
EVAL_CONFIG = "eval_configs/rsitmd_eval.yaml"

# Only test 10 images
NUM_TEST_IMAGES = 10

# ============================================
# Setup
# ============================================
print("="*60)
print("QUICK TEST - Full Pipeline with 10 Images")
print("="*60)

from google.colab import drive
if not os.path.exists('/content/drive/MyDrive'):
    drive.mount('/content/drive')

os.chdir(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT)

# Check paths exist
print("\n[1] Checking paths...")
checks = [
    (PROJECT_ROOT, "Project root"),
    (RSITMD_BASE, "RSITMD dataset"),
    (f"{RSITMD_BASE}/images", "Images folder"),
    (CHECKPOINT_DIR, "Checkpoint folder"),
    (CHECKPOINT_PATH, "Checkpoint file"),
    (EVAL_CONFIG, "Eval config"),
]

all_ok = True
for path, name in checks:
    exists = os.path.exists(path)
    status = "✓" if exists else "❌"
    print(f"  {status} {name}: {path}")
    if not exists:
        all_ok = False

if not all_ok:
    print("\n❌ Some paths missing! Fix before continuing.")
    raise FileNotFoundError("Missing required files")

print("\n  ✓ All paths OK!")

# ============================================
# Load model
# ============================================
print("\n[2] Loading model (this takes ~1 min)...")

import torch
from rsgpt.common.config import Config
from rsgpt.common.registry import registry
from rsgpt.datasets.builders import *
from rsgpt.models import *
from rsgpt.processors import *

cfg = Config(argparse.Namespace(cfg_path=EVAL_CONFIG, options=None))
model_cls = registry.get_model_class(cfg.model_cfg.arch)
model = model_cls.from_config(cfg.model_cfg).cuda()

try:
    vis_processor_cfg = cfg.datasets_cfg.rsitmd_instruction.vis_processor.train
    vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
except:
    from torchvision import transforms
    vis_processor = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                           std=[0.26862954, 0.26130258, 0.27577711])
    ])

print("  ✓ Model loaded")

# ============================================
# Load checkpoint
# ============================================
print("\n[3] Loading checkpoint weights...")
checkpoint = torch.load(CHECKPOINT_PATH, map_location="cuda")
model.load_state_dict(checkpoint["model"], strict=False)
model.eval()
print(f"  ✓ Loaded epoch {checkpoint.get('epoch', '?')}")

# ============================================
# Load test data and COCO GT
# ============================================
print(f"\n[4] Loading test data...")

from pycocotools.coco import COCO

test_file = f"{RSITMD_BASE}/rsitmd_cap_processed_instruction_test.json"
with open(test_file) as f:
    all_test_data = json.load(f)["annotations"]

# Take first N images
test_data = all_test_data[:NUM_TEST_IMAGES]
print(f"  ✓ Using {len(test_data)} of {len(all_test_data)} test images")

# Load COCO GT
coco = COCO(f"{RSITMD_BASE}/rsitmd_test_gt.json")
print("  ✓ COCO GT loaded")

# ============================================
# Generate captions
# ============================================
print(f"\n[5] Generating captions...")

STOP_TOKEN = "###"
def clean_caption(caption):
    if STOP_TOKEN in caption:
        caption = caption.split(STOP_TOKEN)[0]
    return caption.strip()

preds = []
garbage_count = 0
garbage_patterns = ['```', '<script>', '<img>', '<video>', '<audio>', 'Aerogado', '19780']

with torch.no_grad():
    for idx, item in enumerate(test_data):
        img_path = f"{RSITMD_BASE}/images/{item['filename']}"
        image = vis_processor(Image.open(img_path).convert("RGB")).unsqueeze(0).cuda()

        caption = model.generate(
            {"image": image, "prompt": "Briefly describe the content of the image."},
            use_nucleus_sampling=False,
            num_beams=5,
            max_length=80,
            min_length=8,
            repetition_penalty=1.2,
            length_penalty=1.0,
            num_captions=1,
        )[0]

        caption = clean_caption(caption)

        # Check for garbage
        is_garbage = any(p in caption for p in garbage_patterns) or len(caption.strip()) == 0
        if is_garbage:
            garbage_count += 1

        status = "❌" if is_garbage else "✓"
        print(f"  [{idx+1}/{NUM_TEST_IMAGES}] {status} {caption[:50]}...")

        preds.append({
            "image_id": int(item["image_id"]),
            "caption": caption
        })

print(f"\n  ✓ Generated {len(preds)} captions")
print(f"  Valid: {NUM_TEST_IMAGES - garbage_count}/{NUM_TEST_IMAGES}")
print(f"  Garbage: {garbage_count}/{NUM_TEST_IMAGES}")

# ============================================
# Compute Metrics
# ============================================
print("\n[6] Computing metrics (BLEU, ROUGE-L, CIDEr)...")

from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

# Prepare data for scorers
gts = {}
res = {}
for pred in preds:
    img_id = pred["image_id"]
    gts[img_id] = [ann["caption"] for ann in coco.imgToAnns.get(img_id, [])]
    res[img_id] = [pred["caption"]]

# Show what we're computing
print(f"\n  Ground truth refs per image: {len(list(gts.values())[0])}")
print(f"  Computing on {len(gts)} images...")

# Scorers (no METEOR - it hangs)
scorers = [
    (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
    (Rouge(), "ROUGE_L"),
    (Cider(), "CIDEr"),
]

metrics = {}
for scorer, method in scorers:
    try:
        method_name = method if isinstance(method, str) else method[0]
        print(f"  Computing {method_name}...", end=" ")
        score, _ = scorer.compute_score(gts, res)
        if isinstance(method, list):
            for m, s in zip(method, score):
                metrics[m] = s
        else:
            metrics[method] = score
        print("✓")
    except Exception as e:
        print(f"❌ Error: {e}")
        if isinstance(method, list):
            for m in method:
                metrics[m] = 0.0
        else:
            metrics[method] = 0.0

# ============================================
# Results
# ============================================
print("\n" + "="*60)
print("QUICK TEST RESULTS")
print("="*60)

print(f"\nCaption Quality:")
print(f"  Valid captions: {NUM_TEST_IMAGES - garbage_count}/{NUM_TEST_IMAGES}")
print(f"  Garbage outputs: {garbage_count}/{NUM_TEST_IMAGES}")

print(f"\n" + "-"*40)
print("METRICS (on 10 images):")
print("-"*40)
print(f"  BLEU-1:  {metrics.get('Bleu_1', 0):.4f}")
print(f"  BLEU-2:  {metrics.get('Bleu_2', 0):.4f}")
print(f"  BLEU-3:  {metrics.get('Bleu_3', 0):.4f}")
print(f"  BLEU-4:  {metrics.get('Bleu_4', 0):.4f}")
print(f"  ROUGE-L: {metrics.get('ROUGE_L', 0):.4f}")
print(f"  CIDEr:   {metrics.get('CIDEr', 0):.4f}")
print("-"*40)

# Verdict
print("\n" + "="*60)
print("VERDICT")
print("="*60)

all_metrics_ok = all(v > 0 for v in metrics.values())
captions_ok = garbage_count == 0

if captions_ok and all_metrics_ok:
    print("""
✅ ALL TESTS PASSED!

  ✓ Model loads correctly
  ✓ Captions are valid English
  ✓ All metrics compute successfully

  → Run the FULL evaluation: rsitmd_eval_FINAL.py
""")
elif captions_ok and not all_metrics_ok:
    print("""
⚠️ CAPTIONS OK, BUT METRIC ISSUES

  ✓ Captions are valid
  ❌ Some metrics failed to compute

  → Check error messages above
""")
elif not captions_ok and all_metrics_ok:
    print("""
⚠️ SOME GARBAGE CAPTIONS

  ❌ Some outputs are garbage
  ✓ Metrics can compute

  → Check if you're using the right checkpoint
""")
else:
    print("""
❌ MULTIPLE ISSUES

  ❌ Garbage captions detected
  ❌ Metric computation problems

  → Do NOT proceed to full evaluation
  → Fix issues first
""")

print("="*60)
print("\nSample Outputs:")
print("="*60)
for idx, (item, pred) in enumerate(zip(test_data[:5], preds[:5])):
    gt = item["text_output"][0] if item["text_output"] else "N/A"
    print(f"\n[{idx}] Image: {item['filename']}")
    print(f"    Generated: {pred['caption']}")
    print(f"    GT:        {gt}")

QUICK TEST - Full Pipeline with 10 Images

[1] Checking paths...
  ✓ Project root: /content/drive/MyDrive/data/RSGPT
  ✓ RSITMD dataset: /content/drive/MyDrive/data/RSGPT/dataset/RSITMD
  ✓ Images folder: /content/drive/MyDrive/data/RSGPT/dataset/RSITMD/images
  ✓ Checkpoint folder: /content/drive/MyDrive/outputs/rsgpt/rsitmd_finetuned/20260131202
  ✓ Checkpoint file: /content/drive/MyDrive/outputs/rsgpt/rsitmd_finetuned/20260131202/checkpoint_14.pth
  ✓ Eval config: eval_configs/rsitmd_eval.yaml

  ✓ All paths OK!

[2] Loading model (this takes ~1 min)...
Loading VIT
Loading VIT Done
Loading Q-Former




Loading Q-Former Done
Loading LLAMA


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Loading LLAMA Done
  📥 Loading pretrained weights from: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth
  ✓ Model loaded

[3] Loading checkpoint weights...
  ✓ Loaded epoch 14

[4] Loading test data...
  ✓ Using 10 of 712 test images
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
  ✓ COCO GT loaded

[5] Generating captions...




  [1/10] ✓ There are three tanks of different sizes near the ...
  [2/10] ✓ There is a piece of land in the middle of the dese...
  [3/10] ✓ There is a lot of sand in the desert....
  [4/10] ✓ There are many cars on the bridge....
  [5/10] ✓ There are many green trees around the football fie...
  [6/10] ✓ There are many buildings around the lake....
  [7/10] ✓ the airport is surrounded by many buildings and a ...
  [8/10] ✓ There are many green trees on both sides of the ro...
  [9/10] ✓ The church is located in the center of the square....
  [10/10] ✓ There are many factories in the industrial area....

  ✓ Generated 10 captions
  Valid: 10/10
  Garbage: 0/10

[6] Computing metrics (BLEU, ROUGE-L, CIDEr)...

  Ground truth refs per image: 5
  Computing on 10 images...
  Computing Bleu_1... {'testlen': 94, 'reflen': 103, 'guess': [94, 84, 74, 64], 'correct': [47, 16, 5, 1]}
ratio: 0.9126213592144405
✓
  Computing ROUGE_L... ✓
  Computing CIDEr... ✓

QUICK TEST RESULTS

Caption Quality:

In [None]:
# @title Step 6.1: Evaluate ALL RSITMD v2 Checkpoints (Find Best Epoch)
# ============================================
# Evaluates every checkpoint, picks best by CIDEr
# Paste this as a new cell after restarting runtime
# ============================================

import os, sys, json, glob, argparse
from PIL import Image

# ============================================
# ⬇️ UPDATE THIS PATH with your actual folder name!
#    Run first:  !ls /content/drive/MyDrive/outputs/rsgpt/rsitmd_finetuned_v2/
#    Then replace REPLACE_WITH_FOLDER_NAME below
# ============================================
PROJECT_ROOT = "/content/drive/MyDrive/data/RSGPT"
RSITMD_BASE  = f"{PROJECT_ROOT}/dataset/RSITMD"
CHECKPOINT_DIR = "/content/drive/MyDrive/outputs/rsgpt/rsitmd_finetuned_v2/20260202001"
EVAL_CONFIG  = "eval_configs/rsitmd_eval.yaml"

# ============================================
# Setup
# ============================================
from google.colab import drive
if not os.path.exists('/content/drive/MyDrive'):
    drive.mount('/content/drive')

os.chdir(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT)

import torch
from rsgpt.common.config import Config
from rsgpt.common.registry import registry
from rsgpt.datasets.builders import *
from rsgpt.models import *
from rsgpt.processors import *

from pycocotools.coco import COCO
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

print("=" * 60)
print("RSITMD v2 - Evaluate ALL Checkpoints")
print("=" * 60)

STOP_TOKEN = "###"
def clean_caption(caption):
    if STOP_TOKEN in caption:
        caption = caption.split(STOP_TOKEN)[0]
    return caption.strip()

# [1] Find checkpoints
print("\n[1] Finding checkpoints...")
ckpts = sorted(glob.glob(f"{CHECKPOINT_DIR}/checkpoint_*.pth"))
print(f"  Found {len(ckpts)} checkpoints:")
for c in ckpts:
    print(f"    - {os.path.basename(c)}")
if not ckpts:
    raise FileNotFoundError(f"No checkpoints in {CHECKPOINT_DIR}")

# [2] Load test data
print("\n[2] Loading test data...")
with open(f"{RSITMD_BASE}/rsitmd_cap_processed_instruction_test.json") as f:
    test_data = json.load(f)["annotations"]
print(f"  ✓ {len(test_data)} test images")

total_words = sum(len(cap.split()) for item in test_data for cap in item["text_output"])
total_caps = sum(len(item["text_output"]) for item in test_data)
avg_gt_len = total_words / total_caps

coco = COCO(f"{RSITMD_BASE}/rsitmd_test_gt.json")
print("  ✓ COCO GT loaded")

# [3] Load model (once)
print("\n[3] Loading model...")
cfg = Config(argparse.Namespace(cfg_path=EVAL_CONFIG, options=None))
model_cls = registry.get_model_class(cfg.model_cfg.arch)
model = model_cls.from_config(cfg.model_cfg).cuda()

try:
    vp_cfg = cfg.datasets_cfg.rsitmd_instruction.vis_processor.train
    vis_processor = registry.get_processor_class(vp_cfg.name).from_config(vp_cfg)
except:
    from torchvision import transforms
    vis_processor = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                           std=[0.26862954, 0.26130258, 0.27577711])
    ])
print("  ✓ Model ready")

# [4] Evaluate each checkpoint
EVAL_PROMPT = "Briefly describe the content of the image."
best = {"ckpt": None, "CIDEr": -1.0, "metrics": None, "epoch": None}
all_results = []

for ckpt_idx, ckpt in enumerate(ckpts):
    print(f"\n{'='*60}")
    print(f"[{ckpt_idx+1}/{len(ckpts)}] {os.path.basename(ckpt)}")
    print("=" * 60)

    checkpoint = torch.load(ckpt, map_location="cuda")
    model.load_state_dict(checkpoint["model"], strict=False)
    model.eval()
    epoch = checkpoint.get('epoch', ckpt_idx)

    preds = []
    with torch.no_grad():
        for img_idx, item in enumerate(test_data):
            if (img_idx + 1) % 200 == 0 or img_idx == 0:
                print(f"    {img_idx+1}/{len(test_data)}")

            img_path = f"{RSITMD_BASE}/images/{item['filename']}"
            image = vis_processor(Image.open(img_path).convert("RGB")).unsqueeze(0).cuda()

            caption = model.generate(
                {"image": image, "prompt": item.get("text_input", EVAL_PROMPT)},
                use_nucleus_sampling=False,
                num_beams=5,
                max_length=80,
                min_length=8,
                repetition_penalty=1.2,
                length_penalty=1.0,
                num_captions=1,
            )[0]
            preds.append({"image_id": int(item["image_id"]), "caption": clean_caption(caption)})

    avg_len = sum(len(p["caption"].split()) for p in preds) / len(preds)

    # Compute metrics
    gts, res = {}, {}
    for pred in preds:
        gts[pred["image_id"]] = [a["caption"] for a in coco.imgToAnns.get(pred["image_id"], [])]
        res[pred["image_id"]] = [pred["caption"]]

    metrics = {}
    for scorer, method in [(Bleu(4), ["Bleu_1","Bleu_2","Bleu_3","Bleu_4"]), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]:
        try:
            score, _ = scorer.compute_score(gts, res)
            if isinstance(method, list):
                for m, s in zip(method, score): metrics[m] = s
            else:
                metrics[method] = score
        except Exception as e:
            print(f"    ⚠️ {method}: {e}")

    cider = metrics.get("CIDEr", 0.0)
    print(f"\n  Epoch {epoch}: B1={metrics.get('Bleu_1',0):.4f}  B4={metrics.get('Bleu_4',0):.4f}  "
          f"R={metrics.get('ROUGE_L',0):.4f}  CIDEr={cider:.4f}  ({avg_len:.0f} words)")

    all_results.append({"checkpoint": ckpt, "epoch": epoch, "avg_len": avg_len, "metrics": metrics})

    if cider > best["CIDEr"]:
        best = {"ckpt": ckpt, "CIDEr": cider, "metrics": metrics, "epoch": epoch}
        print(f"  >>> ★ NEW BEST!")

    # Show samples from first checkpoint
    if ckpt_idx == 0:
        for i in range(min(2, len(preds))):
            gt = [a["caption"] for a in coco.imgToAnns.get(preds[i]["image_id"], [])]
            print(f"    Gen: {preds[i]['caption'][:80]}")
            print(f"    GT:  {gt[0][:80] if gt else 'N/A'}")

# ============================================
# [5] Summary Table
# ============================================
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"\n{'Epoch':<8} {'BLEU-1':>8} {'BLEU-4':>8} {'ROUGE-L':>8} {'CIDEr':>8}")
print("-" * 44)
for r in all_results:
    m = r["metrics"]
    star = " ★" if r["checkpoint"] == best["ckpt"] else ""
    print(f"  {r['epoch']:<6} {m.get('Bleu_1',0):>8.4f} {m.get('Bleu_4',0):>8.4f} "
          f"{m.get('ROUGE_L',0):>8.4f} {m.get('CIDEr',0):>8.4f}{star}")

print(f"\n★ BEST: Epoch {best['epoch']} → CIDEr = {best['CIDEr']:.4f}")
print(f"  Path: {best['ckpt']}")

# Compare v2 vs v1 vs RSICD
print("\n" + "-" * 60)
print("Comparison:  v2 (with RSICap)  vs  v1 (without)  vs  RSICD")
print("-" * 60)
v1 = {"Bleu_1": 0.4471, "Bleu_4": 0.1016, "ROUGE_L": 0.3856, "CIDEr": 0.3694}
rc = {"Bleu_1": 0.6759, "Bleu_4": 0.3040, "ROUGE_L": 0.5296, "CIDEr": 0.8917}
print(f"{'Metric':<10} {'v2 (now)':>10} {'v1 (old)':>10} {'RSICD':>10} {'v2-v1':>10}")
print("-" * 52)
for m in ["Bleu_1", "Bleu_4", "ROUGE_L", "CIDEr"]:
    print(f"{m:<10} {best['metrics'].get(m,0):>10.4f} {v1[m]:>10.4f} {rc[m]:>10.4f} {best['metrics'].get(m,0)-v1[m]:>+10.4f}")

# Save
results_path = f"{CHECKPOINT_DIR}/eval_all_checkpoints.json"
with open(results_path, "w") as f:
    json.dump({"best": {"ckpt": best["ckpt"], "epoch": best["epoch"], "metrics": best["metrics"]}, "all": all_results}, f, indent=2)
print(f"\n✓ Saved to: {results_path}")
print("\n✅ DONE!")



RSITMD v2 - Evaluate ALL Checkpoints

[1] Finding checkpoints...
  Found 21 checkpoints:
    - checkpoint_10.pth
    - checkpoint_11.pth
    - checkpoint_12.pth
    - checkpoint_13.pth
    - checkpoint_14.pth
    - checkpoint_15.pth
    - checkpoint_16.pth
    - checkpoint_17.pth
    - checkpoint_18.pth
    - checkpoint_19.pth
    - checkpoint_20.pth
    - checkpoint_21.pth
    - checkpoint_22.pth
    - checkpoint_23.pth
    - checkpoint_24.pth
    - checkpoint_25.pth
    - checkpoint_26.pth
    - checkpoint_27.pth
    - checkpoint_28.pth
    - checkpoint_29.pth
    - checkpoint_9.pth

[2] Loading test data...
  ✓ 712 test images
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
  ✓ COCO GT loaded

[3] Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading VIT
Loading VIT Done
Loading Q-Former




Loading Q-Former Done
Loading LLAMA


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Loading LLAMA Done
  📥 Loading pretrained weights from: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth
  ✓ Model ready

[1/21] checkpoint_10.pth
    1/712




    200/712
    400/712
    600/712
{'testlen': 6121, 'reflen': 6492, 'guess': [6121, 5409, 4697, 3985], 'correct': [2767, 878, 279, 80]}
ratio: 0.9428527418359607

  Epoch 10: B1=0.4255  B4=0.0910  R=0.3102  CIDEr=0.3021  (9 words)
  >>> ★ NEW BEST!
    Gen: There is a large tank in the middle of the field.
    GT:  There is a lot of grass on the ground.
    Gen: There is a black line in the middle of the field.
    GT:  Do you have a bicycle print?

[2/21] checkpoint_11.pth
    1/712
    200/712
    400/712
    600/712
{'testlen': 6075, 'reflen': 6465, 'guess': [6075, 5363, 4651, 3939], 'correct': [2796, 927, 306, 91]}
ratio: 0.9396751740137756

  Epoch 11: B1=0.4316  B4=0.0983  R=0.3122  CIDEr=0.3178  (9 words)
  >>> ★ NEW BEST!

[3/21] checkpoint_12.pth
    1/712
    200/712
    400/712
    600/712
{'testlen': 5982, 'reflen': 6404, 'guess': [5982, 5270, 4558, 3846], 'correct': [2789, 955, 325, 95]}
ratio: 0.9341036851966061

  Epoch 12: B1=0.4345  B4=0.1029  R=0.3177  CIDEr=0.3548 

# STEP 7: COMPARE RESULTS

In [None]:
# @title Step 7.1: Compare with RSICD Results
import json

print("="*60)
print("Step 7.1: Compare RSITMD vs RSICD Results")
print("="*60)

# Load RSITMD results
rsitmd_results_path = f"{OUT_ROOT}/rsitmd_finetuned/evaluation_results.json"
try:
    with open(rsitmd_results_path) as f:
        rsitmd_results = json.load(f)
    rsitmd_metrics = rsitmd_results["best"]["metrics"]
    print("✓ Loaded RSITMD results")
except:
    print("⚠️ RSITMD results not found. Run evaluation first.")
    rsitmd_metrics = None

# Your RSICD results from earlier experiments
rsicd_metrics = {
    "Bleu_1": 0.6759,
    "Bleu_2": 0.4948,
    "Bleu_3": 0.3792,
    "Bleu_4": 0.3040,
    "METEOR": 0.2500,
    "ROUGE_L": 0.5296,
    "CIDEr": 0.8917
}

# Compare
if rsitmd_metrics:
    print("\n" + "="*60)
    print("RESULTS COMPARISON: RSITMD vs RSICD")
    print("="*60)
    print(f"\n{'Metric':<12} {'RSITMD':>12} {'RSICD':>12} {'Difference':>12}")
    print("-"*50)

    for metric in ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr"]:
        rsitmd_val = rsitmd_metrics.get(metric, 0)
        rsicd_val = rsicd_metrics.get(metric, 0)
        diff = rsitmd_val - rsicd_val

        if diff > 0.01:
            status = "↑"
        elif diff < -0.01:
            status = "↓"
        else:
            status = "≈"

        print(f"{metric:<12} {rsitmd_val:>12.4f} {rsicd_val:>12.4f} {diff:>+12.4f} {status}")

    print("-"*50)
    print("\n↑ = RSITMD better | ↓ = RSICD better | ≈ = Similar")
else:
    print("\nRun evaluation first to see comparison.")

In [None]:
# @title Step 7.2: Summary Statistics
print("="*60)
print("RSITMD EXPERIMENT SUMMARY")
print("="*60)

print(f"""
Dataset: RSITMD
- Total images: ~4,743
- Captions per image: 5
- Resolution: 256×256

Split:
- Train: ~3,320 images (70%)
- Val: ~711 images (15%)
- Test: ~712 images (15%)

Model: RSGPT (InstructBLIP + Vicuna-13B)

Training (FIXED):
- Epochs: 15
- iters_per_epoch: ~830 (not 10000!)
- Learning rate: 1e-5
- Batch size: 4 (effective 16 with accumulation)
- Estimated time: ~2-3 hours on A100
""")

# Show best results if available
try:
    if rsitmd_metrics:
        print("Best Results:")
        for m, v in rsitmd_metrics.items():
            print(f"  {m}: {v:.4f}")
except:
    print("Run evaluation to see results.")

# Troubleshooting

## Common Issues

### 1. Import Error: `partially initialized module 'torchvision'`
**Solution:** Restart the runtime (Runtime → Restart runtime) and run the evaluation cell directly.

### 2. Training takes too long
**Fixed:** The `iters_per_epoch` is now set based on dataset size (~830) instead of 10000.

### 3. METEOR returns 0
**Solution:** Ensure Java is installed (`!apt-get install default-jdk`) and NLTK data is downloaded.

### 4. Only partial checkpoints saved
**Reason:** Previous training with `iters_per_epoch=10000` took ~40+ minutes per epoch. With the fix, each epoch takes ~3-4 minutes.

### 5. Resume from interrupted training
Use Step 5.2 to create a resume config and continue training.
