# GeoLocateVLM Training on Colab

This notebook allows you to fine-tune the PaliGemma 3B model for geolocation using QLoRA on a free Google Colab T4 GPU.
It streams **MP16-Pro**, a geotagged subset of the **YFCC100M** dataset, from the Hugging Face Hub (streaming mode, so the full dataset is never downloaded) to train on real geolocation data.

In [None]:
# Install / upgrade the training stack.
# The `!` lines are notebook shell escapes: they only work inside Jupyter/Colab,
# not in a plain Python script.
# transformers, peft and accelerate are installed from the main branch of their
# GitHub repositories (unpinned), with the stated goal of ensuring PaliGemma
# support and avoiding ImportErrors. Because nothing is pinned, two runs of this
# cell on different days may install different code.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets torch torchvision torchaudio

# Reminder for the user: packages that were already imported in this session
# keep their old version until the runtime is restarted.
print("Dependencies installed. PLEASE RESTART RUNTIME if you haven't already!")

In [None]:
from huggingface_hub import login
# Interactive Hugging Face authentication (prompts for an access token).
# The original note asks for a token with write access; presumably write scope
# is only needed if you later push the adapters to the Hub (see the
# commented-out push_to_hub call in the last cell) - TODO confirm.
# NOTE(review): downloading the base model may additionally require accepting
# its license on the Hub with the same account - verify.
login()

In [None]:
import torch
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset

model_id = "google/paligemma-3b-pt-224"

# Choose the 4-bit compute dtype from the GPU actually in use.
# bfloat16 is only executed natively on compute capability >= 8.0 (Ampere and
# newer). The Colab T4 this notebook targets is 7.5, and the training cell runs
# with fp16=True, so float16 is the matching choice there. On newer GPUs the
# original bfloat16 setting is kept.
_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if _native_bf16 else torch.float16

# Load the model in 4-bit NF4 (QLoRA-style quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# The processor bundles the tokenizer and the image preprocessor for model_id.
processor = PaliGemmaProcessor.from_pretrained(model_id)

In [None]:
# Prepare the quantized model for k-bit training, then wrap it with LoRA adapters
from peft import prepare_model_for_kbit_training

# Adapter hyper-parameters: rank 8, alpha 32, attached to every module whose
# name matches one of the projection names listed below.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Training data: 'Jia-py/MP16-Pro' (described by the author as a clean,
# geotagged subset of YFCC100M), read in streaming mode so the huge dataset is
# never downloaded in full. Only the first 2000 streamed samples are kept for
# fine-tuning.
dataset = load_dataset(
    "Jia-py/MP16-Pro",
    split="train",
    streaming=True,
).take(2000)

def format_data(example):
    """Turn one raw record into the image / prompt / target triple used for training.

    Reads the 'jpg', 'lat' and 'lon' fields with ``.get()``, so a missing field
    becomes ``None`` (and a missing coordinate shows up as the text "None" in
    the target string).

    Returns a dict with keys "image", "prompt" and "target", where the target
    is the text "<lat>, <lon>".
    """
    latitude, longitude = example.get("lat"), example.get("lon")
    return {
        "image": example.get("jpg"),
        "prompt": "Where was this photo taken?",
        "target": f"{latitude}, {longitude}",
    }

dataset = dataset.map(format_data)
# Drop records that cannot be trained on: no image, or no coordinates.
# format_data reads its fields with .get(), so a record without 'lat'/'lon'
# would otherwise produce the literal training target "None, None".
# map() is called without remove_columns, so the raw 'lat'/'lon' fields are
# still present on each example at this point.
dataset = dataset.filter(
    lambda x: x["image"] is not None
    and x.get("lat") is not None
    and x.get("lon") is not None
)

def collate_fn(examples):
    """Collate formatted examples into one PaliGemma training batch.

    Args:
        examples: list of dicts with keys "image" (an object exposing
            ``.convert("RGB")``), "prompt" (str) and "target" (str).

    Returns:
        The processor output for the batch, including ``labels``.

    The answer is passed through the processor's ``suffix`` argument instead of
    being concatenated into the prompt. The processor then appends the
    end-of-sequence token to the answer and builds ``labels`` in which every
    non-answer position (image placeholders, prompt, padding) is set to -100,
    so the loss is computed on the answer tokens only. Cloning ``input_ids``
    as labels would also train the model to reproduce the image placeholder,
    prompt and padding tokens, and would never teach it to stop generating.
    """
    images = [example["image"].convert("RGB") for example in examples]
    prompts = [example["prompt"] for example in examples]
    targets = [example["target"] for example in examples]

    return processor(
        text=prompts,
        images=images,
        suffix=targets,
        return_tensors="pt",
        padding="longest",
    )

# Training configuration.
# Effective batch size is 4 * 4 = 16, so max_steps=100 consumes 1600 samples.
args = TrainingArguments(
    output_dir="paligemma_geolocate",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    max_steps=100,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    # Required: with the default (True) the Trainer strips every example field
    # that is not an argument of model.forward() before calling the collator.
    # That would remove "image", "prompt" and "target", and collate_fn would
    # fail with a KeyError.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

trainer.train()

In [None]:
# Save the trained model to the local directory "paligemma_geolocate_adapters".
# NOTE(review): since the model is PEFT-wrapped, this presumably writes only the
# LoRA adapter weights rather than the full base model - confirm.
trainer.save_model("paligemma_geolocate_adapters")
# Optional: upload to the Hugging Face Hub (replace the repository name with your own):
# model.push_to_hub("your-username/geolocate-vlm-adapters")