# 🌍 GeoViT: A Convolutional-Transformer Model for Geolocation Estimation

Welcome to the GeoViT project notebook!

This notebook presents the training, evaluation, and experimentation pipeline for **GeoViT**, a neural network model designed to **predict geographic locations from Google Street View images**. The model takes inspiration from the popular game *Geoguessr* and is trained using the [OpenStreetView-5M dataset](https://huggingface.co/datasets/osv5m/osv5m).

🖊️ Authors: Alan Tran and Caleb Wolf

---

## 📌 Project Goals

1. **Train** a hybrid convolutional-transformer model that can learn geospatial patterns from street-level imagery.
2. **Evaluate** the model using geodesic distance-based metrics.
3. **Experiment** with:
   - Vision Transformer ablations (layers & attention heads)
   - Robustness to reduced image context (square vs 3:2 aspect ratio)

---

## 🧠 Model Overview

- **Convolutional Frontend:** Captures local texture and object-level features.
- **Vision Transformer (ViT):** Captures global spatial dependencies.
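- **Output:** Class scores over PlaNet-style S2 geocells built from the training latitudes/longitudes (geolocation is posed as classification over cells in this notebook, not direct coordinate regression).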

---

## 🧪 Experiments

### ✅ Experiment 1: ViT Ablation
- Reduce number of transformer layers and attention heads
- Assess contribution of transformer structure to geolocation performance

### ✅ Experiment 2: Robustness to Cropped Context
- Evaluate model on square images (less context)
- Compare against standard aspect ratio input

---

In [4]:
import os
import glob
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as T
import torchvision.models as models
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from tqdm import tqdm
import timm
from s2sphere import LatLng, CellId
import heapq
import csv
from torch.utils.tensorboard import SummaryWriter

In [5]:
# One node of the adaptive partition: an S2 cell plus the DataFrame rows that fall in it
class Region:
    """An S2 cell at a given level together with the row labels it contains.

    Attributes (declared via ``__slots__``, so instances carry no ``__dict__``):
        cell_id: the ``s2sphere.CellId`` of this cell.
        level:   integer S2 level of ``cell_id``.
        indices: list of DataFrame index labels whose points lie in the cell.
    """

    __slots__ = ('cell_id', 'level', 'indices')

    def __init__(self, cell_id, level, indices):
        self.cell_id, self.level, self.indices = cell_id, level, indices

    def count(self):
        """Return how many rows belong to this region."""
        return len(self.indices)

    def split(self, df, lat_col, lon_col):
        """Return the non-empty child regions one level deeper.

        Each member row is looked up in ``df`` (by index label), converted to
        its S2 cell at ``level + 1`` and bucketed by that cell's id. Children
        are returned in order of first appearance of their cell.
        """
        child_level = self.level + 1
        members_by_child = {}
        for row_label in self.indices:
            point = LatLng.from_degrees(
                df.at[row_label, lat_col],
                df.at[row_label, lon_col],
            )
            child_key = CellId.from_lat_lng(point).parent(child_level).id()
            if child_key not in members_by_child:
                members_by_child[child_key] = []
            members_by_child[child_key].append(row_label)

        children = []
        for child_key, members in members_by_child.items():
            children.append(Region(CellId(child_key), child_level, members))
        return children

def build_planet_partitions(
    train_df: pd.DataFrame,
    lat_col: str = 'latitude',
    lon_col: str = 'longitude',
    t1: int = 10000,
    t2: int = 50
):
    """
    Performs PlaNet‐style adaptive partitioning on train_df.

    Starting from the level-0 S2 cells that contain data, any cell holding
    more than ``t1`` points is split into its children, repeatedly, until every
    cell holds at most ``t1`` points or the finest S2 level is reached. Cells
    left with fewer than ``t2`` points are then discarded.

    Args:
        train_df: frame with one row per sample; its index labels are used to
            identify rows, so they should be unique.
        lat_col, lon_col: names of the latitude / longitude columns (degrees).
        t1: maximum number of points a kept cell may hold before it is split.
        t2: minimum number of points a cell needs in order to be kept.

    Returns:
        (labels, kept_regions):
          - labels: pd.Series aligned with train_df.index, holding the class
            0…K-1 of each row, or -1 for rows in a discarded cell
          - kept_regions: list of Region objects whose cells were kept
            (count >= t2); position in the list is the class index
    """
    # 1) Seed with level=0 roots. Columns are iterated directly instead of
    #    DataFrame.apply(axis=1): that is much cheaper per row and also works
    #    on an empty frame.
    root_members = {}
    for idx, lat, lon in zip(train_df.index.tolist(), train_df[lat_col], train_df[lon_col]):
        rid = CellId.from_lat_lng(LatLng.from_degrees(lat, lon)).parent(0).id()
        root_members.setdefault(rid, []).append(idx)
    roots = [
        Region(CellId(int(rid)), 0, idxs)
        for rid, idxs in root_members.items()
    ]

    # 2) Repeatedly split any region > t1, largest first.
    #    cell_id.id() is included as a tie-breaker so Region objects are never
    #    compared with each other; it also makes the pop order (and therefore
    #    the class numbering) deterministic.
    heap = [(-r.count(), r.cell_id.id(), r) for r in roots]
    heapq.heapify(heap)

    leaves = []
    while heap:
        _, _, region = heapq.heappop(heap)
        # Cells at the finest S2 level cannot be subdivided; without this cap,
        # more than t1 points sharing one leaf cell (e.g. duplicated
        # coordinates) would request a parent at a non-existent level.
        if region.count() > t1 and region.level < CellId.MAX_LEVEL:
            for child in region.split(train_df, lat_col, lon_col):
                # push with the same tuple structure
                heapq.heappush(heap, (-child.count(), child.cell_id.id(), child))
        else:
            leaves.append(region)

    # 3) Prune leaves < t2
    kept = [r for r in leaves if r.count() >= t2]

    # 4) Assign train labels (-1 marks rows whose cell was pruned)
    train_labels = pd.Series(-1, index=train_df.index, dtype=int)
    for cls_idx, region in enumerate(kept):
        train_labels.loc[region.indices] = cls_idx

    return train_labels, kept

def assign_planet_labels(
    df: pd.DataFrame,
    regions: list,
    lat_col: str = 'latitude',
    lon_col: str = 'longitude',
) -> pd.Series:
    """
    Label every row of df with the index of the kept region it falls into.

    Args:
        df: frame with latitude / longitude columns (degrees).
        regions: list of kept Region objects (from build_planet_partitions);
            a region's position in the list is its class index.
        lat_col, lon_col: names of the coordinate columns.

    Returns:
        pd.Series aligned with df.index holding 0…K-1, or –1 for points that
        lie in none of the regions.
    """
    # Group regions by their level: {level: {cell id -> class index}}
    by_level = {}
    for cls_idx, reg in enumerate(regions):
        by_level.setdefault(reg.level, {})[reg.cell_id.id()] = cls_idx

    assigned = [-1] * len(df)

    if by_level and assigned:
        # The lat/lng -> leaf cell conversion is the expensive step and does
        # not depend on the level, so it is done once per point; each level
        # then only needs the cheap parent() truncation.
        leaf_cells = [
            CellId.from_lat_lng(LatLng.from_degrees(lat, lon))
            for lat, lon in zip(df[lat_col], df[lon_col])
        ]

        for level, mapping in by_level.items():
            for pos, leaf in enumerate(leaf_cells):
                # only fill in labels that are still -1
                if assigned[pos] != -1:
                    continue
                cls_idx = mapping.get(leaf.parent(level).id())
                if cls_idx is not None:
                    assigned[pos] = cls_idx

    return pd.Series(assigned, index=df.index, dtype=int)


In [6]:
# Dataset locations (all paths are relative to DATA_ROOT)
DATA_ROOT = 'osv5m/'
TRAIN_CSV = os.path.join(DATA_ROOT, 'train_mini.csv')
TEST_CSV = os.path.join(DATA_ROOT, 'test_mini.csv')
TRAIN_IMG_DIR = os.path.join(DATA_ROOT, 'train_images')
TEST_IMG_DIR = os.path.join(DATA_ROOT, 'test_images')

# Set global parameters
EPOCHS = 10
BATCH_SIZE = 64
# NOTE(review): `L` is not referenced by any of the cells visible here — confirm it is still needed.
L = 10
LABEL_COL = 'planet_class'  # column that will hold the geocell class index
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read CSV files
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Build the partition from the TRAINING points only:
# cells with more than t1=100 points are split, cells with fewer than t2=20 are dropped.
train_labels, kept_regions = build_planet_partitions(
    train_df, lat_col='latitude', lon_col='longitude',
    t1=100, t2=20
)
train_df[LABEL_COL] = train_labels
# Drop training rows whose cell was pruned (label -1)
train_df = train_df[train_labels >= 0].reset_index(drop=True)

# Map test points onto the cells kept from training; no new cells are created here
test_labels = assign_planet_labels(
    test_df, kept_regions, lat_col='latitude', lon_col='longitude'
)
test_df[LABEL_COL] = test_labels
# Drop test rows that fall outside every kept cell (label -1)
test_df = test_df[test_labels >= 0].reset_index(drop=True)

In [7]:
# Sanity check: show a few class labels and the number of distinct classes left in train_df
print(train_df[LABEL_COL].unique()[:5])
print(train_df[LABEL_COL].nunique())

[ 88 668 884 311 985]
2199


In [8]:
# Hybrid classifier: ResNet-50 feature map -> token sequence -> ViT encoder blocks -> linear head
class CNN_ViT_Hybrid(nn.Module):
    """CNN + ViT hybrid that outputs one score per geocell class.

    The last ResNet-50 feature map is pooled to a 14 x 14 grid, each grid
    position is linearly projected to the ViT embedding width, and the
    resulting 196 tokens are passed through the ViT's encoder ``blocks``.
    Only ``vit.blocks`` is invoked in ``forward``; the ViT is never called on
    the image itself.

    Args:
        num_classes: size of the output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Pretrained ResNet-50 returning intermediate feature maps
        self.cnn = timm.create_model("resnet50", pretrained=True, features_only=True)
        last_stage = self.cnn.feature_info[-1]

        # Pretrained ViT-Small/16; its classification head is replaced by a no-op
        self.vit = timm.create_model("vit_small_patch16_224", pretrained=True)
        self.vit.head = nn.Identity()
        token_dim = self.vit.embed_dim

        # Bridge between the two backbones: fixed-size grid, then channel projection
        self.pool = nn.AdaptiveAvgPool2d((14, 14))
        self.proj = nn.Linear(last_stage['num_chs'], token_dim)

        # Regularised classification head
        self.dropout = nn.Dropout(p=0.2)
        self.classifier = nn.Linear(token_dim, num_classes)

    def forward(self, x):
        """Map an image batch ``x`` to class logits."""
        feature_pyramid = self.cnn(x)
        # Deepest feature map (B, C, H, W), resized to (B, C, 14, 14)
        grid = self.pool(feature_pyramid[-1])

        # (B, C, 14, 14) -> (B, C, 196) -> (B, 196, C): one token per grid cell
        tokens = torch.flatten(grid, start_dim=2).permute(0, 2, 1)
        tokens = self.proj(tokens)  # (B, 196, D)

        # Transformer encoder blocks, then average over the token axis
        encoded = self.vit.blocks(tokens)
        pooled = torch.mean(encoded, dim=1)

        return self.classifier(self.dropout(pooled))
    
# Define the geospatial dataset class
class GeoDataset(Dataset):
    """Image / geocell-class pairs backed by a DataFrame and an image folder.

    Args:
        df: frame with an ``id`` column (image file stem) and a label column.
        images_root: directory containing ``<subdir>/<id>.jpg`` files (exactly
            one level of sub-directories is searched).
        nclasses: number of classes; stored as ``self.classes``.
        label_col: name of the label column; rows with a negative label are
            dropped.
        transforms: optional callable applied to the PIL image.

    Indexing returns ``(image, label)`` and raises ``KeyError`` when no file
    matching the row's id was found under ``images_root``.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        images_root: str,
        nclasses: int,
        label_col: str = 'planet_class',
        transforms=None
    ):
        # 1) keep only valid labels (>=0)
        self.df = df[df[label_col] >= 0].reset_index(drop=True)

        self.label_col = label_col
        self.classes = nclasses

        # 2) build a map from image‐ID (file name without extension) → full path
        all_files = glob.glob(os.path.join(images_root, '*', '*.jpg'))
        self.id2path = {
            os.path.splitext(os.path.basename(p))[0]: p
            for p in all_files
        }

        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Read the two cells straight from their columns. Building a whole row
        # with df.iloc[idx] is slow and, when every column is numeric, upcasts
        # integer ids to float so that str() yields e.g. "123.0".
        img_id = str(self.df['id'].iat[idx])
        label = int(self.df[self.label_col].iat[idx])

        try:
            path = self.id2path[img_id]
        except KeyError:
            raise KeyError(f"no image file found for id {img_id!r}") from None

        # convert() returns an independent image, so the file can be closed right away
        with Image.open(path) as src:
            img = src.convert('RGB')
        if self.transforms:
            img = self.transforms(img)
        return img, label

In [9]:
# Preprocessing shared by both pipelines: fixed input size and per-channel normalisation
_INPUT_SIZE = (224, 224)
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Mild colour perturbation (simulates lighting changes)
_color_jitter = T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02)

# Training pipeline: resize, random flip, colour jitter on half of the samples, tensor + normalise
train_transforms = T.Compose([
    T.Resize(_INPUT_SIZE),
    T.RandomHorizontalFlip(),
    T.RandomApply([_color_jitter], p=0.5),
    T.ToTensor(),
    T.Normalize(_NORM_MEAN, _NORM_STD),
])
# Evaluation pipeline: deterministic, no augmentation
test_transforms = T.Compose([
    T.Resize(_INPUT_SIZE),
    T.ToTensor(),
    T.Normalize(_NORM_MEAN, _NORM_STD),
])

# Instantiate the datasets and dataloaders
train_ds = GeoDataset(
    df          = train_df,
    images_root = TRAIN_IMG_DIR,
    label_col   = LABEL_COL,
    nclasses=len(kept_regions),
    transforms  = train_transforms
)

test_ds = GeoDataset(
    df          = test_df,
    images_root = TEST_IMG_DIR,
    label_col   = LABEL_COL,
    nclasses=len(kept_regions),
    transforms  = test_transforms
)

print(f"Softmax size (num classes) = {train_ds.classes}")  # same for both

# Hold out 10% of the training rows for validation (90% train / 10% validation).
# The seeded generator makes the split reproducible.
num_val = int(0.1 * len(train_ds))
num_train = len(train_ds) - num_val
train_subset, val_subset = random_split(train_ds, [num_train, num_val], generator=torch.Generator().manual_seed(42))

# random_split only stores indices into train_ds, so the held-out rows would be
# read through train_transforms (random flips / colour jitter), making the
# validation metrics noisy. Re-point the same indices at a second view of the
# same rows that uses the deterministic evaluation transforms instead.
val_ds = GeoDataset(
    df          = train_df,
    images_root = TRAIN_IMG_DIR,
    label_col   = LABEL_COL,
    nclasses=len(kept_regions),
    transforms  = test_transforms
)
val_subset = torch.utils.data.Subset(val_ds, val_subset.indices)

train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader   = DataLoader(val_subset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader  = DataLoader(test_ds,      batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# Model and loss
model = CNN_ViT_Hybrid(num_classes=train_ds.classes).to(device)
criterion = nn.CrossEntropyLoss()

# Freeze ResNet
for param in model.cnn.parameters():
    param.requires_grad = False

# Freeze ViT
for param in model.vit.parameters():
    param.requires_grad = False

# Create optimizer only for the layers that are still trainable
# (everything outside model.cnn and model.vit: the projection and the classifier)
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4,
    weight_decay=1e-4
)

# Learning rate scheduler
# Reduce learning rate when validation loss plateaus.
# `verbose` is intentionally not passed: it is deprecated in recent PyTorch
# releases, and the training loop already prints the learning rate every epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,   # reduce LR by half
    patience=2    # wait 2 epochs with no val loss improvement
)

Softmax size (num classes) = 2199




In [10]:
# Print the tensor shapes of the first training batch, then stop;
# this is a quick check that the dataset and loader are wired up correctly.
for images, labels in train_loader:
    print(images.shape, labels.shape)
    break  # only the first batch is needed

torch.Size([64, 3, 224, 224]) torch.Size([64])


In [11]:
# Training and evaluation functions
def train_one_epoch(model, loader):
    """Run one optimisation pass over ``loader``.

    Relies on the module-level ``device``, ``optimizer`` and ``criterion``.

    Returns:
        (loss, accuracy): the batch losses weighted by batch size and divided
        by the number of samples seen, and the fraction of samples whose
        arg-max prediction equals the label.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for batch_images, batch_labels in tqdm(loader):
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Accumulate statistics for the epoch summary
        loss_sum += batch_loss.item() * batch_images.size(0)
        predictions = logits.argmax(1)
        n_correct += (predictions == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen

def evaluate(model, loader):
    """Score ``model`` on ``loader`` without updating any weights.

    Relies on the module-level ``device`` and ``criterion``. The model is put
    in eval mode and gradients are disabled for the whole pass.

    Returns:
        (loss, accuracy), computed the same way as in ``train_one_epoch``:
        size-weighted batch losses over the sample count, and the fraction of
        correct arg-max predictions.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)

            loss_sum += batch_loss.item() * batch_images.size(0)
            predictions = logits.argmax(1)
            n_correct += (predictions == batch_labels).sum().item()
            n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen

In [None]:
best_val_acc = 0.0
best_val_loss = float('inf')

# The CSV log is opened in a `with` block and the TensorBoard writer is closed
# in a `finally`, so both are released even if training is interrupted or fails.
with open("training_log.csv", mode="w", newline="") as log_file:
    logger = csv.writer(log_file)
    logger.writerow(["epoch", "train_loss", "train_acc", "val_loss", "val_acc", "lr"])

    # TensorBoard writer for visualizing metrics
    writer = SummaryWriter("runs/geo_model_experiment")
    try:
        # --- Training Loop ---
        for epoch in range(EPOCHS):
            print(f"\n🌍 Epoch {epoch+1}/{EPOCHS}")

            # Unfreeze the backbones once 3 epochs have been trained (epoch index 3)
            if epoch == 3:
                print("🔓 Unfreezing layers...")
                for param in model.cnn.parameters():
                    param.requires_grad = True
                for param in model.vit.parameters():
                    param.requires_grad = True
                # The set of trainable parameters changed, so the optimizer and
                # its scheduler are rebuilt, with a smaller learning rate for
                # fine-tuning. `optimizer` is rebound at module level, which is
                # where train_one_epoch looks it up.
                optimizer = optim.Adam(
                    filter(lambda p: p.requires_grad, model.parameters()),
                    lr=1e-5,
                    weight_decay=1e-4
                )

                # `verbose` is not passed: it is deprecated in recent PyTorch
                # releases and the learning rate is printed below every epoch.
                scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizer,
                    mode='min',
                    factor=0.5,
                    patience=2
                )

            train_loss, train_acc = train_one_epoch(model, train_loader)
            val_loss, val_acc     = evaluate(model, val_loader)

            # Let the scheduler react to the validation loss, then report the LR
            scheduler.step(val_loss)
            for param_group in optimizer.param_groups:
                print(f"Current learning rate: {param_group['lr']}")

            # Save metrics to CSV file
            current_lr = optimizer.param_groups[0]['lr']
            logger.writerow([epoch+1, train_loss, train_acc, val_loss, val_acc, current_lr])
            log_file.flush()  # ensures it's written to disk

            # Log metrics to TensorBoard
            writer.add_scalar("Loss/train", train_loss, epoch)
            writer.add_scalar("Loss/val", val_loss, epoch)
            writer.add_scalar("Accuracy/train", train_acc, epoch)
            writer.add_scalar("Accuracy/val", val_acc, epoch)

            for param_group in optimizer.param_groups:
                writer.add_scalar("Learning Rate", param_group['lr'], epoch)

            print(f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
            print(f"Val   Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

            # A checkpoint is written when either metric sets a new record.
            # Each record is tracked on its own (min / max): overwriting both
            # with the current values would let an accuracy-only improvement
            # replace the best loss with a worse one, and vice versa.
            improved = val_loss < best_val_loss or val_acc > best_val_acc
            best_val_loss = min(best_val_loss, val_loss)
            best_val_acc = max(best_val_acc, val_acc)
            if improved:
                torch.save(model.state_dict(), f"hybrid_best_model_epoch{epoch+1}.pth")
                print("✅ Saved best model.")
    finally:
        writer.close()


🌍 Epoch 1/10


  2%|▏         | 25/1583 [00:28<28:41,  1.10s/it]

After training, run `tensorboard --logdir=runs` in a terminal,
then open http://localhost:6006 in a browser to see the visualizations.

In [None]:
# --- Final Test ---
# Scores the held-out test set with the weights currently in `model`.
# NOTE(review): no saved checkpoint is loaded in this cell, so this is the state
# after the last training epoch — load a "hybrid_best_model_epoch*.pth" file first
# if the best-validation weights are the ones that should be reported.
test_loss, test_acc = evaluate(model, test_loader)
print(f"\n✅ Final Test Loss: {test_loss:.4f}, Accuracy: {test_acc:.4f}")