# CountryGuessr - Country Prediction from Street View Images

In [2]:
# Dependencies
%pip install -q torch torchvision
%pip install -q pycountry tqdm matplotlib seaborn tensorboard kagglehub timm evaluate grad-cam


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Download Dataset
Use example dataset from Kaggle for initial setup, since we do not have a fixed dataset yet.

In [3]:
import kagglehub

# Fetch the latest version of the Kaggle dataset; the returned local path is reused later
KAGGLE_DATASET = "sylshaw/streetview-by-country"
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/sylshaw/streetview-by-country/versions/2


# Hyperparameters

In [4]:
import random
import numpy as np
import torch

# Seed every random number generator used in the notebook with the same value
RANDOM_SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Training schedule
NUM_EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 1e-4

# Learning-rate scheduler and early-stopping settings
LR_SCHEDULER_PATIENCE = 2
LR_SCHEDULER_FACTOR = 0.1
EARLY_STOPPING_PATIENCE = 3

# Run on the GPU when one is available, otherwise on the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Output locations
LOG_DIR = "runs/country_classifier"
CHECKPOINT_DIR = "model"

print(f"Using device: {DEVICE}")
print(f"Seed set to: {RANDOM_SEED}")
print(f"Num epochs: {NUM_EPOCHS}, Batch size: {BATCH_SIZE}, Learning rate: {LEARNING_RATE}")
print(f"TensorBoard log dir: {LOG_DIR}")
print(f"Model checkpoint dir: {CHECKPOINT_DIR}")

Using device: cuda
Seed set to: 42
Num epochs: 5, Batch size: 32, Learning rate: 0.0001
TensorBoard log dir: runs/country_classifier
Model checkpoint dir: model


# Data Preparation
- Split into training, validation, test sets
- Put data in correct format -> each country represents a separate class
- Create label map to map country code to country name

In [5]:
import torch
import os
from torchvision import transforms
from torch.utils.data import random_split, DataLoader
from src.dataset import StreetViewDataset


# Image folder inside the downloaded dataset and the split fractions;
# whatever remains after the train and validation shares becomes the test split
DATASET_DIR = os.path.join(path, "streetview_images")
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.1

# Every image is resized to 224x224 and converted to a tensor
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
dataset = StreetViewDataset(DATASET_DIR, transform=transform)

num_samples = len(dataset)
train_size = int(TRAIN_SPLIT * num_samples)
val_size = int(VAL_SPLIT * num_samples)
test_size = num_samples - train_size - val_size

# A generator seeded with RANDOM_SEED drives the split
split_generator = torch.Generator().manual_seed(RANDOM_SEED)
train_set, val_set, test_set = random_split(
  dataset,
  [train_size, val_size, test_size],
  generator=split_generator
)

# Only the training loader shuffles its samples
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

# Model Choices
There are 3 different model options used here to compare performance between model architectures.

In [6]:
import torch.nn as nn

# Number of output classes = number of entries in the dataset's label map
num_classes = len(dataset.label_map)
print(num_classes)

111


### ResNet50 (CNN) - [Docs](https://docs.pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html)
- CNN architecture with added residual (skip) connections
- Enables deeper networks without training difficulties (Vanishing Gradient, Degradation Problem)

In [7]:
from src.model import ResNet50Model

# Pretrained ResNet50 classifier, created and moved to DEVICE in one expression
model = ResNet50Model(num_classes, pretrained=True).to(DEVICE)

# Adam over all model parameters, trained against a cross-entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()



### Base Vision Transformer (ViT) - [Docs](https://huggingface.co/docs/transformers/model_doc/vit)
- Transformer for computer vision tasks
- Splits images into fixed-sized patches, treating them as a sequence of tokens

In [8]:
from src.model import ViTModel

# Pretrained Vision Transformer classifier, created and moved to DEVICE in one expression
model = ViTModel(num_classes, pretrained=True).to(DEVICE)

# Adam over all model parameters, trained against a cross-entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

### Swin Transformer -  [Docs](https://huggingface.co/docs/transformers/model_doc/swin)

Could be a good model for the future - **not working yet**

- Hierarchical vision transformer with shifted windows (swin)
- Divides images into patches and applies windowed self-attention to capture local features
- Uses shifted windows to enable cross-window connections and global context modeling

In [None]:
from src.model import SwinTransformerModel

# Number of output classes = number of entries in the dataset's label map
num_classes = len(dataset.label_map)

model = SwinTransformerModel(num_classes, pretrained=True)
# Use the DEVICE constant from the hyperparameter cell; a lowercase `device`
# is never defined in this notebook and would raise a NameError
model = model.to(DEVICE)

# Cross-entropy loss with Adam over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training/Validation

## Logging
- Log training loss and accuracy during training phase
- Compute and log validation loss, accuracy, precision, recall, and F1-Score
- Generate and log confusion matrix heatmap images
- Visualize all metrics and images using TensorBoard

## Learning Rate Scheduler
- "ReduceLROnPlateau" scheduler to reduce learning rate when validation loss plateaus
- Helps to fine-tune training progression and prevent getting stuck in local minima

## Early Stopping
- Monitor validation loss to detect no improvement over a specified patience period
- Stops training early if no improvement is observed

## Checkpoint Saving
- Save model checkpoints at the end of each epoch
- Update a dedicated "best" checkpoint whenever validation loss improves

In [None]:
import os
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns
import io
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter
from src.train import train_epoch, validate_epoch, save_checkpoint

# Scheduler in "min" mode, configured from the LR_SCHEDULER_* hyperparameters
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=LR_SCHEDULER_FACTOR, patience=LR_SCHEDULER_PATIENCE)
# Log to the configured LOG_DIR rather than a duplicated string literal,
# so changing the constant actually moves the TensorBoard output
writer = SummaryWriter(LOG_DIR)

# Load metrics
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
confusion_metric = evaluate.load("confusion_matrix")

def plot_confusion_matrix(cm, labels):
    """Render a confusion matrix as a heatmap and return it as an image array.

    Args:
        cm: Confusion matrix values to draw.
        labels: Tick labels used for both the x ("Predicted") and y ("Actual") axes.

    Returns:
        The rendered 10x10 inch figure, read back from an in-memory PNG via ``plt.imread``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()

    # Round-trip through an in-memory PNG so nothing is written to disk
    with io.BytesIO() as png_buffer:
        fig.savefig(png_buffer, format="png")
        plt.close(fig)
        png_buffer.seek(0)
        return plt.imread(png_buffer)

global_step = 0
best_val_loss = float("inf")
epochs_without_improvement = 0

# Every checkpoint of this run goes into one directory named after the model class
checkpoint_dir = os.path.join(CHECKPOINT_DIR, model.__class__.__name__)

print(f"Using {model.__class__.__name__}")
try:
    for epoch in range(NUM_EPOCHS):
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}")

        # Train/Val
        train_loss, train_acc, global_step = train_epoch(model, train_loader, criterion, optimizer, DEVICE, writer, global_step)
        val_loss, val_acc, val_preds, val_labels, global_step = validate_epoch(model, val_loader, criterion, DEVICE, writer, global_step)

        # Quantitative Metrics (macro-averaged)
        precision_metric.add_batch(predictions=val_preds, references=val_labels)
        val_precision = precision_metric.compute(average="macro")["precision"]

        recall_metric.add_batch(predictions=val_preds, references=val_labels)
        val_recall = recall_metric.compute(average="macro")["recall"]

        f1_metric.add_batch(predictions=val_preds, references=val_labels)
        val_f1 = f1_metric.compute(average="macro")["f1"]

        # Confusion matrix rendered as a heatmap image and converted to a tensor for TensorBoard
        confusion_metric.add_batch(predictions=val_preds, references=val_labels)
        cm_result = confusion_metric.compute()["confusion_matrix"]
        cm_image = plot_confusion_matrix(cm_result, dataset.label_map)
        cm_tensor = transforms.ToTensor()(cm_image)

        # Tensorboard logging
        writer.add_scalar("Validation/Precision", val_precision, epoch)
        writer.add_scalar("Validation/Recall", val_recall, epoch)
        writer.add_scalar("Validation/F1-Score", val_f1, epoch)
        writer.add_image("Validation/Confusion_Matrix", cm_tensor, epoch)

        # Learning rate scheduler
        scheduler.step(val_loss)

        # Metrics stored alongside every checkpoint written this epoch
        checkpoint_metrics = dict(train_loss=train_loss, val_loss=val_loss, train_acc=train_acc, val_acc=val_acc)

        # Store checkpoint after epoch
        save_checkpoint(model, optimizer, epoch, dataset.label_map, checkpoint_dir=checkpoint_dir, **checkpoint_metrics)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} - train_loss={train_loss:.4f}, train_acc={train_acc:.4f}")

        # Early Stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0

            # Update best checkpoint if val_loss decreases
            save_checkpoint(model, optimizer, epoch, dataset.label_map, checkpoint_dir=checkpoint_dir, filename="best.pth", **checkpoint_metrics)
            print(f"Best model updated at Epoch {epoch+1} with val_loss={val_loss:.4f}")
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= EARLY_STOPPING_PATIENCE:
            print(f"No improvement for {EARLY_STOPPING_PATIENCE} epochs. Early stopping.")
            break
finally:
    # Flush and close the event file even when training is interrupted or stops early,
    # so everything logged so far is written out for TensorBoard
    writer.close()

Using ViTModel
Epoch 1/5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Checkpoint saved to model/ViTModel/epoch_0.pth
Epoch 1/5 - train_loss=3.1060, train_acc=0.2465
Checkpoint saved to model/ViTModel/best.pth
Best model updated at Epoch 1 with val_loss=2.1011
Epoch 2/5


Training:   2%|‚ñè         | 44/2682 [00:09<09:52,  4.45it/s, acc=0.4929, loss=1.8391]

In [None]:
# Load the TensorBoard notebook extension and open it on the runs/ directory
%load_ext tensorboard
%tensorboard --logdir runs/

# Inferencing
Models are evaluated quantitatively on the whole test set and qualitatively on single images.

## Load models

In [None]:
# ResNet
# map_location remaps the stored tensors onto DEVICE, so a checkpoint written on a GPU
# also loads on a CPU-only machine
checkpoint = torch.load('model/ResNet50Model/best.pth', map_location=DEVICE)
# The class count is taken from the label map stored in the checkpoint
model = ResNet50Model(num_classes=len(checkpoint["label_map"]), pretrained=False)
model.load_state_dict(checkpoint["model_state_dict"])
model = model.to(DEVICE)
model.eval()

In [None]:
import torch
from src.model import ViTModel

# Vision Transformer
# map_location remaps the stored tensors onto DEVICE, so a checkpoint written on a GPU
# also loads on a CPU-only machine
checkpoint = torch.load('model/ViTModel/best.pth', map_location=DEVICE)
# The class count is taken from the label map stored in the checkpoint
model = ViTModel(num_classes=len(checkpoint["label_map"]), pretrained=False)
model.load_state_dict(checkpoint["model_state_dict"])
model = model.to(DEVICE)
model.eval()

## Inference on whole test set
- Creates a csv with the image filenames, the ground-truth labels, and the predicted labels
- Used for further analysis to detect weaknesses, strengths, ...

In [None]:
from src.evaluation import inference
import pandas as pd

# Run the loaded model over the whole test loader
test_images, test_preds, test_labels = inference(model, test_loader, dataset.label_map, DEVICE)

# One row per test sample: image, ground-truth label and predicted label
results_df = pd.DataFrame({
    "Image": test_images,
    "TrueLabel": test_labels,
    "PredictedLabel": test_preds
})

# Build the output path once so the log message reports where the file really went
# (the message previously named the bare file without the checkpoint directory)
results_path = f"{CHECKPOINT_DIR}/test_inference_results.csv"
results_df.to_csv(results_path, index=False)
print(f"Saved inference results for {len(test_preds)} samples to {results_path}")

## Inference single image
- Runs inference on single, random images from the test set
- Qualitative evaluation over model predictions
- Grad-CAM heatmaps to visualize parts of image the prediction is based on

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import random
import torch
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

def reshape_transform(tensor, height=14, width=14):
    """Turn a token sequence into a channel-first 2D feature map for Grad-CAM.

    Args:
        tensor: 3D tensor whose first position along dim 1 is dropped;
            the remaining ``height * width`` positions form the spatial grid.
        height: Number of grid rows.
        width: Number of grid columns.

    Returns:
        A view of shape ``(tensor.size(0), tensor.size(2), height, width)``.
    """
    batch_size, channels = tensor.size(0), tensor.size(2)
    # Skip the first token, then lay the remaining ones out on the grid
    grid = tensor[:, 1:, :].reshape(batch_size, height, width, channels)
    # Move the channel axis in front of the two spatial axes
    return grid.permute(0, 3, 1, 2)

# Grad-CAM hooks onto the `norm1` layer of the last ViT block (ViT-specific attribute path)
target_layers = [model.vit.blocks[-1].norm1]

top_k = 5

# Draw the sample from the test split itself. Indexing the full `dataset` with a
# test-set index would return an arbitrary image that is usually not in the test split.
idx = random.randint(0, len(test_set) - 1)

model.eval()
image, label, image_path = test_set[idx]
image_tensor = image.unsqueeze(0).to(DEVICE)

# Class probabilities for the single image (no gradients needed for the forward pass)
with torch.no_grad():
    outputs = model(image_tensor)
    probabilities = torch.softmax(outputs, dim=1).cpu().numpy()[0]

# Indices of the top_k most probable classes, most probable first
topk_indices = probabilities.argsort()[-top_k:][::-1]
topk_probs = probabilities[topk_indices]

# Invert the label map (name -> index) to look up names by class index
idx_to_name = {v: k for k, v in dataset.label_map.items()}
topk_countries = [idx_to_name[i] for i in topk_indices]

gt_country = idx_to_name[label]

# Grad-CAM
cam = GradCAM(model=model, target_layers=target_layers, reshape_transform=reshape_transform)
targets = [ClassifierOutputTarget(int(topk_indices[0]))]
grayscale_cam = cam(input_tensor=image_tensor, targets=targets)[0]

# Channel-last copy of the image, min-max scaled to [0, 1] for display
input_image = image.permute(1, 2, 0).cpu().numpy()
value_range = input_image.max() - input_image.min()
if value_range > 0:
    input_image = (input_image - input_image.min()) / value_range
else:
    # A constant image would divide by zero and produce NaNs; show it as all zeros instead
    input_image = np.zeros_like(input_image)

cam_image = show_cam_on_image(input_image, grayscale_cam, use_rgb=True)

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

axs[0].imshow(input_image)
axs[0].axis('off')
axs[0].set_title(f"Ground Truth: {gt_country}")

axs[1].imshow(cam_image)
axs[1].axis('off')
axs[1].set_title(f"Grad-CAM: {topk_countries[0]} ({topk_probs[0]*100:.2f}%)")

print("Predicted probabilities:")
for country, prob in zip(topk_countries, topk_probs):
    print(f"{country}: {prob*100:.2f}%")