In [None]:
# Bootstrap uv with pip, then have uv install the remaining packages in one call.
!pip install uv
!uv pip install timm scikit-learn matplotlib tqdm

import timm
import torch
import torch.nn as nn
import torch.optim as optim

### Download Dataset

In [None]:
# Upload kaggle.json
from google.colab import files

# files.upload() returns a dict of {filename: file bytes}. Leaving the call as
# the cell's last expression echoes that dict into the cell output, which would
# store the Kaggle API key inside the saved notebook. Keep the names only.
uploaded_names = sorted(files.upload())
print(f"Uploaded: {uploaded_names}")

# The credential setup that follows moves /content/kaggle.json, so fail here
# with a clear message if a differently named file was uploaded.
assert "kaggle.json" in uploaded_names, "Expected an uploaded file named kaggle.json"

In [None]:
%%bash
# Move the uploaded token into the user's ~/.kaggle directory and make it
# readable/writable by its owner only.
KAGGLE_HOME="$HOME/.kaggle"
mkdir -p "$KAGGLE_HOME"
mv /content/kaggle.json "$KAGGLE_HOME/"
chmod 600 "$KAGGLE_HOME/kaggle.json"

In [None]:
# --unzip extracts the archive after downloading. Without it only the .zip file
# exists and the ./architectural-styles-dataset directory read by ImageFolder
# is never created.
# NOTE(review): assumes the archive's top-level folder is named
# architectural-styles-dataset — confirm after the first download.
!kaggle datasets download -d dumitrux/architectural-styles-dataset --unzip

### Loading Data

In [None]:
from torchvision import datasets, transforms

# Per-channel statistics used to normalise the image tensors.
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]

# Preprocessing applied to every image, in order.
preprocessing_steps = [
    transforms.Resize((384, 384)),                             # match model input
    transforms.ToTensor(),                                     # convert to tensor
    transforms.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD),  # per-channel scaling
]
transform = transforms.Compose(preprocessing_steps)

# Build the dataset from the extracted folder and show the label mapping.
dataset = datasets.ImageFolder(root='./architectural-styles-dataset', transform=transform)
print(dataset.class_to_idx)

In [None]:
# Split into train / validation / test (80% / 10% / remainder)
from torch.utils.data import random_split

# Define sizes
total_size = len(dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size  # remainder, so the three sizes always sum to total_size

# Seed the split. An unseeded random_split assigns images to different subsets
# on every run, so reported test accuracy is not reproducible and a model
# restored in a later session could be tested on images it was trained on.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# Split dataset
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Print sizes
print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}, Test size: {len(test_dataset)}")

In [None]:
from torch.utils.data import DataLoader

# Training and validation share batch size and worker count; only the training
# loader reshuffles its samples.
shared_loader_args = {"batch_size": 128, "num_workers": 8}

train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, **shared_loader_args)

# The test loader uses smaller batches and DataLoader's default worker setting.
test_loader = DataLoader(test_dataset, batch_size=32)

### Model Creation and Training

In [None]:
class NextViTPotatoClassifier(nn.Module):
    """timm NextViT-small backbone whose classifier is replaced by a linear layer.

    Args:
        num_classes: Number of outputs of the replacement linear head.
            Defaults to 25, the value that used to be hard-coded.
        pretrained: Forwarded to ``timm.create_model``. Defaults to ``True``
            (the previous fixed behaviour); pass ``False`` to skip loading
            pretrained weights, e.g. before restoring a checkpoint.
    """

    def __init__(self, num_classes: int = 25, pretrained: bool = True):
        super().__init__()
        self.backbone = timm.create_model('nextvit_small.bd_ssld_6m_in1k_384', pretrained=pretrained)
        # Replace the backbone's own head with a fresh linear layer that reads
        # the same number of input features.
        self.backbone.head = nn.Linear(self.backbone.head.in_features, num_classes)

    def forward(self, x):
        """Return one row of ``num_classes`` logits per input sample."""
        features = self.backbone.forward_features(x)
        # Average over dims 2 and 3 so the linear head gets one vector per sample.
        # Assumes forward_features returns a (batch, channels, H, W) map — TODO confirm.
        features = features.mean(dim=[2, 3])
        return self.backbone.head(features)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = NextViTPotatoClassifier()
model = model.to(device)

# Only the classifier head stays trainable; every other parameter is frozen.
head_param_ids = {id(head_param) for head_param in model.backbone.head.parameters()}
for param in model.parameters():
    param.requires_grad = id(param) in head_param_ids


# Loss function, and an optimizer that only sees the head's parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.backbone.head.parameters(), lr=4e-3)

In [None]:
# Print the name of every parameter that will receive gradient updates
trainable_names = [name for name, param in model.named_parameters() if param.requires_grad]
for trainable_name in trainable_names:
    print(trainable_name)

# Count all parameters, and separately those left trainable
total_params = 0
trainable_params = 0
for parameter in model.parameters():
    element_count = parameter.numel()
    total_params += element_count
    if parameter.requires_grad:
        trainable_params += element_count

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


In [None]:
def reset_head_weights(model):
    """Re-initialise every ``nn.Linear`` found inside ``model.backbone.head``.

    ``reset_parameters()`` is called on each matching layer, including the head
    module itself when it is an ``nn.Linear``. Returns nothing.
    """
    linear_layers = (
        module for module in model.backbone.head.modules()
        if isinstance(module, nn.Linear)
    )
    for linear in linear_layers:
        linear.reset_parameters()

def evaluate(model, dataloader):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module mapping a batch of inputs to per-class scores with the
            class dimension at dim 1.
        dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        Fraction of correctly predicted samples as a float in ``[0, 1]``;
        ``0.0`` when the dataloader yields no samples.

    Side effects:
        Leaves ``model`` in eval mode.
    """
    model.eval()

    # Move batches to wherever the model's parameters live rather than relying
    # on a notebook-global ``device``. A parameter-free model leaves the
    # batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    correct, total = 0, 0
    with torch.no_grad():
        for imgs, labels in dataloader:
            if target_device is not None:
                imgs = imgs.to(target_device)
                labels = labels.to(target_device)
            outputs = model(imgs)
            # Softmax is monotonic, so the arg-max of the raw scores is already
            # the predicted class.
            _, pred_classes = torch.max(outputs, dim=1)
            correct += (pred_classes == labels).sum().item()
            total += labels.size(0)

    # Guard against an empty dataloader instead of raising ZeroDivisionError.
    return correct / total if total else 0.0

In [None]:
from tqdm import tqdm

# Training loop
# Re-initialise the head so re-running this cell restarts training from scratch.
reset_head_weights(model)

epochs = 10
for epoch in range(epochs):
    model.train()
    # Only the head is optimised, so run the frozen backbone in eval mode.
    # Otherwise any normalisation running statistics or stochastic layers
    # inside it would still change or stay active while its weights are fixed.
    # The linear head behaves the same in either mode.
    model.backbone.eval()

    train_loss = 0.0    # sum of per-sample losses over the epoch
    train_correct = 0
    train_total = 0

    # Show progress bar per epoch
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for imgs, labels in loop:
        imgs = imgs.to(device)
        labels = labels.to(device)

        outputs = model(imgs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track loss, weighted by batch size so a smaller final batch does not
        # skew the epoch mean.
        batch_size = labels.size(0)
        batch_loss = loss.item()
        train_loss += batch_loss * batch_size

        # Track accuracy
        _, predicted = torch.max(outputs, 1)
        train_correct += (predicted == labels).sum().item()
        train_total += batch_size

        # Update tqdm status bar
        loop.set_postfix(loss=batch_loss)

    # max(..., 1) avoids ZeroDivisionError if the loader produced no batches.
    seen_samples = max(train_total, 1)
    train_acc = train_correct / seen_samples
    avg_train_loss = train_loss / seen_samples
    val_acc = evaluate(model, val_loader)

    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.2%} | Val Acc: {val_acc:.2%}")

    # Save checkpoint (overwritten every epoch). The loss is stored as a plain
    # float — the epoch's mean training loss — instead of a live tensor, so the
    # file can be loaded without the training device being available.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_train_loss,
    }, 'checkpoint.pth')

### Model Evaluation and Testing

In [None]:
# Accuracy on the held-out test split
test_acc = evaluate(model, dataloader=test_loader)
print("Testing Accuracy: {:.4f}".format(test_acc))

### Save Model

In [None]:
# Install the onnx package; presumably needed by the ONNX export step — confirm.
!pip install onnx

In [None]:
# Export to ONNX
model.eval()  # switch to inference mode before exporting

# Example input handed to the exporter; dim 0 is declared dynamic below.
dummy_input = torch.randn(1, 3, 384, 384).to(device)

onnx_path = "archinet.onnx"  # output file name
export_options = dict(
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
    opset_version=16,
    verbose=False,
)

torch.onnx.export(model, dummy_input, onnx_path, **export_options)