In [1]:
!pip install transformers torchvision


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [2]:

# 2. Imports
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from transformers import ViTFeatureExtractor, ViTForImageClassification
from torch.optim import AdamW
from tqdm import tqdm
import matplotlib.pyplot as plt

# 3. Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4. Transformations
transform_train = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),  # Convert MNIST to 3 channels
    transforms.RandomApply([transforms.ColorJitter(brightness=0.1, contrast=0.1)], p=0.5),
    transforms.ToTensor(),
])

transform_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
])

# 5. Load MNIST datasets
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform_train)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform_test)



100%|██████████| 9.91M/9.91M [00:00<00:00, 16.6MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 493kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.60MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 5.79MB/s]


In [3]:
# 6. Data loaders (low batch size for Colab RAM)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

In [4]:
# 7. Load pre-trained ViT and adapt classifier
from transformers import ViTForImageClassification

# Step 1: Load pretrained model without resizing classifier
model = ViTForImageClassification.from_pretrained(
    "facebook/deit-small-patch16-224"
)

# Step 2: Replace classifier head (1000 -> 10 classes for MNIST)
model.classifier = torch.nn.Linear(model.classifier.in_features, 10)

# Step 3: Move to device
model.to(device)

# 8. Optimizer and loss
optimizer = AdamW(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/88.3M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

In [5]:
# 9. Training loop
def train(model, loader, optimizer, criterion):
    model.train()
    total_loss, total_correct = 0, 0
    for images, labels in tqdm(loader, desc="Training"):
        images, labels = images.to(device), labels.to(device)

        outputs = model(images).logits
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (outputs.argmax(dim=1) == labels).sum().item()

    avg_loss = total_loss / len(loader)
    accuracy = total_correct / len(loader.dataset)
    return avg_loss, accuracy

In [6]:
# 10. Evaluation
def evaluate(model, loader):
    model.eval()
    total_correct = 0
    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Evaluating"):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images).logits
            total_correct += (outputs.argmax(dim=1) == labels).sum().item()

    accuracy = total_correct / len(loader.dataset)
    return accuracy

# 11. Train for a few epochs (you can increase)
for epoch in range(2):
    print(f"\nEpoch {epoch+1}")
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_acc = evaluate(model, test_loader)
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")



Epoch 1



Training:   0%|          | 0/938 [00:00<?, ?it/s][A
Training:   0%|          | 1/938 [00:04<1:02:55,  4.03s/it][A
Training:   0%|          | 2/938 [00:04<31:15,  2.00s/it]  [A
Training:   0%|          | 3/938 [00:05<21:06,  1.35s/it][A
Training:   0%|          | 4/938 [00:05<16:32,  1.06s/it][A
Training:   1%|          | 5/938 [00:06<13:48,  1.13it/s][A
Training:   1%|          | 6/938 [00:06<12:09,  1.28it/s][A
Training:   1%|          | 7/938 [00:07<11:09,  1.39it/s][A
Training:   1%|          | 8/938 [00:08<10:23,  1.49it/s][A
Training:   1%|          | 9/938 [00:08<09:53,  1.56it/s][A
Training:   1%|          | 10/938 [00:09<09:35,  1.61it/s][A
Training:   1%|          | 11/938 [00:09<09:22,  1.65it/s][A
Training:   1%|▏         | 12/938 [00:10<09:13,  1.67it/s][A
Training:   1%|▏         | 13/938 [00:11<09:08,  1.69it/s][A
Training:   1%|▏         | 14/938 [00:11<09:03,  1.70it/s][A
Training:   2%|▏         | 15/938 [00:12<08:59,  1.71it/s][A
Training:   2%|▏     

Train Loss: 0.0732, Train Acc: 0.9776, Test Acc: 0.9892

Epoch 2


Training: 100%|██████████| 938/938 [09:35<00:00,  1.63it/s]
Evaluating: 100%|██████████| 157/157 [00:33<00:00,  4.72it/s]

Train Loss: 0.0316, Train Acc: 0.9904, Test Acc: 0.9936



