**Pre-Trained ViT Model**

In [None]:
import zipfile
import os

zip_path = '/content/drive/MyDrive/Train.zip'  # location of the zipped training data; update for your Drive
extract_to = '/content/drive/MyDrive'  # destination folder; change to unzip somewhere else

# Open the archive read-only and unpack every member beneath `extract_to`.
# Using the handle as a context manager closes the file even if extraction fails.
archive = zipfile.ZipFile(zip_path, 'r')
with archive:
    archive.extractall(path=extract_to)

print("Unzipped to:", extract_to)


Unzipped to: /content/drive/MyDrive


In [None]:
# Install necessary libraries
!pip install -q transformers datasets torch torchvision

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install -U transformers datasets



In [None]:
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch

# Step 1: Load the image processor that ships with the pretrained checkpoint.
# Only its normalisation statistics (image_mean / image_std) are read below.
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Step 2: Define transforms.
# Steps run in list order: resize to 224x224, convert to a tensor, then
# normalise with the mean/std taken from the image processor.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=image_processor.image_mean,
        std=image_processor.image_std,
    ),
]
transform = transforms.Compose(preprocessing_steps)

# Step 3: Load dataset from the training folder, applying `transform` to each image.
dataset = ImageFolder(root="/content/drive/MyDrive/Train", transform=transform)

# Step 4: Convert to Hugging Face Dataset format
def transform_example(example):
    """Turn an indexable ``(image, label)`` pair into a dict record.

    Args:
        example: Any indexable whose item 0 is the image data and item 1 is
            the label (presumably the ``(tensor, class_index)`` pairs yielded
            by the ``ImageFolder`` dataset this is mapped over).

    Returns:
        dict: ``{"pixel_values": example[0], "label": example[1]}``.
    """
    pixel_values = example[0]
    label = example[1]
    return {"pixel_values": pixel_values, "label": label}

# Materialise the torchvision dataset as a Hugging Face Dataset by streaming
# every (image, label) pair through `transform_example`.
hf_dataset = Dataset.from_generator(lambda: map(transform_example, dataset))

# Step 5: Split dataset (90% train / 10% validation).
# A fixed seed keeps the split identical across runs, so the reported
# validation accuracy is reproducible after "Restart & Run All".
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
train_ds = hf_dataset["train"]
val_ds = hf_dataset["test"]

# Have both splits return torch tensors when indexed.
train_ds.set_format(type="torch")
val_ds.set_format(type="torch")

# Step 6: Load model
# Size the classification head from the class folders that ImageFolder found
# instead of hard-coding the count, so the head always matches the labels.
num_classes = len(dataset.classes)

# The checkpoint ships a 1000-way classifier; `ignore_mismatched_sizes=True`
# lets that head be dropped and re-initialised with `num_classes` outputs.
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
)

# Step 7: Define metric function
def compute_metrics(preds, labels):
    """Compute classification accuracy from per-class scores.

    Args:
        preds: 2-D array-like of per-class scores, one row per sample; the
            predicted class is the argmax along axis 1. Do not pass class
            ids that have already been argmax-ed.
        labels: True class ids, one per sample.

    Returns:
        dict: ``{"accuracy": <accuracy_score of labels vs. predicted classes>}``.
    """
    predicted_classes = np.argmax(preds, axis=1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

# Step 8: Training arguments
# `eval_strategy="epoch"` makes the Trainer evaluate on `eval_dataset` at the
# end of every epoch, so no manual per-epoch loop around `train()` is needed.
# (`eval_strategy` is the current name of the former `evaluation_strategy`.)
training_args = TrainingArguments(
    output_dir="./vit-results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    eval_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,  # log the training loss every 10 optimisation steps
    report_to="none",  # do not send logs to any external tracker
)

# Step 9: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # `processing_class` replaces the deprecated `tokenizer` argument; the
    # image processor is saved alongside the model checkpoints.
    processing_class=image_processor,
    # The Trainer passes a single EvalPrediction object; adapt it to the
    # (logits, labels) signature of `compute_metrics` so that evaluate()
    # reports `eval_accuracy` in addition to `eval_loss`.
    compute_metrics=lambda eval_pred: compute_metrics(
        eval_pred.predictions, eval_pred.label_ids
    ),
)

# Step 10: Train, then evaluate on the validation split.
# trainer.train() already runs every epoch configured in `training_args`, so
# it is called exactly once; calling it inside a per-epoch loop would repeat
# the whole multi-epoch schedule on each iteration.
trainer.train()

# trainer.evaluate() returns only a dict of scalar metrics (no logits), so
# use predict() to get the raw logits and labels needed for the accuracy.
val_output = trainer.predict(val_ds)
acc = compute_metrics(val_output.predictions, val_output.label_ids)
print(f"Validation Accuracy: {acc['accuracy']:.4f}")

# predict() prefixes its metric names with "test_" by default. Only apply the
# float format when a loss was actually reported, so a missing value cannot
# trigger "Unknown format code 'f' for object of type 'str'".
val_loss = val_output.metrics.get("test_loss")
if val_loss is not None:
    print(f"Validation Loss: {val_loss:.4f}")
else:
    print("Validation Loss: N/A")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch 1/5


Step,Training Loss
10,1.7182
20,0.7605
30,0.3953
40,0.1755
50,0.146
60,0.0881
70,0.0715
80,0.0589
90,0.0613
100,0.0523


ValueError: Unknown format code 'f' for object of type 'str'

In [None]:
# Prediction and custom accuracy:
# run the trained model over the validation split, then score the raw
# logits against the true labels with `compute_metrics`.
predictions = trainer.predict(val_ds)
val_logits = predictions.predictions
val_labels = predictions.label_ids
acc = compute_metrics(val_logits, val_labels)
print(f"Validation Accuracy: {acc['accuracy']:.4f}")


Validation Accuracy: 0.9974
