# Import the libraries

In [53]:
# Importing libraries
from tqdm import tqdm
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoModelForImageClassification,
    ViTFeatureExtractor,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
import evaluate
import numpy as np
import warnings
import os

from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor

# Set warnings to ignore to keep output clean
warnings.filterwarnings('ignore')

In [None]:
# We will be looking at the two following datasets: MNIST and Oxford IIIT Pets 
# Other potential datasets: FGVC Aircraft, caltech-101, food101, flowers-102
mnist_dataset = load_dataset("mnist")
oxford_dataset, num_oxford_classes = load_dataset("visual-layer/oxford-iiit-pet-vl-enriched")

num_mnist_classes, num_oxford_classes = 10, 37

# Choosing dataset which we will train on and creating val/train split
dataset = oxford_dataset
dataset = dataset["train"].train_test_split(test_size=0.15, shuffle=True, seed=1)
train_dataset = dataset['train']
val_dataset = dataset['test']

num_classes = num_oxford_classes

In [29]:
# Preprocessing dataset to be compatible with ViT
transform = Compose([
    Resize((224, 224)),
    ToTensor()
])

# Combined function to resize, convert to RGB, and then to tensor
def preprocess_images(batch):
    batch['pixel_values'] = [transform(image.convert("RGB")) for image in batch['image']]
    del batch['image']
    return batch

# Apply resizing
train_dataset = train_dataset.map(preprocess_images, batched=True)
val_dataset = val_dataset.map(preprocess_images, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3128/3128 [01:14<00:00, 41.97 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 552/552 [00:11<00:00, 48.91 examples/s]


In [46]:
# Preprocessing for the labels
label_encoder = LabelEncoder()
label_column_name = 'label_breed' # To use when training on oxford pet dataset

def label_preprocessing(dataset):
    # Fit the encoder on the string labels and transform them to integer labels
    label_encoder.fit(dataset[label_column_name])
    encoded_labels = label_encoder.transform(dataset[label_column_name])

    # Add the encoded labels as a new column in the dataset
    return dataset.add_column('label', encoded_labels)

# Apply preprocessing
train_dataset = label_preprocessing(train_dataset)
val_dataset = label_preprocessing(val_dataset)

In [66]:
# Whether to use LoRA or not
apply_LoRA = False

# LoRA hyper parameters
LoRA_r = 64
LORA_a = 8
LORA_dropout = 0.1

layers = ["query", "key", "value"]
target_modules = [f"vit.encoder.layer.{i}.attention.attention.{layer}" for i in range(0, 12) for layer in layers]

# Effective batch size = per_device_batch_size * gradient_accumulation_steps
per_device_batch_size = 8
gradient_accumulation_steps = 2

# Set up LoRA configuration
lora_config = LoraConfig(
    r=LoRA_r,
    lora_alpha=LORA_a,
    target_modules=target_modules,
    lora_dropout=LORA_dropout,
    use_rslora=True,
)

# Load model and tokenizer
model_name = "google/vit-base-patch16-224" 
model = AutoModelForImageClassification.from_pretrained(model_name, num_labels = num_classes, ignore_mismatched_sizes=True)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name) if model_name else None

# Apply LoRA to the model
if apply_LoRA:
    model = get_peft_model(model, lora_config)

# Move model to GPU
model = model.to("cuda")

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([37]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([37, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
# Define accuracy metric
accuracy = evaluate.load("accuracy")

# Define the compute_metrics function to calculate accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

run_name = "ViT_pet"
training_args = TrainingArguments(
    output_dir=f"results/{run_name}",
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_steps=500,
    logging_steps=20,
    eval_steps=20,
    save_steps=50,
    save_total_limit=1,
    evaluation_strategy="steps"
)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Accuracy
20,3.4023,3.097435,0.289855
40,2.652,2.384443,0.538043
60,2.2077,1.805555,0.724638
80,1.5291,1.349178,0.813406
100,1.2058,1.025223,0.847826
120,0.983,0.812894,0.880435
140,0.784,0.654214,0.907609
160,0.5783,0.545256,0.931159
180,0.5431,0.49147,0.90942
200,0.4375,0.445341,0.916667


TrainOutput(global_step=500, training_loss=0.6501505739688873, metrics={'train_runtime': 1738.1034, 'train_samples_per_second': 4.603, 'train_steps_per_second': 0.288, 'total_flos': 6.20130387050496e+17, 'train_loss': 0.6501505739688873, 'epoch': 2.557544757033248})