# Fine-tune ViT on the Food-101 dataset to classify a food item in an image(PyTorch)

# Setup

In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
# Login to HuggingFace
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Install all the necessary libraries
!pip install transformers datasets evaluate accelerate pillow torchvision scikit-learn

## Load Food-101 Dataset

In [24]:
from datasets import load_dataset

food = load_dataset("food101", split="train[:5000]")
food

Dataset({
    features: ['image', 'label'],
    num_rows: 5000
})

In [5]:
# Split the dataset into a train and test set
food = food.train_test_split(test_size=0.2)
food

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1000
    })
})

In [6]:
# Visualize an example from the dataset
food["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512>,
 'label': 81}

**Each example in the dataset has two fields:**

- `image:` a PIL image of the food item
- `label:` the label class of the food item

In [7]:
# Create a dictionart that maps the label name to an integer and vice-versa
labels = food["train"].features["label"].names
label2id, id2label = dict(), dict()

for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# Convert the label id to a label name
id2label[str(45)]

'frozen_yogurt'

## Data Preprocessing

In [8]:
# Load the ViT image processor to process the image into a tensor
from transformers import ViTImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = ViTImageProcessor.from_pretrained(checkpoint)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [9]:
# Apply random crop, resize and normalize image transformations to the images
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)

_transforms = Compose(
    [
        RandomResizedCrop(size),
        ToTensor(),
        normalize,
    ]
)

In [10]:
# Define preprocessing function to apply transforms and return inputs of the model
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

# Apply over the entire dataset
food = food.with_transform(transforms)

In [11]:
# Create a batch of examples using Data Collator
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

## Evaluation Metrics

In [12]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
# Define a function to calculate the accuracy
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(
        predictions=predictions,
        references=labels,
    )

# Training the Model

In [14]:
# Load ViT classifier model
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
)

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Define Training Hyperparameters
training_args = TrainingArguments(
    output_dir="vit-finetuned-food101",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)

In [16]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=food["train"],
    eval_dataset=food["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

In [17]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,2.7649,2.573282,0.831
2,1.6461,1.626154,0.896


TrainOutput(global_step=186, training_loss=2.4808714825619935, metrics={'train_runtime': 503.1896, 'train_samples_per_second': 23.848, 'train_steps_per_second': 0.37, 'total_flos': 9.232831524962304e+17, 'train_loss': 2.4808714825619935, 'epoch': 2.976})

In [18]:
# Share the model to Hugging Face Hub
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/ashaduzzaman/vit-finetuned-food101/commit/7b0861ac3b658b09818dc46b8a5af26d8e9fbe37', commit_message='End of training', commit_description='', oid='7b0861ac3b658b09818dc46b8a5af26d8e9fbe37', pr_url=None, pr_revision=None, pr_num=None)

## Model Inference

In [19]:
# Load an image to run inference on
ds = load_dataset("food101", split="validation[:10]")
image = ds["image"][0]

### Inference with pipeline() Api

In [20]:
from transformers import pipeline

classifier = pipeline(
    "image-classification",
    model="ashaduzzaman/vit-finetuned-food101"
)

classifier(image)

config.json:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/344M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'beignets', 'score': 0.31782418489456177},
 {'label': 'bruschetta', 'score': 0.018193243071436882},
 {'label': 'prime_rib', 'score': 0.016965407878160477},
 {'label': 'pork_chop', 'score': 0.011782429181039333},
 {'label': 'chicken_wings', 'score': 0.011752397753298283}]

### Inference with Pytorch manually

In [21]:
# Load an image processor to preprocess the image and return the input as PyTorch tensors
from transformers import AutoImageProcessor
import torch

image_processor = AutoImageProcessor.from_pretrained("ashaduzzaman/vit-finetuned-food101")
inputs = image_processor(images=image, return_tensors="pt")

In [22]:
# Pass inputs to the model and return the logits
from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained("ashaduzzaman/vit-finetuned-food101")
with torch.no_grad():
    logits = model(**inputs).logits

In [23]:
# Get the predicted label with the highest probability and convert it to a label
predicted_label = logits.argmax(-1).item()
model.config.id2label[predicted_label]

'beignets'