In [15]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
from huggingface_hub import notebook_login
notebook_login()

# Load Food-101 dataset

In [37]:
from datasets import load_dataset

food = load_dataset('food101', split='train[:5000]')

In [38]:
food

Dataset({
    features: ['image', 'label'],
    num_rows: 5000
})

In [39]:
food = food.train_test_split(test_size=0.2)

In [40]:
# take a look at an example
food['train'][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
 'label': 53}

In [41]:
labels = food['train'].features['label'].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [42]:
id2label[str(79)]

'prime_rib'

# Preprocess

The next step is to load a ViT image processor to process the image into a tensor:

In [43]:
# Using pytorch
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
processor = AutoImageProcessor.from_pretrained(checkpoint)

In [44]:
from torchvision.transforms import RandomResizedCrop, Compose, ToTensor, Normalize

normalize = Normalize(mean=processor.image_mean, std=processor.image_std)
size = (
    processor.size['shortest_edge']
    if 'shortest_edge' in processor.size
    else processor.size['height'], processor.size['width']
)

transform = Compose([RandomResizedCrop(size=size), ToTensor(), normalize])


In [45]:
def transforms(examples):
    examples['pixel_values'] = [transform(img.convert('RGB')) for img in examples['image']]
    del examples['image']
    return examples


In [46]:
# def transforms(examples):
#     examples["pixel_values"] = [
#         transform(image=np.array(image))["image"] for image in examples["image"]
#     ]

#     return examples

In [47]:
food = food.with_transform(transforms)

In [48]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

## Tensorflow

To avoid overfitting and to make the model more robust, add some data augmentation to the training part of the dataset. Here we use Keras preprocessing layers to define the transformations for the training data (includes data augmentation), and transformations for the validation data (only center cropping, resizing and normalizing). You can use tf.imageor any other library you prefer.


In [49]:
from tensorflow import keras
from tensorflow.keras import layers

size = (processor.size['height'], processor.size['width'])

train_data_augmentation = keras.Sequential(
    [
        layers.RandomCrop(size[0], size[1]),
        layers.Rescaling(scale=1.0 / 127.5, offset=-1),
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(factor=0.02),
        layers.RandomZoom(
            height_factor=0.2, width_factor=0.2, fill_mode="constant"
        ),
    ],
    name = "train_data_augmentation",
)

val_data_augmentation = keras.Sequential(
    [
        layers.CenterCrop(size[0], size[1]),
        layers.Rescaling(scale=1.0 / 127.5, offset=-1),
    ],
    name = 'val_data_augmentation',
)

In [50]:
import numpy as np
import tensorflow as tf
from PIL import Image

def convert_to_tf_tensor(image: Image):
    np_image = np.array(image)
    tf_image = tf.convert_to_tensor(np_image)
    return tf.expand_dims(tf_image, 0)

def preprocess_train(example_batch):
    images = [ train_data_augmentation(convert_to_tf_tensor(image.convert('RGB'))) for image in example_batch['image'] ]
    example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images]
    return example_batch

def preprocess_val(example_batch):
    images = [ val_data_augmentation(convert_to_tf_tensor(image.convert('RGB'))) for image in example_batch['image'] ]
    example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images]
    return example_batch

In [51]:
food["train"].set_transform(preprocess_train)
food["test"].set_transform(preprocess_val)

In [52]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors='tf')

In [53]:
import evaluate

accuracy = evaluate.load('accuracy')

In [54]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Pytorch

train finetuning model


In [55]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    seed=42,
    log_level='error',
    # evaluation_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=food["train"],
    eval_dataset=food["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


## Tensorflow 

fine-tuning a model with keras
To fine-tune a model in TensorFlow, follow these steps:

1. Define the training hyperparameters, and set up an optimizer and a learning rate schedule.
2. Instantiate a pre-trained model.
3. Convert a 🤗 Dataset to a tf.data.Dataset.
4. Compile your model.
5. Add callbacks and use the fit() method to run the training.
6. Upload your model to 🤗 Hub to share with the community.

Start by defining the hyperparameters, optimizer and learning rate schedule:

In [57]:
from transformers import create_optimizer

batch_size = 16
num_epochs = 5
num_train_steps = len(food['train']) * num_epochs
learning_rate = 3e-5
weight_decay_rate = 0.01

optimizer, lr_schedule = create_optimizer(
    init_lr = learning_rate,
    num_train_steps = num_train_steps,
    weight_decay_rate = weight_decay_rate,
    num_warmup_steps = 0
)

In [58]:
# test on terminal to see thransformers 
# python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"

Then, load ViT with TFAutoModelForImageClassification along with the label mappings:

In [59]:
from transformers import TFAutoModelForImageClassification

model = TFAutoModelForImageClassification.from_pretrained(
    checkpoint,
    id2label = id2label,
    label2id = label2id,
)

In [60]:
tf_train_dataset = food['train'].to_tf_dataset(
    columns='pixel_values', 
             label_cols='label',
              shuffle=True,
               batch_size=batch_size,
                collate_fn=data_collator, 
)

tf_eval_dataset = food['test'].to_tf_dataset(
    columns='pixel_values', 
             label_cols='label',
              shuffle=True,
               batch_size=batch_size,
                collate_fn=data_collator, 
)


In [61]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [63]:
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)
push_to_hub_callback = PushToHubCallback(
    output_dir="food_classifier",
    tokenizer=processor,
    save_strategy="no",
)

callbacks = [metric_callback, push_to_hub_callback]


Cloning https://huggingface.co/damiacc2/food_classifier into local empty directory.


In [65]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Upload file tf_model.h5: 336MB [00:27, 20.2MB/s]                            To https://huggingface.co/damiacc2/food_classifier
   ecc7655..f787658  main -> main

Upload file tf_model.h5: 100%|██████████| 328M/328M [00:28<00:00, 12.3MB/s]


<keras.src.callbacks.History at 0x7f93b0539a50>

## Inference 

Load an image you’d like to run inference on:

In [66]:
ds = load_dataset('food101', split='validation[:10]')
image = ds['image'][0]

In [67]:
from transformers import pipeline

classifier = pipeline('image-classification', model='food_classifier', tokenizer=processor)
classifier(image)

[{'score': 0.9651563167572021, 'label': 'beignets'},
 {'score': 0.0036974535323679447, 'label': 'chicken_wings'},
 {'score': 0.0035024266690015793, 'label': 'prime_rib'},
 {'score': 0.0022872083354741335, 'label': 'pork_chop'},
 {'score': 0.002025170950219035, 'label': 'hamburger'}]

In [68]:
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained('food_classifier')
inputs = image_processor(image, return_tensors='tf')

In [69]:
from transformers import TFAutoModelForImageClassification

model = TFAutoModelForImageClassification.from_pretrained('food_classifier')
logits = model(**inputs).logits

In [None]:
# predicted_class_id = int(tf.argmax(logits[0]))
# predicted_label = image_processor.labels[predicted_class_id]
# print("Predicted label:", predicted_label)


In [71]:
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
model.config.id2label[predicted_class_id]

'beignets'