In [None]:
# install what we need
# !pip3 install transformers datasets evaluate
# !pip3 install tensorflow

# Make only CUDA device 0 visible to this process. This is set here, ahead of
# the `import tensorflow` further down, so the setting is in place at import time.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import (AutoImageProcessor, 
                          DefaultDataCollator, 
                          create_optimizer, 
                          TFAutoModelForImageClassification)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
import numpy as np
from PIL import Image

import evaluate

# Log in to the Hugging Face Hub; the last cell of this notebook uploads the
# trained model with `model.push_to_hub`.
notebook_login()

## Load the dataset

In [None]:
# Load only the first 5,000 training examples so that initial experiments are
# quick, before moving on to the whole dataset.
food = load_dataset("food101", split="train[:5000]")

In [None]:
# Hold out 20% of the examples as a test split. The fixed seed makes the split
# identical after every kernel restart, so runs can be compared with each other.
food = food.train_test_split(test_size=0.2, seed=42)

# Look at one example from the training split
food["train"][0]

In [None]:
# As the sample above shows, each row holds a PIL image and a label id.
# The id indexes into the list of food names stored in the
# features["label"].names attribute of food["train"].

labels = food["train"].features["label"].names

# Show the first ten (id, name) pairs
for index in range(min(10, len(labels))):
    print(index, labels[index])

In [None]:
# Build two lookup tables so the model can translate in both directions:
# label name --> id (as a string) and id (as a string) --> label name.
label2id = {name: str(index) for index, name in enumerate(labels)}
id2label = {str(index): name for index, name in enumerate(labels)}

# Spot-check one entry
id2label[str(45)]

## Preprocess

In [None]:
# Load the image processor that belongs to the ViT (Vision Transformer)
# checkpoint we are going to fine-tune; its settings tell us how images must
# be prepared before they are handed to the model.

# Google's ViT base checkpoint, chosen because it is very widely used
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# The processor carries the height and width it works with
for dimension in ("height", "width"):
    print(image_processor.size[dimension])

In [None]:
# Keras preprocessing pipelines applied to every image before it reaches the
# model. Both crop to the image processor's height/width and rescale pixel
# values with x / 127.5 - 1 (0..255 becomes -1..1).

size = (image_processor.size["height"], image_processor.size["width"])
crop_height, crop_width = size

# Training: random crop, rescale, then random flip / rotation / zoom
train_layers = [
    layers.RandomCrop(crop_height, crop_width),
    layers.Rescaling(scale=1.0 / 127.5, offset=-1),
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(factor=0.02),
    layers.RandomZoom(height_factor=0.2, width_factor=0.2),
]
train_data_augmentation = keras.Sequential(train_layers, name="train_data_augmentation")

# Validation: centre crop and rescale only
val_layers = [
    layers.CenterCrop(crop_height, crop_width),
    layers.Rescaling(scale=1.0 / 127.5, offset=-1),
]
val_data_augmentation = keras.Sequential(val_layers, name="val_data_augmentation")

In [None]:
# let's create some functions to apply our transformations to a batch of images rather than one at a time

def image_to_tensor(image: Image.Image) -> tf.Tensor:
    """Convert a PIL image into a TensorFlow tensor with a leading batch axis.

    Args:
        image: The PIL image to convert.

    Returns:
        A tensor holding the image's pixel array with an extra axis of
        size 1 inserted at position 0.
    """
    pixels = np.array(image)

    # The Keras augmentation layers operate on batched inputs, so wrap the
    # single image in a batch of one.
    return tf.expand_dims(tf.convert_to_tensor(pixels), 0)

def _apply_pipeline(example_batch, pipeline):
    """Run every image of a batch through `pipeline` and store the results.

    Each image is converted to RGB, turned into a batched tensor with
    `image_to_tensor`, passed through `pipeline`, and stored channels-first
    under the "pixel_values" key of `example_batch`.

    Args:
        example_batch: Batch mapping whose "image" entry is an iterable of PIL images.
        pipeline: Callable (one of the Keras augmentation pipelines) applied to
            each batched image tensor.

    Returns:
        The same `example_batch` object, with "pixel_values" added.
    """
    pixel_values = []
    for image in example_batch["image"]:
        augmented = pipeline(image_to_tensor(image.convert("RGB")))
        # Remove only the batch axis that image_to_tensor added, then move the
        # channel axis to the front: (height, width, channels) ->
        # (channels, height, width). A bare tf.transpose() would reverse all
        # axes and therefore also swap height and width.
        single_image = tf.squeeze(augmented, axis=0)
        pixel_values.append(tf.transpose(single_image, perm=[2, 0, 1]))

    example_batch["pixel_values"] = pixel_values
    return example_batch


def preprocess_train(example_batch):
    """Apply train_data_augmentation across a batch"""
    return _apply_pipeline(example_batch, train_data_augmentation)


def preprocess_val(example_batch):
    """Apply val_data_augmentation across a batch"""
    return _apply_pipeline(example_batch, val_data_augmentation)

In [None]:
# Register the preprocessing functions with set_transform() so they are applied
# on the fly when examples are read: the training pipeline for the train split
# and the validation pipeline for the test split.
for split_name, transform in (("train", preprocess_train), ("test", preprocess_val)):
    food[split_name].set_transform(transform)

In [None]:
# Default data collator, configured to return TensorFlow tensors; it is passed
# as `collate_fn` when the splits are converted to tf datasets below.
data_collator = DefaultDataCollator(return_tensors="tf")

## Evaluate

In [None]:
# To track progress during training we need a performance metric, plus a
# function that uses it to measure the model's performance on the validation set.

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the module-level `accuracy` metric.

    Args:
        eval_pred: Two-item sequence of per-class scores (one row per
            example) and the reference label ids.

    Returns:
        Whatever `accuracy.compute` returns for the arg-max predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train

In [None]:
# Training hyperparameters
batch_size = 16
num_epochs = 5
# One training step processes one batch, so the total number of steps is
# batches-per-epoch times epochs. Counting examples instead would stretch the
# learning-rate schedule to roughly `batch_size` times the real training length.
num_train_steps = (len(food["train"]) // batch_size) * num_epochs
lr = 3e-5
weight_decay = 0.01

# Create the optimizer together with its learning-rate schedule (no warm-up)
optimizer, lr_schedule = create_optimizer(
    init_lr=lr,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay,
    num_warmup_steps=0,
)

In [None]:
# Instantiate an image-classification model from the chosen checkpoint via the
# Hugging Face auto class, handing it our id <--> label mappings.
model = TFAutoModelForImageClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Convert both splits into batched tf datasets. The settings shared by the two
# conversions are collected once so they cannot drift apart.
tf_dataset_kwargs = dict(
    columns="pixel_values",
    label_cols="label",
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Training data is shuffled
tf_train_dataset = food["train"].to_tf_dataset(shuffle=True, **tf_dataset_kwargs)

# Evaluation gains nothing from shuffling; keep its order deterministic
tf_eval_dataset = food["test"].to_tf_dataset(shuffle=False, **tf_dataset_kwargs)

In [None]:
# Build the loss function (from_logits=True: it is given unnormalised scores)
# and compile the model so it is ready for training.
loss = SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=loss,
)

In [None]:
# Callbacks hook into the training loop while it is running; they can report
# metrics, send the model to the hub or enforce early stopping.
# This one runs compute_metrics on the evaluation dataset.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)
# push_to_hub_callback = PushToHubCallback(
#     output_dir="food_classifier",
#     tokenizer=image_processor,
#     save_strategy="no",
# )
# Keras documents `callbacks` as a list of callback instances, so wrap the
# single callback in a list (this also makes adding further callbacks trivial).
callbacks = [metric_callback]

In [None]:
# Train for num_epochs epochs, validating on the evaluation dataset and running
# the callbacks defined above.
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=callbacks)

In [None]:
# Upload the trained model to the Hugging Face Hub under the name "food-classifier"
model.push_to_hub("food-classifier")