In [1]:
# Identifier of the pretrained checkpoint to fine-tune. It is reused further down to
# load the feature extractor and the model, and its last path component is used to
# name the output directory.

model_id = "google/vit-base-patch16-224"
# Alternative checkpoints (disabled):
#model_id = 'microsoft/swin-tiny-patch4-window7-224'
#model_id = 'facebook/deit-base-patch16-224'

Now we load the ViT feature extractor to process the image into a tensor.

In [2]:
from transformers import AutoFeatureExtractor, ViTFeatureExtractor
# Load the preprocessing settings that belong to the chosen checkpoint. Its `size`,
# `image_mean` and `image_std` attributes are read when the torchvision transforms
# are built in the next cell.
# Disabled alternative using the Auto class instead of the ViT-specific one:
#feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)

This feature extractor will resize every image to the resolution that the model expects and normalize channels. 

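We define two preprocessing functions, one for training and one for validation. The training pipeline applies a random resized crop, while the validation pipeline resizes and center-crops; both then convert the image to a tensor and normalize it.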

In [3]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# `feature_extractor.size` is an int (or an (h, w) sequence) in older transformers
# releases and a dict in newer ones ({"height", "width"} or {"shortest_edge"}).
# torchvision only understands the int / sequence forms, so normalise it once here.
_fe_size = feature_extractor.size
if isinstance(_fe_size, dict):
    if "height" in _fe_size and "width" in _fe_size:
        # Fixed output resolution: resize and crop straight to (height, width).
        resize_size = crop_size = (_fe_size["height"], _fe_size["width"])
    elif "shortest_edge" in _fe_size:
        # Aspect-preserving resize of the shorter side, then a square crop.
        resize_size = _fe_size["shortest_edge"]
        crop_size = (resize_size, resize_size)
    else:
        raise ValueError(f"Unsupported feature_extractor.size: {_fe_size!r}")
else:
    # Legacy int / sequence form: pass through unchanged (original behaviour).
    resize_size = crop_size = _fe_size

# Channel-wise normalisation with the statistics stored alongside the checkpoint.
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)

# Training: random crop + rescale to the model resolution as light augmentation.
train_transforms = Compose(
    [
        RandomResizedCrop(crop_size),
        ToTensor(),
        normalize,
    ]
)

# Validation / test: deterministic resize followed by a center crop.
val_transforms = Compose(
    [
        Resize(resize_size),
        CenterCrop(crop_size),
        ToTensor(),
        normalize,
    ]
)

def preprocess_train(example_batch):
    """Add a "pixel_values" entry holding the training-transformed images.

    Every image in ``example_batch["img"]`` is converted to RGB and passed
    through ``train_transforms``. The batch dict is updated in place and returned.
    """
    transformed = []
    for image in example_batch["img"]:
        transformed.append(train_transforms(image.convert("RGB")))
    example_batch["pixel_values"] = transformed
    return example_batch

def preprocess_val(example_batch):
    """Add a "pixel_values" entry holding the validation-transformed images.

    Every image in ``example_batch["img"]`` is converted to RGB and passed
    through ``val_transforms``. The batch dict is updated in place and returned.
    """
    rgb_images = (image.convert("RGB") for image in example_batch["img"])
    example_batch["pixel_values"] = list(map(val_transforms, rgb_images))
    return example_batch

Next, we can preprocess our dataset by applying these functions.

In [4]:
#Load data
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
# Load the previously saved DatasetDict from ./data_dict (relative to the notebook's
# working directory) and display its splits and sizes.
ds = load_from_disk('./data_dict')
ds

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 4896
    })
    val: Dataset({
        features: ['img', 'label'],
        num_rows: 545
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 961
    })
})

In [5]:
# Pick out the three splits that already exist in the DatasetDict
# (nothing is re-split here).
train_ds = ds['train']
val_ds = ds['val']
test_ds = ds['test']

In [6]:
# Class names taken from the "label" feature of the training split.
# NOTE(review): the printed list contains '.ipynb_checkpoints', which looks like a stray
# Jupyter checkpoint folder that was picked up as a class when the dataset was built.
# Confirm and rebuild the dataset without it; as it stands the classifier head is
# created with 4 outputs, one of them for this spurious class.
labels = train_ds.features["label"].names
print(labels)

['iron', '.ipynb_checkpoints', 'gamma', 'proton']


In [7]:
# Attach the preprocessing functions to the splits: training gets the random-crop
# pipeline, validation the deterministic resize + center-crop one. As the output of
# the next cell shows, indexing a split afterwards yields an extra "pixel_values" entry.
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)



In [8]:
# Sanity check: fetch one training example and confirm it now carries "pixel_values".
train_ds[0]

{'img': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=288x288>,
 'label': 2,
 'pixel_values': tensor([[[0.9765, 0.9765, 0.9765,  ..., 1.0000, 1.0000, 1.0000],
          [0.9765, 0.9765, 0.9765,  ..., 1.0000, 1.0000, 1.0000],
          [0.9765, 0.9686, 0.9765,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
    

Now that our data is ready, we can download the pretrained model and fine-tune it. We use the model class ViTForImageClassification.

In [9]:
# Two lookup tables handed to the model so predictions can be reported by name:
#   id2label: class index (as a string) -> class name
#   label2id: class name -> class index (as a string)

id2label = {str(index): name for index, name in enumerate(labels)}
label2id = {name: index for index, name in id2label.items()}

In [10]:
from transformers import ViTForImageClassification, TrainingArguments, Trainer

# Start from the pretrained checkpoint but build the classification head for our own
# label set. The checkpoint's head has a different number of outputs (see the warning
# printed below), so `ignore_mismatched_sizes=True` lets the mismatched classifier
# weights be newly initialised instead of failing the load.
model = ViTForImageClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

2022-07-09 14:19:39.915207: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-09 14:19:39.915239: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The warning is telling us we are throwing away some weights (the weights and bias of the classifier layer) and randomly initializing some other (the weights and bias of a new classifier layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do.

To instantiate a Trainer, we will need to define the training configuration and the evaluation metric. The most important is the TrainingArguments, which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model.

Most of the training arguments are pretty self-explanatory, but one that is quite important here is remove_unused_columns=False. When this option is True (the default), the Trainer drops any dataset columns that are not used by the model's call function, which is usually ideal because it makes it easier to unpack inputs into the model's call function. In our case, however, we need to keep the otherwise unused columns ('img' in particular), because the transform uses them to create 'pixel_values'.

In [11]:
# --- Hyper-parameters ---------------------------------------------------------
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_id.rsplit("/", 1)[-1]

batch_size = 32                  # per-device batch size, used for both train and eval
gradient_accumulation_steps = 4  # 32 * 4 = 128 samples per optimizer step on one device
learning_rate = 5e-5
epochs = 3
warmup_ratio = 0.1
logging_steps = 10

# --- Trainer configuration ----------------------------------------------------
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ds",
    # Keep the raw "img" column: the on-the-fly transform needs it to build "pixel_values".
    remove_unused_columns=False,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation schedule.
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_steps=logging_steps,
    # push_to_hub=True,
)

Next, we load the accuracy metric, which we'll use to evaluate our model both during and after training, and define a function that computes it from the model's predictions. The only preprocessing we have to do is to take the argmax of our predicted logits:

In [12]:
import numpy as np

from datasets import load_metric

# Accuracy metric object; compute_metrics below calls its `compute` method.
# NOTE(review): `datasets.load_metric` has been deprecated upstream in favour of the
# separate `evaluate` package; confirm against the installed `datasets` version.
metric = load_metric("accuracy")

# compute_metrics receives a named tuple with two NumPy arrays:
#   predictions - the raw logits produced by the model
#   label_ids   - the ground-truth class indices
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class against the reference labels."""
    logits = eval_pred.predictions
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(references=eval_pred.label_ids, predictions=predicted_classes)

We also define a collate_fn, which will be used to batch examples together. Each batch consists of 2 keys, namely pixel_values and labels.

In [13]:
import torch

def collate_fn(examples):
    """Merge a list of per-example dicts into a single batch dict.

    The "pixel_values" tensors are stacked along a new leading dimension and the
    "label" values are gathered into one tensor. The result uses the keys
    "pixel_values" and "labels".
    """
    batch = {}
    batch["pixel_values"] = torch.stack([item["pixel_values"] for item in examples], dim=0)
    batch["labels"] = torch.tensor([item["label"] for item in examples])
    return batch

Then we just need to pass all of this along with our datasets to the Trainer:

In [14]:
# Wire model, configuration, data and metric together. The feature extractor goes in
# through the `tokenizer` argument; the training log below shows it being saved next to
# every checkpoint ("Feature extractor saved in ...").
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
)

  if not hasattr(tensorboard, '__version__') or LooseVersion(tensorboard.__version__) < LooseVersion('1.15'):


Now we can finetune our model by calling the train method:

In [15]:
# Run fine-tuning; the returned object exposes the final training metrics via `.metrics`.
train_results = trainer.train()
# rest is optional but nice to have
# Save the final model, then print and save the training metrics and the trainer state.
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** Running training *****
  Num examples = 4896
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 114


Epoch,Training Loss,Validation Loss,Accuracy
0,0.0975,0.028156,0.994495
1,0.0479,0.024692,0.988991
2,0.0249,0.018753,0.992661


***** Running Evaluation *****
  Num examples = 545
  Batch size = 32
Saving model checkpoint to vit-base-patch16-224-finetuned-ds/checkpoint-38
Configuration saved in vit-base-patch16-224-finetuned-ds/checkpoint-38/config.json
Model weights saved in vit-base-patch16-224-finetuned-ds/checkpoint-38/pytorch_model.bin
Feature extractor saved in vit-base-patch16-224-finetuned-ds/checkpoint-38/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 545
  Batch size = 32
Saving model checkpoint to vit-base-patch16-224-finetuned-ds/checkpoint-76
Configuration saved in vit-base-patch16-224-finetuned-ds/checkpoint-76/config.json
Model weights saved in vit-base-patch16-224-finetuned-ds/checkpoint-76/pytorch_model.bin
Feature extractor saved in vit-base-patch16-224-finetuned-ds/checkpoint-76/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 545
  Batch size = 32
Saving model checkpoint to vit-base-patch16-224-finetuned-ds/checkpoint-114
Configuration saved 

***** train metrics *****
  epoch                    =         2.99
  total_flos               = 1057743055GF
  train_loss               =       0.1393
  train_runtime            =   2:56:29.85
  train_samples_per_second =        1.387
  train_steps_per_second   =        0.011


In [16]:

# Evaluate on the validation split (the Trainer's eval_dataset).
# The accuracy reported below (0.9945) equals the first epoch's validation accuracy in
# the training table, which is consistent with load_best_model_at_end=True having
# restored that checkpoint.
metrics = trainer.evaluate()
# some nice to haves:
# print the metrics and save them under the "eval" name
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 545
  Batch size = 32


***** eval metrics *****
  epoch                   =       2.99
  eval_accuracy           =     0.9945
  eval_loss               =     0.0282
  eval_runtime            = 0:01:43.56
  eval_samples_per_second =      5.262
  eval_steps_per_second   =      0.174


In [18]:

# The test split gets the same deterministic preprocessing as the validation split.
test_ds.set_transform(preprocess_val)

In [19]:
# Run inference on the held-out test split. `outputs.predictions` holds one row of
# logits per example (4 columns in the output shown further down), so the arg-max over
# axis 1 is the predicted class index.
outputs = trainer.predict(test_ds)
y_pred = outputs.predictions.argmax(1)

***** Running Prediction *****
  Num examples = 961
  Batch size = 32


In [20]:
# Test-set accuracy, computed with the same metric function that was given to the Trainer.
compute_metrics(outputs)

{'accuracy': 0.9906347554630593}

In [21]:
# Inspect the raw PredictionOutput (logits and label ids).
# NOTE(review): this dumps very large arrays into the notebook; consider displaying
# only shapes or a short slice instead.
outputs

PredictionOutput(predictions=array([[-2.7944756 , -2.1404853 ,  6.234772  , -0.2152647 ],
       [-2.6935065 , -1.9954867 ,  6.3344984 , -0.38910994],
       [ 4.3002853 , -2.8543997 , -1.7055428 ,  1.0094544 ],
       ...,
       [-1.7866147 , -2.68148   ,  4.5279064 ,  0.73286015],
       [ 6.003376  , -1.6580932 , -2.591699  , -0.7850289 ],
       [-1.0834728 , -2.7573643 , -1.1408011 ,  4.8466167 ]],
      dtype=float32), label_ids=array([2, 2, 0, 0, 2, 0, 3, 2, 2, 3, 0, 0, 0, 3, 0, 0, 2, 0, 2, 2, 3, 3,
       2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 0, 3, 0, 0, 2, 0, 0, 2, 3, 2,
       2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 0, 3, 0, 3, 3, 2, 2, 2,
       2, 0, 3, 2, 2, 3, 3, 3, 2, 2, 0, 0, 3, 2, 0, 2, 3, 2, 0, 2, 2, 3,
       0, 2, 3, 2, 2, 3, 0, 3, 2, 2, 3, 3, 0, 2, 0, 0, 2, 2, 0, 3, 0, 2,
       2, 2, 2, 3, 2, 2, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 0, 2, 2, 2, 3, 2,
       2, 0, 0, 0, 0, 2, 3, 3, 2, 3, 2, 2, 2, 0, 2, 2, 0, 0, 0, 3, 2, 0,
       0, 0, 3, 2, 3, 3, 2, 2, 3, 2, 0, 3, 0, 0, 

In [23]:
# Predicted class indices for the test split.
y_pred

array([2, 2, 0, 0, 2, 0, 3, 2, 2, 3, 0, 0, 0, 3, 0, 0, 2, 0, 2, 2, 3, 3,
       2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 0, 3, 0, 0, 2, 0, 0, 2, 3, 2,
       2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 0, 3, 0, 3, 3, 2, 2, 2,
       2, 0, 3, 2, 2, 3, 3, 3, 2, 2, 0, 0, 3, 2, 0, 2, 3, 2, 0, 2, 2, 3,
       0, 2, 3, 2, 2, 3, 0, 3, 2, 2, 3, 3, 0, 2, 0, 0, 2, 2, 0, 3, 0, 2,
       2, 2, 2, 3, 2, 2, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 0, 2, 2, 2, 3, 2,
       2, 0, 0, 0, 0, 2, 3, 3, 2, 3, 2, 2, 2, 0, 2, 2, 0, 0, 0, 3, 2, 0,
       0, 0, 3, 2, 3, 3, 2, 2, 3, 2, 0, 3, 0, 0, 2, 0, 3, 3, 3, 3, 2, 0,
       3, 0, 0, 2, 0, 2, 0, 0, 2, 3, 3, 0, 3, 0, 3, 3, 2, 2, 0, 3, 2, 2,
       2, 3, 3, 3, 0, 2, 2, 3, 2, 2, 0, 0, 3, 0, 3, 0, 0, 3, 0, 3, 3, 2,
       0, 0, 0, 2, 0, 3, 2, 0, 3, 2, 2, 2, 2, 0, 2, 3, 0, 3, 0, 3, 3, 2,
       3, 0, 2, 3, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 3, 2, 2, 3, 2,
       2, 3, 3, 2, 3, 2, 2, 2, 0, 2, 0, 0, 3, 0, 0, 0, 3, 2, 2, 0, 3, 0,
       2, 2, 3, 0, 0, 2, 0, 3, 2, 0, 3, 0, 2, 0, 2,

In [None]:
from transformers import ViTForImageClassification, TrainingArguments, Trainer
# Reload a fine-tuned model from a local directory (local_files_only=True).
# NOTE(review): this cell was never executed and points at
# 'deit-base-patch16-224-finetuned-ds', whereas the run above saved its checkpoints to
# 'vit-base-patch16-224-finetuned-ds'. Confirm which directory is intended. It also
# rebinds `model`, replacing the model object created earlier in the notebook.
model = ViTForImageClassification.from_pretrained('deit-base-patch16-224-finetuned-ds', local_files_only=True)
