In [24]:
# First, we choose the pretrained checkpoint to fine-tune.
# The commented-out ids are alternative checkpoints; enable exactly one.
# NOTE(review): later cells hard-code ViTFeatureExtractor and
# ViTForImageClassification, so switching checkpoint means switching those too.

model_id = "google/vit-base-patch16-224"
#model_id = 'microsoft/swin-tiny-patch4-window7-224'
#model_id = 'facebook/deit-base-patch16-224'
#model_id = 'microsoft/resnet-50'


Now we load the feature extractor to process the image into a tensor.

In [25]:
from transformers import AutoFeatureExtractor, ViTFeatureExtractor, DeiTFeatureExtractor
# Load the preprocessing configuration that belongs to the chosen checkpoint.
# Its image_mean, image_std and size attributes are reused by the torchvision
# transforms defined further down. The commented lines are alternative loaders.
#feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)
#feature_extractor = DeiTFeatureExtractor.from_pretrained(model_id)

loading feature extractor configuration file https://huggingface.co/google/vit-base-patch16-224/resolve/main/preprocessor_config.json from cache at /home/studio-lab-user/.cache/huggingface/transformers/caa0e8430c8ba68a0586cef2a661b39ea04de291f092f7c4277fc2d97f10cdb9.c322cbf30b69973d5aae6c0866f5cba198b5fe51a2fe259d2a506827ec6274bc
Feature extractor ViTFeatureExtractor {
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}



This feature extractor will resize every image to the resolution that the model expects and normalize channels. 

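We define two preprocessing functions, one for training and one for validation. The training pipeline applies a random resized crop as augmentation, while the validation pipeline resizes and center-crops; both then convert the image to a tensor and normalize it.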

In [26]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# Channel normalisation using the statistics stored in the feature extractor.
normalize = Normalize(
    mean=feature_extractor.image_mean,
    std=feature_extractor.image_std,
)

# Training pipeline: random resized crop (augmentation) -> tensor -> normalise.
train_transforms = Compose([
    RandomResizedCrop(feature_extractor.size),
    ToTensor(),
    normalize,
])

# Validation/test pipeline: deterministic resize + centre crop -> tensor -> normalise.
val_transforms = Compose([
    Resize(feature_extractor.size),
    CenterCrop(feature_extractor.size),
    ToTensor(),
    normalize,
])

def preprocess_train(example_batch):
    """Attach augmented training tensors to a batch.

    Each image under ``example_batch["img"]`` is converted to RGB and run
    through ``train_transforms``. The resulting list is stored under
    ``"pixel_values"``; the batch is modified in place and returned.
    """
    pixel_values = []
    for image in example_batch["img"]:
        pixel_values.append(train_transforms(image.convert("RGB")))
    example_batch["pixel_values"] = pixel_values
    return example_batch

def preprocess_val(example_batch):
    """Attach deterministic evaluation tensors to a batch.

    Each image under ``example_batch["img"]`` is converted to RGB and run
    through ``val_transforms``. The resulting list is stored under
    ``"pixel_values"``; the batch is modified in place and returned.
    """
    def _to_pixel_values(image):
        return val_transforms(image.convert("RGB"))

    example_batch["pixel_values"] = list(map(_to_pixel_values, example_batch["img"]))
    return example_batch

Next, we can preprocess our dataset by applying these functions.

In [27]:
# Load the dataset from a local directory (path is relative to the notebook's
# working directory).
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
ds = load_from_disk('./fillfactor_dict_0.7-0.02')
# Display the DatasetDict to check which splits exist and how large they are.
ds

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 2050
    })
    val: Dataset({
        features: ['img', 'label'],
        num_rows: 228
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 402
    })
})

In [28]:
# The dataset already contains train/val/test splits; pull each one out by name.
train_ds = ds['train']
val_ds = ds['val']
test_ds = ds['test']

In [29]:
# Class names of the 'label' feature; the list position is used as the class id
# when the label <-> id mappings are built below.
labels = train_ds.features["label"].names
print(labels)

['proton', 'gamma']


In [30]:
# Register the preprocessing functions on the splits: training gets the
# augmenting pipeline, validation the deterministic one. The transform is
# applied when rows are accessed, so the stored images stay untouched.
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

In [31]:
# Sanity check: the first training example should now carry a 'pixel_values'
# tensor next to the original 'img' and 'label' fields.
train_ds[0]

{'img': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=288x288>,
 'label': 1,
 'pixel_values': tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]]])}

Now that our data is ready, we can download the pretrained model and fine-tune it. We use the `ViTForImageClassification` model.

In [32]:
# Build the two lookup tables that are stored in the model config:
#   label2id: class name  -> integer class index
#   id2label: integer class index -> class name
# The mapping lets the model recover the label name from the label number.
# Indices are kept as ints rather than strings, matching the Dict[str, int] /
# Dict[int, str] types documented for the config, so config.label2id[name]
# can be used directly as a class index.
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

In [33]:
from transformers import ViTForImageClassification, SwinForImageClassification,ResNetForImageClassification, TrainingArguments, Trainer, SegformerForImageClassification

# Load the pretrained checkpoint and attach our label mappings to its config.
model = ViTForImageClassification.from_pretrained(model_id,
                                                 label2id=label2id,
                                                 id2label=id2label,
                                                 ignore_mismatched_sizes = True, # the checkpoint is already fine-tuned with its own classifier head; allow that head to be dropped and a new one initialised for our labels
                                                 )

#model = SwinForImageClassification.from_pretrained(model_id,
#                                                 label2id=label2id,
#                                                 id2label=id2label,
#                                                 ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
#) 

#model = ResNetForImageClassification.from_pretrained(model_id,
#                                                 label2id=label2id,
#                                                 id2label=id2label,
#                                                 ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
#)




loading configuration file https://huggingface.co/google/vit-base-patch16-224/resolve/main/config.json from cache at /home/studio-lab-user/.cache/huggingface/transformers/6b03b61d64598274e01717c40e8909f9e70531219a281e8163bd5b3af5c92d1a.c41e6c561c79e9b15e74a5cc284a31cba59cb1a9e209933c1a04a46ba2e20e44
Model config ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224",
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "proton",
    "1": "gamma"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "gamma": "1",
    "proton": "0"
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.20.1"
}

loading weights file https://

The warning is telling us we are throwing away some weights (the weights and bias of the classifier layer) and randomly initializing some other (the weights and bias of a new classifier layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do.

To instantiate a Trainer, we will need to define the training configuration and the evaluation metric. The most important is the TrainingArguments, which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model.

Most of the training arguments are pretty self-explanatory, but one that is quite important here is remove_unused_columns=False. When this option is True (the default), the Trainer drops any features not used by the model's call function, which is usually ideal because it makes it easier to unpack inputs into the model's call function. In our case, however, we need the otherwise unused features ('img' in particular) in order to create 'pixel_values', so we set it to False.

In [34]:
# Hyper-parameters for the fine-tuning run.
model_name = model_id.split("/")[-1]  # last path component of the checkpoint id, used to name the output folder
batch_size = 32  # per-device batch size, used for both training and evaluation
learning_rate = 5e-5
gradient_accumulation_steps = 4  # 32 x 4 = 128 examples per optimizer step on a single device
epochs = 3
warmup_ratio= 0.1
logging_steps=10

args = TrainingArguments(
    # Output folder for checkpoints.
    f"{model_name}-finetuned-ds",
    # Keep the 'img' column: the preprocessing functions need it to build 'pixel_values'.
    remove_unused_columns=False,
    # Evaluate and checkpoint once per epoch; the two strategies are kept identical
    # so that the best checkpoint can be reloaded at the end.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    warmup_ratio=warmup_ratio,
    logging_steps=logging_steps,
    # After training, restore the checkpoint with the best validation accuracy
    # ("accuracy" is the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
   # push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Next, we need to define a function that computes the metrics from the predictions. We first load the accuracy metric, which we'll use to evaluate our model both during and after training. The only preprocessing we have to do is to take the argmax of our predicted logits:

In [35]:
import numpy as np

from datasets import load_metric

# Accuracy metric object used by compute_metrics below.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed version.
metric = load_metric("accuracy")

# compute_metrics receives a named tuple with two fields:
#   predictions - the model logits as a NumPy array
#   label_ids   - the ground-truth labels as a NumPy array
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions."""
    logits = eval_pred.predictions
    # The predicted class is the index of the largest logit in each row.
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(
        predictions=predicted_classes,
        references=eval_pred.label_ids,
    )

We also define a collate_fn, which will be used to batch examples together. Each batch consists of 2 keys, namely pixel_values and labels.

In [36]:
import torch

def collate_fn(examples):
    """Assemble a list of dataset examples into one batch dict.

    Returns a dict with ``pixel_values`` (the per-example tensors stacked
    along a new leading dimension) and ``labels`` (a tensor built from each
    example's ``label`` entry).
    """
    images, targets = [], []
    for example in examples:
        images.append(example["pixel_values"])
        targets.append(example["label"])
    return {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets),
    }

Then we just need to pass all of this along with our datasets to the Trainer:

In [37]:
# Wire model, configuration, data and metric together.
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # The feature extractor is passed through the `tokenizer` argument; the
    # training log shows it being saved alongside each checkpoint.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

Now we can finetune our model by calling the train method:

In [38]:
# Run fine-tuning. With the arguments defined above, evaluation and
# checkpointing happen once per epoch.
train_results = trainer.train()


***** Running training *****
  Num examples = 2050
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 48


Epoch,Training Loss,Validation Loss,Accuracy
0,0.6658,0.541117,0.72807
1,0.4833,0.484872,0.833333
2,0.4562,0.340108,0.894737


***** Running Evaluation *****
  Num examples = 228
  Batch size = 32
Saving model checkpoint to vit-base-patch16-224-finetuned-ds/checkpoint-16
Configuration saved in vit-base-patch16-224-finetuned-ds/checkpoint-16/config.json
Model weights saved in vit-base-patch16-224-finetuned-ds/checkpoint-16/pytorch_model.bin
Feature extractor saved in vit-base-patch16-224-finetuned-ds/checkpoint-16/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 228
  Batch size = 32
Saving model checkpoint to vit-base-patch16-224-finetuned-ds/checkpoint-32
Configuration saved in vit-base-patch16-224-finetuned-ds/checkpoint-32/config.json
Model weights saved in vit-base-patch16-224-finetuned-ds/checkpoint-32/pytorch_model.bin
Feature extractor saved in vit-base-patch16-224-finetuned-ds/checkpoint-32/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 228
  Batch size = 32
Saving model checkpoint to vit-base-patch16-224-finetuned-ds/checkpoint-48
Configuration saved i

In [39]:

# Evaluate on the validation split (the Trainer's eval_dataset).
metrics = trainer.evaluate()
# some nice to haves:
# print the metrics in a readable table and persist them
# (presumably into the Trainer's output folder - confirm).
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 228
  Batch size = 32


***** eval metrics *****
  epoch                   =       2.98
  eval_accuracy           =     0.8947
  eval_loss               =     0.3401
  eval_runtime            = 0:00:38.85
  eval_samples_per_second =      5.868
  eval_steps_per_second   =      0.206


In [40]:

# The held-out test split uses the same deterministic preprocessing as validation.
test_ds.set_transform(preprocess_val)

In [41]:
# Run inference on the test split; `predictions` holds one row of class scores
# per example.
outputs = trainer.predict(test_ds)
# Predicted class = index of the largest score in each row.
y_pred = outputs.predictions.argmax(1)

***** Running Prediction *****
  Num examples = 402
  Batch size = 32


In [42]:
# Test-set accuracy, computed with the same metric function used during training.
compute_metrics(outputs)

{'accuracy': 0.9079601990049752}

In [43]:
# Inspect the raw prediction output (per-class scores for every test example).
outputs

PredictionOutput(predictions=array([[-4.46477473e-01,  4.98525172e-01],
       [ 9.78303850e-01, -4.14050281e-01],
       [ 1.92093587e+00, -1.66820168e+00],
       [ 1.44558942e+00, -1.31376505e+00],
       [ 2.91273981e-01, -3.13133150e-01],
       [ 1.39345181e+00, -1.11967039e+00],
       [ 1.51001453e-01, -1.90186992e-01],
       [ 7.21857965e-01, -5.31801105e-01],
       [ 2.96028495e-01, -2.86198884e-01],
       [ 4.30927724e-01, -3.54968488e-01],
       [ 4.89938259e-01, -4.73545313e-01],
       [-7.49841332e-03,  1.40016779e-01],
       [ 6.24491334e-01, -6.02689862e-01],
       [-1.80947185e-02,  6.90227747e-02],
       [ 6.24392450e-01, -5.79391778e-01],
       [ 2.54645407e-01, -1.89287797e-01],
       [ 4.79092181e-01, -3.78938556e-01],
       [-2.54943967e-02,  5.35873771e-02],
       [ 1.16291642e+00, -1.02925432e+00],
       [-3.84330511e-01,  3.99698168e-01],
       [ 1.18963957e-01, -8.70915651e-02],
       [ 1.83616877e-02,  7.56940544e-02],
       [-5.88526666e-01, 

In [44]:
# Inspect the predicted class ids for the test split.
y_pred

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
# Ground-truth labels for the test split.
# They are read from the prediction output rather than from `test_ds[:]`:
# slicing the transformed dataset would decode and preprocess every test image
# again just to obtain the integer labels. `label_ids` was collected in the
# same prediction pass as `outputs.predictions`, so the two are aligned.
y_true = np.asarray(outputs.label_ids)
y_true

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [46]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the test split. By scikit-learn's convention rows are
# true classes and columns are predicted classes, both in label-id order
# (0 = proton, 1 = gamma).
confusion_matrix(y_true, y_pred)

array([[281,  12],
       [ 25,  84]])