# BEiT Vision Transformer model on the OCTMNIST dataset from the MedMNIST parent repository

Model idea and code provided by:

@misc{https://doi.org/10.48550/arxiv.2106.08254,
  doi = {10.48550/ARXIV.2106.08254},
  
  url = {https://arxiv.org/abs/2106.08254},
  
  author = {Bao, Hangbo and Dong, Li and Piao, Songhao and Wei, Furu},
  
  keywords = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
  
  title = {BEiT: BERT Pre-Training of Image Transformers},
  
  publisher = {arXiv},
  
  year = {2021},
  
  copyright = {arXiv.org perpetual, non-exclusive license}
}

In [None]:
!pip install datasets transformers
!pip install datasets
!pip install medmnist

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[K     |████████████████████████████████| 452 kB 14.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 69.6 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 74.4 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 71.3 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 74.9 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1

In [None]:
from PIL import Image
import datasets
import numpy as np
import pandas as pd
from medmnist import INFO, Evaluator
from transformers import BeitImageProcessor, BeitModel , BeitForImageClassification
from torchvision import transforms
import medmnist
import torch

In [None]:
# Select the MedMNIST subset and read its metadata entry from INFO.
data_flag = 'octmnist'
# Passed as `download=` to every split constructor further down.
download = True
info = INFO[data_flag]
task = info['task']
n_channels = info['n_channels']
# The number of classes is the number of entries in the metadata's label table.
n_classes = len(info['label'])

In [None]:
# Resolve the dataset class on the medmnist module by the name stored in the metadata.
DataClass=getattr(medmnist, info['python_class'])

In [None]:
# Instantiate the train / val / test splits in that order, all with the same
# download setting.
train_dataset, val_dataset, test_dataset = (
    DataClass(split=split_name, download=download)
    for split_name in ('train', 'val', 'test')
)

Downloading https://zenodo.org/record/6496656/files/octmnist.npz?download=1 to /root/.medmnist/octmnist.npz


  0%|          | 0/54938180 [00:00<?, ?it/s]

Using downloaded and verified file: /root/.medmnist/octmnist.npz
Using downloaded and verified file: /root/.medmnist/octmnist.npz


In [None]:
# Schema shared by all three splits: an image column plus a class-label column
# with `n_classes` possible values.
features = datasets.Features(
    {
        'image': datasets.Image(),
        'labels': datasets.ClassLabel(num_classes=n_classes),
    }
)

# One column-oriented accumulator per split; each gets its own fresh lists.
dataset_dict_train, dataset_dict_val, dataset_dict_test = (
    {'image': [], 'labels': []} for _ in range(3)
)

In [None]:
def _append_split(source, target):
    """Copy every sample of `source` into the column lists of `target`.

    Args:
        source: Indexable dataset whose items are ``(image, label)`` pairs,
            where ``label`` is itself indexable and its first element is the
            class id.
        target: Dict with ``'image'`` and ``'labels'`` lists; extended in place.
    """
    for idx in range(len(source)):
        # Index once per sample: every `source[idx]` goes through the
        # dataset's __getitem__, so fetching image and label together avoids
        # doing that work twice.
        image, label = source[idx]
        target['image'].append(image)
        # Store a plain Python int rather than whatever scalar type the
        # dataset hands back.
        target['labels'].append(int(label[0]))


_append_split(train_dataset, dataset_dict_train)
_append_split(val_dataset, dataset_dict_val)
_append_split(test_dataset, dataset_dict_test)

In [None]:
# Turn each column dict into a datasets.Dataset that follows the shared schema.
ds_train = datasets.Dataset.from_dict(dataset_dict_train, features=features)
ds_val = datasets.Dataset.from_dict(dataset_dict_val, features=features)
ds_test = datasets.Dataset.from_dict(dataset_dict_test, features=features)

In [None]:
# Bundle the three splits under the names used by the rest of the notebook.
ds = datasets.DatasetDict(
    {
        'train': ds_train,
        'validation': ds_val,
        'test': ds_test,
    }
)
ds

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 97477
    })
    validation: Dataset({
        features: ['image', 'labels'],
        num_rows: 10832
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 1000
    })
})

In [None]:
import transformers

# Load the preprocessing configuration that is later handed to the Trainer.
# BeitImageProcessor (already imported from transformers at the top of this
# notebook) is used instead of the deprecated BeitFeatureExtractor name.
feature_extractor = transformers.BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

Downloading:   0%|          | 0.00/276 [00:00<?, ?B/s]

In [None]:
# Built once instead of on every call to `transform`: resize to 224, convert
# to a tensor, repeat the tensor three times along its first dimension
# (assumes single-channel input images), then normalize with mean 0.5 and
# std 0.5.
_PIXEL_TRANSFORM = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.repeat(3, 1, 1)),
    transforms.Normalize(mean=[.5], std=[.5]),
])


def transform(example_batch):
    """Convert a batch of raw examples into model inputs.

    Args:
        example_batch: Mapping with an ``'image'`` sequence and a matching
            ``'labels'`` sequence.

    Returns:
        Dict with ``'pixel_values'`` (all transformed images stacked into one
        tensor) and ``'labels'`` (passed through unchanged).
    """
    pixel_values = [_PIXEL_TRANSFORM(image) for image in example_batch['image']]
    return {
        'pixel_values': torch.stack(pixel_values),
        # Don't forget to include the labels!
        'labels': example_batch['labels'],
    }

In [None]:
# Attach `transform` to every split. The recorded repr below still lists the
# original 'image' / 'labels' columns.
prepared_ds = ds.with_transform(transform)
prepared_ds

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 97477
    })
    validation: Dataset({
        features: ['image', 'labels'],
        num_rows: 10832
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 1000
    })
})

In [None]:
# Fetch the first two training rows through the transform to inspect the
# resulting 'pixel_values' tensors.
prepared_ds['train'][0:2]

{'pixel_values': tensor([[[[-0.8196, -0.8196, -0.8196,  ..., -0.8039, -0.8039, -0.8039],
           [-0.8196, -0.8196, -0.8196,  ..., -0.8039, -0.8039, -0.8039],
           [-0.8196, -0.8196, -0.8196,  ..., -0.8039, -0.8039, -0.8039],
           ...,
           [-0.9216, -0.9216, -0.9216,  ..., -0.8902, -0.8902, -0.8902],
           [-0.9216, -0.9216, -0.9216,  ..., -0.8902, -0.8902, -0.8902],
           [-0.9216, -0.9216, -0.9216,  ..., -0.8902, -0.8902, -0.8902]],
 
          [[-0.8196, -0.8196, -0.8196,  ..., -0.8039, -0.8039, -0.8039],
           [-0.8196, -0.8196, -0.8196,  ..., -0.8039, -0.8039, -0.8039],
           [-0.8196, -0.8196, -0.8196,  ..., -0.8039, -0.8039, -0.8039],
           ...,
           [-0.9216, -0.9216, -0.9216,  ..., -0.8902, -0.8902, -0.8902],
           [-0.9216, -0.9216, -0.9216,  ..., -0.8902, -0.8902, -0.8902],
           [-0.9216, -0.9216, -0.9216,  ..., -0.8902, -0.8902, -0.8902]],
 
          [[-0.8196, -0.8196, -0.8196,  ..., -0.8039, -0.8039, -0.8039

In [None]:
def collate_fn(batch):
    """Merge a list of per-example dicts into one batch dict.

    Each element of `batch` must provide a 'pixel_values' tensor and a
    'labels' value. The tensors are stacked along a new leading dimension and
    the labels are gathered into a single tensor.
    """
    pixel_tensors = []
    label_values = []
    for sample in batch:
        pixel_tensors.append(sample['pixel_values'])
        label_values.append(sample['labels'])
    return dict(
        pixel_values=torch.stack(pixel_tensors),
        labels=torch.tensor(label_values),
    )

In [None]:
import numpy as np


def compute_metrics(p):
    """Compute top-1 accuracy for an evaluation result.

    Accuracy is calculated directly with numpy, so no metric script has to be
    downloaded and the deprecated `datasets.load_metric` helper is not needed.

    Args:
        p: Object exposing `predictions` (2-D array of per-class scores, one
            row per example) and `label_ids` (the reference class ids).

    Returns:
        Dict with a single key, ``"accuracy"``, holding the fraction of
        examples whose highest-scoring class equals the reference label.
    """
    predicted = np.argmax(p.predictions, axis=1)
    # Flatten so a column-shaped label array cannot broadcast against the
    # 1-D predictions.
    references = np.asarray(p.label_ids).reshape(-1)
    return {"accuracy": float(np.mean(predicted == references))}

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Class names come from the dataset schema (the ClassLabel feature was created
# from `num_classes` only, with no explicit names).
labels = ds['train'].features['labels'].names
# Load the fine-tuned BEiT checkpoint with a classification head sized for
# this task. The recorded output below shows the 1000-way classifier weights
# being replaced by a freshly initialised 4-way head, which is why
# `ignore_mismatched_sizes=True` is required.
model = BeitForImageClassification.from_pretrained(
    "microsoft/beit-base-patch16-224",
    num_labels=n_classes,
    # Integer ids on both sides of the mapping: int keys in id2label and int
    # values in label2id, rather than stringified indices.
    id2label={i: c for i, c in enumerate(labels)},
    label2id={c: i for i, c in enumerate(labels)},
    ignore_mismatched_sizes=True
    )

Downloading:   0%|          | 0.00/69.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350M [00:00<?, ?B/s]

Some weights of BeitForImageClassification were not initialized from the model checkpoint at microsoft/beit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for fine-tuning. Evaluation and checkpointing both happen
# every 100 steps so that `load_best_model_at_end` has a checkpoint for each
# evaluation point.
training_args = TrainingArguments(
  output_dir="/content/sample_data/BEIT/",
  per_device_train_batch_size=32,
  evaluation_strategy="steps",
  num_train_epochs=2,  # increase for a longer run
  # Mixed precision is only requested when a CUDA device is present, so the
  # notebook also runs on a CPU-only runtime instead of failing here.
  fp16=torch.cuda.is_available(),
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  # Keep the raw 'image' column: the on-the-fly transform needs it.
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
)

In [None]:
from transformers import Trainer

# Wire the model, arguments, batching function, metric function and the
# train / validation splits into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    # The recorded log below shows a preprocessor_config.json being written
    # next to every checkpoint.
    tokenizer=feature_extractor
)

Using cuda_amp half precision backend


In [None]:
# Run fine-tuning, then persist the model, the training metrics and the
# trainer state.
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** Running training *****
  Num examples = 97477
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6094
  Number of trainable parameters = 85765060


Step,Training Loss,Validation Loss,Accuracy
100,1.1853,1.154356,0.482829
200,1.1322,1.249489,0.497415
300,1.0055,0.872944,0.688885
400,0.9581,0.795913,0.728951
500,0.7449,0.822344,0.720273
600,0.8636,0.741272,0.739014
700,0.7659,0.722313,0.743168
800,0.6445,0.673718,0.766987
900,0.7237,0.680363,0.768556
1000,0.6204,0.623584,0.783973


***** Running Evaluation *****
  Num examples = 10832
  Batch size = 8
Saving model checkpoint to /content/sample_data/BEIT/checkpoint-100
Configuration saved in /content/sample_data/BEIT/checkpoint-100/config.json
Model weights saved in /content/sample_data/BEIT/checkpoint-100/pytorch_model.bin
Image processor saved in /content/sample_data/BEIT/checkpoint-100/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 10832
  Batch size = 8
Saving model checkpoint to /content/sample_data/BEIT/checkpoint-200
Configuration saved in /content/sample_data/BEIT/checkpoint-200/config.json
Model weights saved in /content/sample_data/BEIT/checkpoint-200/pytorch_model.bin
Image processor saved in /content/sample_data/BEIT/checkpoint-200/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 10832
  Batch size = 8
Saving model checkpoint to /content/sample_data/BEIT/checkpoint-300
Configuration saved in /content/sample_data/BEIT/checkpoint-300/config.json
Model wei

***** train metrics *****
  epoch                    =           2.0
  total_flos               = 14064076432GF
  train_loss               =        0.5877
  train_runtime            =    2:12:43.90
  train_samples_per_second =         24.48
  train_steps_per_second   =         0.765


In [None]:
# Evaluate on the held-out test split. Note that the results are logged and
# saved under the "eval" name even though they come from the test data.
metrics = trainer.evaluate(prepared_ds['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


***** eval metrics *****
  epoch                   =        2.0
  eval_accuracy           =        0.7
  eval_loss               =     0.7697
  eval_runtime            = 0:00:07.89
  eval_samples_per_second =    126.693
  eval_steps_per_second   =     15.837
