# Initial Investigation of ViT
I'd like to take a look at ViT with cross-entropy loss before combining it with the AUC loss. We will still use the libAUC data set construction, and much of the setup code can be found here: https://github.com/Optimization-AI/LibAUC/blob/main/examples/05_Optimizing_AUROC_Loss_with_DenseNet121_on_CheXpert.ipynb 

In [1]:
# imports
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

import torch
print(torch.cuda.is_available())

from transformers import ViTFeatureExtractor, ViTForImageClassification
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

True


In [2]:
from dataloader import *
from utils import *
from trainer import *

In [3]:
# CONSTANTS
VIT_MODEL_NAME = 'google/vit-large-patch16-224-in21k'
TRAIN_SPLIT = 0.8
BATCH_SIZE = 24
LEARNING_RATE = 0.01
FP16 = True
EPOCHS = 1
EVALUATION_STRATEGY = "steps"
EVAL_STEPS = 100
OUTPUT_DIR = './vit-large-chexpert'
REMOVE_UNUSED_COLUMNS = False
GRAD_ACCUM_STEPS = 10

In [4]:
# set up the ViT model
feature_extractor = ViTFeatureExtractor.from_pretrained(VIT_MODEL_NAME, image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225])

In [7]:
# set up the dataset
np.random.seed(42)
train_df = pd.read_csv("ChexPert/train.csv")
train_df, eval_df = train_test_split(train_df, train_size=TRAIN_SPLIT)

train_dataset = ChexpertViTDataset("ChexPert/data", train_df, feature_extractor, uncertainty_method="smooth", smoothing_lower_bound=0.55, smoothing_upper_bound=0.85)
eval_dataset = ChexpertViTDataset("ChexPert/data", eval_df, feature_extractor)

In [8]:
model = ViTForImageClassification.from_pretrained(
    VIT_MODEL_NAME,
    num_labels=len(train_dataset.labels)
).to("cuda")

Some weights of the model checkpoint at google/vit-large-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# set up training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy=EVALUATION_STRATEGY,
    num_train_epochs=EPOCHS,
    fp16=FP16,
    eval_steps = EVAL_STEPS,
    learning_rate=LEARNING_RATE,
    remove_unused_columns=REMOVE_UNUSED_COLUMNS,
    report_to="tensorboard",
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
)

In [10]:
trainer = CETrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

Using amp half precision backend


In [11]:
trainer.train()

***** Running training *****
  Num examples = 152878
  Num Epochs = 1
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 240
  Gradient Accumulation steps = 10
  Total optimization steps = 637


Step,Training Loss,Validation Loss,Accuracy
100,No log,1.980073,0.414061
200,No log,1.979822,0.414061
300,No log,1.978752,0.414061
400,No log,1.972519,0.414061
500,2.056300,1.970492,0.414061
600,2.056300,1.97419,0.414061


***** Running Evaluation *****
  Num examples = 38149
  Batch size = 24
***** Running Evaluation *****
  Num examples = 38149
  Batch size = 24
***** Running Evaluation *****
  Num examples = 38149
  Batch size = 24
***** Running Evaluation *****
  Num examples = 38149
  Batch size = 24
***** Running Evaluation *****
  Num examples = 38149
  Batch size = 24
Saving model checkpoint to ./vit-large-chexpert/checkpoint-500
Configuration saved in ./vit-large-chexpert/checkpoint-500/config.json
Model weights saved in ./vit-large-chexpert/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 38149
  Batch size = 24


KeyboardInterrupt: 

In [13]:
preds = trainer.predict(torch.utils.data.Subset(eval_dataset, list(range(16*10))))

***** Running Prediction *****
  Num examples = 160
  Batch size = 24


In [14]:
compute_metrics(preds)

{'accuracy': 0.41875}

In [18]:
preds.predictions[:5]

array([[ 0.3684, -0.313 ,  0.471 ,  1.905 , -1.6045,  0.4326, -0.7607,
        -2.455 , -0.3289, -0.628 , -0.5044, -3.395 , -1.937 , -0.904 ],
       [ 0.3682, -0.3127,  0.4712,  1.904 , -1.6045,  0.4329, -0.7607,
        -2.455 , -0.3286, -0.6274, -0.505 , -3.395 , -1.937 , -0.9033],
       [ 0.3684, -0.313 ,  0.471 ,  1.905 , -1.6045,  0.4329, -0.7607,
        -2.455 , -0.3289, -0.628 , -0.505 , -3.395 , -1.937 , -0.904 ],
       [ 0.3684, -0.313 ,  0.4712,  1.905 , -1.6045,  0.4329, -0.7607,
        -2.455 , -0.3289, -0.6274, -0.505 , -3.395 , -1.937 , -0.904 ],
       [ 0.3684, -0.3127,  0.4712,  1.905 , -1.6045,  0.4329, -0.7607,
        -2.455 , -0.3289, -0.6274, -0.505 , -3.395 , -1.937 , -0.9033]],
      dtype=float16)

In [19]:
eval_dataset[0]
eval_dataset[1]

{'pixel_values': tensor([[[-1.7754, -1.7754, -1.7754,  ..., -1.7412, -1.7069, -1.7240],
         [-1.8097, -1.8268, -1.8097,  ..., -1.7583, -1.7412, -1.7240],
         [-1.8268, -1.8097, -1.7754,  ..., -1.7754, -1.7583, -1.7412],
         ...,
         [ 0.1768,  0.1939,  0.2967,  ..., -1.9809, -1.9809, -1.9638],
         [ 0.2796,  0.2967,  0.3309,  ..., -1.9809, -1.9809, -1.9638],
         [ 0.3481,  0.3138,  0.4337,  ..., -1.9809, -1.9809, -1.9809]],

        [[-1.6856, -1.6856, -1.6856,  ..., -1.6506, -1.6155, -1.6331],
         [-1.7206, -1.7381, -1.7206,  ..., -1.6681, -1.6506, -1.6331],
         [-1.7381, -1.7206, -1.6856,  ..., -1.6856, -1.6681, -1.6506],
         ...,
         [ 0.3102,  0.3277,  0.4328,  ..., -1.8957, -1.8957, -1.8782],
         [ 0.4153,  0.4328,  0.4678,  ..., -1.8957, -1.8957, -1.8782],
         [ 0.4853,  0.4503,  0.5728,  ..., -1.8957, -1.8957, -1.8957]],

        [[-1.4559, -1.4559, -1.4559,  ..., -1.4210, -1.3861, -1.4036],
         [-1.4907, -1.5081, 

In [30]:
model

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=102

In [21]:
trainer.save_state()

In [22]:
trainer.save_model()

Saving model checkpoint to ./vit-large-chexpert
Configuration saved in ./vit-large-chexpert/config.json
Model weights saved in ./vit-large-chexpert/pytorch_model.bin
