# ViT 2: Electric Transformaroo
We learned a few lessons from the previous experiment. First and foremost, a larger model isn't always better: we used the large ImageNet transformer model, but it didn't actually lead to good predictions. I also suspect I made a number of mistakes along the way, and the large model takes far too long to train for me to iterate on it. So we are switching to a smaller model and following the ViTMAE method: pretrain with masked autoencoding in a self-supervised manner on the entire training set, then fine-tune on the prediction task. Guide found here: https://github.com/huggingface/transformers/blob/main/examples/pytorch/image-pretraining/run_mae.py

This notebook covers the fine-tuning stage, in which the pretrained model is fine-tuned with a binary cross-entropy (BCE) loss.

In [1]:
# imports
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

import torch
print(torch.cuda.is_available())

from transformers import ViTFeatureExtractor, ViTForImageClassification, ViTMAEForPreTraining, ViTMAEConfig
from transformers import TrainingArguments, Trainer
from torchvision.transforms import RandomHorizontalFlip, RandomResizedCrop
from torchvision.transforms.functional import InterpolationMode

from sklearn.model_selection import train_test_split

True


In [2]:
from dataloader import *
from utils import *
from trainer import *

In [12]:
# CONSTANTS
# Every tunable value for the fine-tuning run lives in this cell, grouped by concern.

# --- Checkpoints and output location ---
FEATURE_EXTRACTOR_NAME = 'facebook/vit-mae-base'      # source of the image preprocessing config
VIT_MODEL_NAME = 'vit-mae-chexpert-fandl-pretrain/'   # checkpoint path the classifier is initialised from
OUTPUT_DIR = './vit-mae-chexpert-fine-tuned-fandl'    # where checkpoints and the final model are written

# --- Data handling ---
TRAIN_SPLIT = 0.8                # fraction of train.csv used for training; the remainder is held out
DATALOADER_NUM_WORKERS = 4
REMOVE_UNUSED_COLUMNS = False

# --- Batching and precision ---
BATCH_SIZE = 48                  # per-device batch size (used for both train and eval)
GRAD_ACCUM_STEPS = 5             # 48 * 5 = 240 samples per optimizer step per device
FP16 = True

# --- Optimization schedule ---
EPOCHS = 13
LEARNING_RATE = 1.5e-3
LR_SCHEDULER_TYPE = "cosine"
WARMUP_RATIO = 0.05
WEIGHT_DECAY = 0.05

# --- Logging, evaluation and checkpoint retention ---
LOGGING_STRATEGY = "steps"
LOGGING_STEPS = 10
EVALUATION_STRATEGY = "epoch"
# NOTE(review): per the TrainingArguments docs, eval_steps is only consulted when the
# evaluation strategy is "steps"; with "epoch" above this value looks unused - confirm.
EVAL_STEPS = 200
SAVE_TOTAL_LIMIT = 3

In [4]:
# Load the preprocessing config published with FEATURE_EXTRACTOR_NAME, overriding the
# normalization statistics with the standard ImageNet per-channel mean / std.
normalization_mean = [0.485, 0.456, 0.406]
normalization_std = [0.229, 0.224, 0.225]
feature_extractor = ViTFeatureExtractor.from_pretrained(
    FEATURE_EXTRACTOR_NAME,
    image_mean=normalization_mean,
    image_std=normalization_std,
)

In [5]:
# Augmentations for the training dataset: a random crop covering 20%-100% of the image,
# resized (bicubic) to the feature extractor's configured size, then a random horizontal flip.
target_size = feature_extractor.size
random_crop = RandomResizedCrop(target_size, scale=(0.2, 1.0), interpolation=InterpolationMode.BICUBIC)
horizontal_flip = RandomHorizontalFlip()
transforms = [random_crop, horizontal_flip]

In [6]:
# Build the train / held-out eval datasets from ChexPert/train.csv.
#
# The split seed is passed to train_test_split explicitly, so the partition no longer
# depends on how much of the global NumPy RNG stream was consumed before this line.
# The global seed is still set for any other code that draws from np.random.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Keep the unsplit frame under its own name so re-reading the cell shows clear lineage.
full_train_df = pd.read_csv("ChexPert/train.csv")
train_df, eval_df = train_test_split(full_train_df, train_size=TRAIN_SPLIT, random_state=SPLIT_SEED)
assert len(train_df) + len(eval_df) == len(full_train_df), "split lost or duplicated rows"

# NOTE(review): this is a row-level random split. If train.csv holds several images per
# patient, the same patient can land in both sets - consider a grouped split; confirm against the CSV.

# Options shared by both datasets. Their exact semantics are defined by the project's
# ChexpertViTDataset (not visible here).
shared_dataset_kwargs = dict(
    use_frontal=False,
    classes=COMPETITION_TASKS,
    uncertainty_method="smooth",
    smoothing_lower_bound=0.55,
    smoothing_upper_bound=0.85,
)

# Only the training dataset receives the augmentation transforms.
train_dataset = ChexpertViTDataset("ChexPert/data", train_df, feature_extractor, transforms=transforms, **shared_dataset_kwargs)
eval_dataset = ChexpertViTDataset("ChexPert/data", eval_df, feature_extractor, **shared_dataset_kwargs)

In [7]:
# Initialise a ViT classifier from the checkpoint at VIT_MODEL_NAME, with one output
# per entry in train_dataset.labels. The load log below lists the checkpoint's
# decoder.* weights as unused by ViTForImageClassification.
num_classes = len(train_dataset.labels)
model = ViTForImageClassification.from_pretrained(VIT_MODEL_NAME, num_labels=num_classes)
model = model.to("cuda")

You are using a model of type vit_mae to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vit-mae-chexpert-fandl-pretrain/ were not used when initializing ViTForImageClassification: ['decoder.decoder_layers.0.attention.output.dense.weight', 'decoder.decoder_layers.3.output.dense.bias', 'decoder.decoder_layers.4.layernorm_after.bias', 'decoder.decoder_pos_embed', 'decoder.decoder_layers.3.layernorm_before.weight', 'decoder.decoder_layers.2.attention.output.dense.bias', 'decoder.decoder_layers.0.attention.attention.value.weight', 'decoder.decoder_layers.1.output.dense.bias', 'decoder.decoder_layers.3.attention.attention.value.weight', 'decoder.decoder_layers.5.layernorm_after.bias', 'decoder.decoder_layers.4.intermediate.dense.weight', 'decoder.decoder_layers.5.attention.attention.query.bias', 'decoder.decoder_layers.2.attention.attention.query.bias', 'decoder.decoder_layers.6.attention.a

In [13]:
# Training configuration, assembled entirely from the constants cell.
training_args = TrainingArguments(
    # where checkpoints go and how many are retained
    output_dir=OUTPUT_DIR,
    save_total_limit=SAVE_TOTAL_LIMIT,
    # batching, precision and data loading
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    fp16=FP16,
    dataloader_num_workers=DATALOADER_NUM_WORKERS,
    remove_unused_columns=REMOVE_UNUSED_COLUMNS,
    # optimization schedule
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    # evaluation cadence
    # NOTE(review): eval_steps is documented as applying to the "steps" strategy only;
    # with the configured strategy it is presumably ignored - confirm.
    evaluation_strategy=EVALUATION_STRATEGY,
    eval_steps=EVAL_STEPS,
    # logging
    logging_strategy=LOGGING_STRATEGY,
    logging_steps=LOGGING_STEPS,
    report_to="tensorboard",
)

PyTorch: setting up devices


In [14]:
# Wire the model, both datasets, the batch collator and the metric function into the
# project's MultiLabelTrainer.
trainer = MultiLabelTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [15]:
# Run the full fine-tuning schedule; the returned training summary is echoed as the cell result.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 178731
  Num Epochs = 13
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 240
  Gradient Accumulation steps = 5
  Total optimization steps = 9672


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc Atelectasis,Auc Cardiomegaly,Auc Consolidation,Auc Edema,Auc Pleural effusion,Average Auc
0,0.4802,0.473182,0.310856,0.335015,0.651195,0.736482,0.643851,0.785461,0.800699,0.723538
1,0.4761,0.459946,0.325515,0.39985,0.661748,0.775297,0.648997,0.792817,0.807409,0.737254
2,0.4632,0.456023,0.32735,0.393696,0.664241,0.781878,0.654788,0.800859,0.813588,0.743071
3,0.4662,0.448977,0.339189,0.442814,0.674543,0.795346,0.659463,0.807377,0.826896,0.752725
4,0.4592,0.450398,0.330775,0.401034,0.670294,0.79412,0.658249,0.810207,0.822499,0.751074
5,0.4514,0.444509,0.341405,0.457183,0.68236,0.799366,0.668505,0.816394,0.833415,0.760008
6,0.4617,0.444326,0.34333,0.450961,0.681936,0.807355,0.667294,0.81746,0.839051,0.762619
7,0.4493,0.433599,0.353222,0.485566,0.687598,0.817566,0.67575,0.824565,0.848907,0.770877
8,0.4308,0.431541,0.355101,0.483373,0.691355,0.82133,0.676633,0.82707,0.852204,0.773718
9,0.4411,0.429071,0.353356,0.478546,0.692798,0.826629,0.680027,0.832358,0.852328,0.776828


Saving model checkpoint to ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-500
Configuration saved in ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-500/config.json
Model weights saved in ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 44683
  Batch size = 48
Saving model checkpoint to ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-1000
Configuration saved in ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-1000/config.json
Model weights saved in ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 44683
  Batch size = 48
Saving model checkpoint to ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-1500
Configuration saved in ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-1500/config.json
Model weights saved in ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./vit-mae-chexpert-fine-tuned-fandl/checkpoint-2000
Con

TrainOutput(global_step=9672, training_loss=0.45264704008428785, metrics={'train_runtime': 34625.23, 'train_samples_per_second': 67.104, 'train_steps_per_second': 0.279, 'total_flos': 1.8004446012972938e+20, 'train_loss': 0.45264704008428785, 'epoch': 13.0})

In [16]:
# Score the trained model on the eval_dataset the trainer was constructed with
# (the held-out part of train.csv) and show the metrics dict.
holdout_metrics = trainer.evaluate()
holdout_metrics

***** Running Evaluation *****
  Num examples = 44683
  Batch size = 48


{'eval_loss': 0.42379119992256165,
 'eval_accuracy': 0.3626435109549493,
 'eval_f1': 0.5002567833604381,
 'eval_AUC_Atelectasis': 0.6984251795789835,
 'eval_AUC_Cardiomegaly': 0.8324510381556287,
 'eval_AUC_Consolidation': 0.6858232633242294,
 'eval_AUC_Edema': 0.8365593918018425,
 'eval_AUC_Pleural Effusion': 0.8597687737102451,
 'eval_average_auc': 0.7826055293141858,
 'eval_runtime': 217.7972,
 'eval_samples_per_second': 205.159,
 'eval_steps_per_second': 4.275,
 'epoch': 13.0}

In [None]:
# Persist the trainer's state alongside the model output.
# NOTE(review): this cell carries no execution count, so it may never have been run - confirm.
trainer.save_state()

In [17]:
# Write the final fine-tuned model to the trainer's output directory; the recorded log
# shows config.json and pytorch_model.bin saved under ./vit-mae-chexpert-fine-tuned-fandl.
trainer.save_model()

Saving model checkpoint to ./vit-mae-chexpert-fine-tuned-fandl
Configuration saved in ./vit-mae-chexpert-fine-tuned-fandl/config.json
Model weights saved in ./vit-mae-chexpert-fine-tuned-fandl/pytorch_model.bin


In [7]:
# Rebuild a trainer around the fine-tuned weights saved in OUTPUT_DIR. No datasets are
# attached, so every evaluate() call on it must pass eval_dataset explicitly.
finetuned_model = ViTForImageClassification.from_pretrained(OUTPUT_DIR)
finetuned_model = finetuned_model.to("cuda")
trainer = MultiLabelTrainer(
    args=training_args,
    model=finetuned_model,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [18]:
# Build the dataset for ChexPert/valid.csv with the same dataset options as the
# train / eval datasets, and no augmentation transforms.
validation_df = pd.read_csv("ChexPert/valid.csv")
valid_dataset = ChexpertViTDataset(
    "ChexPert/data",
    validation_df,
    feature_extractor,
    use_frontal=False,
    classes=COMPETITION_TASKS,
    uncertainty_method="smooth",
    smoothing_lower_bound=0.55,
    smoothing_upper_bound=0.85,
)

In [19]:
# Evaluate on the valid.csv dataset and show the resulting metrics dict.
valid_metrics = trainer.evaluate(eval_dataset=valid_dataset)
valid_metrics

***** Running Evaluation *****
  Num examples = 234
  Batch size = 48


{'eval_loss': 0.4152001142501831,
 'eval_accuracy': 0.44871794871794873,
 'eval_f1': 0.4117647058823529,
 'eval_AUC_Atelectasis': 0.7994318181818182,
 'eval_AUC_Cardiomegaly': 0.8184798015591779,
 'eval_AUC_Consolidation': 0.9042665460575908,
 'eval_AUC_Edema': 0.8994708994708995,
 'eval_AUC_Pleural Effusion': 0.9272499776566271,
 'eval_average_auc': 0.8697798085852227,
 'eval_runtime': 2.1056,
 'eval_samples_per_second': 111.13,
 'eval_steps_per_second': 2.375,
 'epoch': 13.0}

In [18]:
 trainer.evaluate(eval_dataset=valid_dataset)

***** Running Evaluation *****
  Num examples = 202
  Batch size = 48


{'eval_loss': 0.46156764030456543,
 'eval_accuracy': 0.4207920792079208,
 'eval_f1': 0.4159999999999999,
 'eval_AUC_Atelectasis': 0.8206299212598426,
 'eval_AUC_Cardiomegaly': 0.8019719251336899,
 'eval_AUC_Consolidation': 0.8797794117647059,
 'eval_AUC_Edema': 0.8944940476190476,
 'eval_AUC_Pleural Effusion': 0.9164968297101449,
 'eval_average_auc': 0.8626744270974862,
 'eval_runtime': 2.3178,
 'eval_samples_per_second': 87.152,
 'eval_steps_per_second': 2.157,
 'epoch': 3.0}