# Google Drive Setup

In [None]:
# Mount Google Drive at /content/gdrive so the project files are reachable from this runtime
from google.colab import drive
drive.mount('/content/gdrive')
# NOTE: The project folder is a shared folder. To be able to access it once mounted, open
# Google Drive, open the options for the shared folder, and click "Add shortcut to Drive".

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Per-teammate locations of the "Project Code" folder inside the mounted Drive
_MY_DRIVE = "/content/gdrive/MyDrive"
path_andrew = f"{_MY_DRIVE}/CS 7643/Project/Project Code"
path_gillian = f"{_MY_DRIVE}/CS 7643/Project Code"
path_blake = f"{_MY_DRIVE}/CS 7643/Project Code"
path_isaac = f"{_MY_DRIVE}/classes/CS 7643/Project Code"
path_bryan = f"{_MY_DRIVE}/Georgia Tech/CS 7643/CS 7643/Project Code"

In [None]:
# Select which teammate's Drive layout to use: change this to your own path_<name> when running.
path = path_isaac
# Make the project folder the working directory ({path} is interpolated by IPython).
# Later cells rely on this: the local modules are imported by bare name and the
# "heads/..." checkpoint path is relative.
%cd {path}

/content/gdrive/.shortcut-targets-by-id/1lmYZ9dBUQsLKZApoHNFRjbtE_VA1kGF4/CS 7643/Project Code


# Imports/Setup

In [None]:
# pip installs
!pip install transformers
!pip install torcheval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torcheval.metrics.functional import binary_auroc

# data
from HMDataset import HMDataset, HMDataset_H5

# model
from transformers import ViltProcessor, ViltModel
from ClassificationHead import ViltHead

# general
from PIL import Image
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Report the torch build and choose the device string used by the rest of the notebook
print(torch.__version__)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# prints True when a GPU was selected
print(device=='cuda')

2.0.0+cu118
True


# Load Data

In [None]:
# Hyperparameters
# number of samples per batch for the train / validation / test DataLoaders
batch_size = 64
# learning rate passed to the Adam optimizer
lr = 1e-4
# NOTE(review): not referenced anywhere in the visible notebook (the Adam optimizer is built without it)
momentum = 0.9
# weight decay passed to the Adam optimizer
weight_decay = 1e-5
# number of passes over the training dataloader
epochs = 10
# dropout argument given to ViltHead
dropout = 0.1

# base name (no extension) of the checkpoint written to / read from "heads/<name>.pt"
output_file_name = "vilt_head"

In [None]:
# Build the PyTorch datasets and their loaders from the HDF5 files in the project folder
# (make sure `path` points at the right directory).

# each sample has keys: "id", "image", "label", and "text"
transform_h5 = transforms.ToTensor() # everything in HDF5 is already pre-resized to 256 x 256

# loader settings shared by all three splits
# NOTE(review): drop_last=True is also applied to validation/test, so a trailing partial
# batch is left out of the evaluation metrics — confirm this is intended.
_loader_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)

# training
train_dataset = HMDataset_H5(h5_file=path + "/training_db.h5", transform=transform_h5)
train_dataloader = DataLoader(train_dataset, **_loader_kwargs)

# validation
validate_dataset = HMDataset_H5(h5_file=path + "/validation_db.h5", transform=transform_h5)
validate_dataloader = DataLoader(validate_dataset, **_loader_kwargs)

# testing
# NOTE(review): unlike the two splits above, the test set is created with transform=False — verify against HMDataset_H5.
test_dataset = HMDataset_H5(h5_file=path + "/test_db.h5", transform=False)
test_dataloader = DataLoader(test_dataset, **_loader_kwargs)


In [None]:
def get_metrics_and_losses(losses, predictions, labels):
  """Summarise one pass over a dataset as (mean loss, accuracy, AUROC).

  Args:
    losses: tensor of loss values; its mean is reported.
    predictions: tensor compared element-wise with `labels` for the accuracy and
      passed as the first argument to `binary_auroc`.
    labels: tensor of targets; its element count is the accuracy denominator.

  Returns:
    (average_loss, accuracy, auroc) — the first two are Python numbers, the
    third is whatever `binary_auroc(predictions, labels)` returns.
  """
  mean_loss = losses.mean().item()
  n_correct = predictions.eq(labels).sum().item()
  return mean_loss, n_correct / labels.numel(), binary_auroc(predictions, labels)

# ViLT Model

In [None]:
# Load the pretrained ViLT processor and backbone

# NOTE(review): the processor is loaded from the "finetuned-coco" checkpoint while the
# backbone weights below come from the "mlm" checkpoint — confirm the mismatch is intended.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")
print(model.config)

# freeze the backbone: turn off gradient tracking for every parameter
model.requires_grad_(False)

# last expression of the cell, so the moved model is also displayed
model.to(device)

Some weights of the model checkpoint at dandelin/vilt-b32-mlm were not used when initializing ViltModel: ['mlm_score.transform.LayerNorm.bias', 'mlm_score.bias', 'mlm_score.decoder.weight', 'mlm_score.transform.LayerNorm.weight', 'mlm_score.transform.dense.bias', 'mlm_score.transform.dense.weight']
- This IS expected if you are initializing ViltModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViltModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ViltConfig {
  "_name_or_path": "dandelin/vilt-b32-mlm",
  "architectures": [
    "ViltForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_image_length": -1,
  "max_position_embeddings": 40,
  "modality_type_vocab_size": 2,
  "model_type": "vilt",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "num_images": -1,
  "patch_size": 32,
  "qkv_bias": true,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}



ViltModel(
  (embeddings): ViltEmbeddings(
    (text_embeddings): TextEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(40, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (patch_embeddings): ViltPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
    )
    (token_type_embeddings): Embedding(2, 768)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViltEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViltLayer(
        (attention): ViltAttention(
          (attention): ViltSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=Fa

In [None]:
# classification head, created with the configured dropout and moved to the active device
head = ViltHead(dropout=dropout).to(device)

# only the head's parameters are handed to the optimizer
optimizer = optim.Adam(params=head.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()

## Train Model

In [None]:
# Train the classification head on top of the ViLT backbone. After every epoch the head is
# evaluated on the validation split, and its state_dict is saved whenever the validation
# AUROC improves on the best value seen so far.
max_validation_auroc = 0
for e in range(epochs):

  # training
  train_losses = torch.zeros(len(train_dataloader)).to(device)
  train_predictions = torch.Tensor().to(device)
  train_labels = torch.Tensor().to(device)

  # the backbone was frozen when it was loaded (requires_grad=False on every parameter),
  # so it stays in eval mode; only the head is in training mode
  model.eval()
  head.train()
  for i, data in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Epoch " + str(e+1) + " Training"):

    # get inputs
    text = data['text']
    image = list(data['image'].detach().numpy())
    labels = data['label'].to(device).to(torch.int64)

    # zero the parameter gradients
    optimizer.zero_grad()

    # VILT base model
    inputs = processor(image, text, return_tensors="pt", padding=True, truncation=True).to(device)
    bt_outputs = model(**inputs)
    pooled_outputs = bt_outputs["pooler_output"]

    # classification head
    scores = torch.squeeze(head(pooled_outputs))
    predictions = scores.argmax(dim=-1)

    # backprop
    loss = criterion(scores, labels)
    loss.backward()
    optimizer.step()

    # track values for metric logging (loss is detached so the log tensor is not
    # tied to the autograd graph)
    train_losses[i] = loss.detach()
    train_predictions = torch.cat((train_predictions, predictions))
    train_labels = torch.cat((train_labels, labels))

  # epoch-level training metrics, computed once after the last batch
  train_average_loss, train_accuracy, train_auroc = get_metrics_and_losses(train_losses, train_predictions, train_labels)

  # validation
  validate_losses = torch.zeros(len(validate_dataloader)).to(device)
  validate_predictions = torch.Tensor().to(device)
  validate_labels = torch.Tensor().to(device)
  model.eval()
  head.eval()
  with torch.no_grad():
    for i, data in tqdm(enumerate(validate_dataloader), total=len(validate_dataloader), desc="Epoch " + str(e+1) + " Validation"):

      # get inputs
      text = data['text']
      image = list(data['image'].detach().numpy())
      labels = data['label'].to(device).to(torch.int64)

      # VILT base model
      inputs = processor(image, text, return_tensors="pt", padding=True, truncation=True).to(device)
      bt_outputs = model(**inputs)
      pooled_outputs = bt_outputs["pooler_output"]
      # classification head
      scores = torch.squeeze(head(pooled_outputs))
      predictions = scores.argmax(dim=-1)

      # loss
      loss = criterion(scores, labels)

      # track values for metric logging
      validate_losses[i] = loss
      validate_predictions = torch.cat((validate_predictions, predictions), dim=0)
      validate_labels = torch.cat((validate_labels, labels), dim=0)

  validate_average_loss, validate_accuracy, validate_auroc = get_metrics_and_losses(validate_losses, validate_predictions, validate_labels)
  # keep only the head with the best validation AUROC so far
  if validate_auroc > max_validation_auroc:
    max_validation_auroc = validate_auroc
    torch.save(head.state_dict(), "heads/"+output_file_name+".pt")
    print("\nNew head saved at epoch " + str(e+1))

  print("Epoch %d" % (e+1))
  print("Training Loss: %.4f. Validation Loss: %.4f. " % (train_average_loss, validate_average_loss))
  print("Training Accuracy: %.4f. Validation Accuracy: %.4f. " % (train_accuracy, validate_accuracy))
  print("Training AUROC: %.4f. Validation AUROC: %.4f. " % (train_auroc, validate_auroc))
  print("-----------------------------------")

Epoch 1 Training: 100%|██████████| 132/132 [04:32<00:00,  2.07s/it]
Epoch 1 Training: 100%|██████████| 7/7 [00:13<00:00,  1.93s/it]



New head saved at epoch 1
Epoch 1
Training Loss: 0.6300. Validation Loss: 0.7973. 
Training Accuracy: 0.6481. Validation Accuracy: 0.5089. 
Training AUROC: 0.5107. Validation AUROC: 0.5023. 
-----------------------------------


Epoch 2 Training: 100%|██████████| 132/132 [04:12<00:00,  1.92s/it]
Epoch 2 Training: 100%|██████████| 7/7 [00:13<00:00,  1.96s/it]



New head saved at epoch 2
Epoch 2
Training Loss: 0.6012. Validation Loss: 0.6969. 
Training Accuracy: 0.6732. Validation Accuracy: 0.5379. 
Training AUROC: 0.5715. Validation AUROC: 0.5324. 
-----------------------------------


Epoch 3 Training: 100%|██████████| 132/132 [04:13<00:00,  1.92s/it]
Epoch 3 Training: 100%|██████████| 7/7 [00:13<00:00,  1.97s/it]



New head saved at epoch 3
Epoch 3
Training Loss: 0.5879. Validation Loss: 0.6936. 
Training Accuracy: 0.6913. Validation Accuracy: 0.5446. 
Training AUROC: 0.6137. Validation AUROC: 0.5426. 
-----------------------------------


Epoch 4 Training: 100%|██████████| 132/132 [04:11<00:00,  1.90s/it]
Epoch 4 Training: 100%|██████████| 7/7 [00:13<00:00,  1.92s/it]



New head saved at epoch 4
Epoch 4
Training Loss: 0.5759. Validation Loss: 0.6697. 
Training Accuracy: 0.7016. Validation Accuracy: 0.5737. 
Training AUROC: 0.6327. Validation AUROC: 0.5709. 
-----------------------------------


Epoch 5 Training: 100%|██████████| 132/132 [04:11<00:00,  1.90s/it]
Epoch 5 Training: 100%|██████████| 7/7 [00:13<00:00,  1.96s/it]


Epoch 5
Training Loss: 0.5714. Validation Loss: 0.7477. 
Training Accuracy: 0.7049. Validation Accuracy: 0.5402. 
Training AUROC: 0.6407. Validation AUROC: 0.5342. 
-----------------------------------


Epoch 6 Training: 100%|██████████| 132/132 [04:06<00:00,  1.87s/it]
Epoch 6 Training: 100%|██████████| 7/7 [00:13<00:00,  1.91s/it]


Epoch 6
Training Loss: 0.5607. Validation Loss: 0.7038. 
Training Accuracy: 0.7135. Validation Accuracy: 0.5692. 
Training AUROC: 0.6535. Validation AUROC: 0.5660. 
-----------------------------------


Epoch 7 Training: 100%|██████████| 132/132 [04:02<00:00,  1.84s/it]
Epoch 7 Training: 100%|██████████| 7/7 [00:12<00:00,  1.85s/it]


Epoch 7
Training Loss: 0.5509. Validation Loss: 0.7198. 
Training Accuracy: 0.7191. Validation Accuracy: 0.5692. 
Training AUROC: 0.6628. Validation AUROC: 0.5668. 
-----------------------------------


Epoch 8 Training: 100%|██████████| 132/132 [04:02<00:00,  1.84s/it]
Epoch 8 Training: 100%|██████████| 7/7 [00:12<00:00,  1.85s/it]


Epoch 8
Training Loss: 0.5455. Validation Loss: 0.7241. 
Training Accuracy: 0.7249. Validation Accuracy: 0.5536. 
Training AUROC: 0.6697. Validation AUROC: 0.5536. 
-----------------------------------


Epoch 9 Training: 100%|██████████| 132/132 [04:04<00:00,  1.86s/it]
Epoch 9 Training: 100%|██████████| 7/7 [00:13<00:00,  1.90s/it]


Epoch 9
Training Loss: 0.5375. Validation Loss: 0.8332. 
Training Accuracy: 0.7314. Validation Accuracy: 0.5692. 
Training AUROC: 0.6793. Validation AUROC: 0.5581. 
-----------------------------------


Epoch 10 Training: 100%|██████████| 132/132 [04:07<00:00,  1.87s/it]
Epoch 10 Training: 100%|██████████| 7/7 [00:13<00:00,  1.92s/it]


Epoch 10
Training Loss: 0.5340. Validation Loss: 0.6895. 
Training Accuracy: 0.7320. Validation Accuracy: 0.5781. 
Training AUROC: 0.6796. Validation AUROC: 0.5641. 
-----------------------------------


In [None]:
def run_model(model, processor, head, criterion, text, image, labels):
    """Run one batch through the backbone and the classification head.

    Uses the module-level ``device`` to place the processed inputs.

    Args:
        model: backbone, called as ``model(**inputs)``; its output is indexed
            with ``"pooler_output"``.
        processor: callable that turns ``text`` / ``images`` into the keyword
            inputs of ``model``; its result must support ``.to(device)``.
        head: classification head applied to the pooled output.
        criterion: loss, called as ``criterion(scores, labels)``.
        text: batch of texts handed to the processor.
        image: batch of images handed to the processor.
        labels: target tensor for the loss.

    Returns:
        (loss, predictions): the criterion's output and the argmax over the
        last dimension of the head's scores.
    """
    # model
    inputs = processor(text=text, images=image, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model(**inputs)

    # head: read the pooled output of THIS forward pass, not a global left over
    # from the training cell
    pooled_outputs = outputs["pooler_output"]
    # classification head
    scores = torch.squeeze(head(pooled_outputs))
    predictions = scores.argmax(dim=-1)
    # loss
    loss = criterion(scores, labels)
    return loss, predictions

# test
def test(model, processor, head, criterion, test_dataloader):
    model.eval()
    best_head.eval()
    test_losses = torch.zeros(len(test_dataloader)).to(device)
    test_predictions = torch.Tensor().to(device)
    test_labels = torch.Tensor().to(device)
    with torch.no_grad():
        for i,data in tqdm(enumerate(test_dataloader), total=len(test_dataloader), desc="Test", position=0, leave=True):
            # get inputs
            text = data['text']
            image = data['image'].to(device)
            labels = data['label'].to(device).to(torch.int64)
            
            # run model
            loss, predictions = run_model(model, processor, head, criterion, text, image, labels)

            # track values for metric logging
            test_losses[i] = loss
            test_predictions = torch.cat((test_predictions, predictions), dim=0)
            test_labels = torch.cat((test_labels, labels), dim=0)
        
    # log loss and metrics
    test_average_loss, test_accuracy, test_auroc = get_metrics_and_losses(test_losses, test_predictions, test_labels)
    print("\n     Testing Loss: %.4f. " % (test_average_loss))
    print("     Testing Accuracy: %.4f. " % (test_accuracy))
    print("     Testing AUROC: %.4f. " % (test_auroc))

    return test_average_loss, test_accuracy, test_auroc

# load best head
# map_location lets the checkpoint load on the current device even if it was saved from a
# different one (e.g. saved on GPU, evaluated on the CPU fallback)
best_head = ViltHead(dropout=dropout)
best_head.load_state_dict(torch.load('heads/'+output_file_name+".pt", map_location=device))
best_head.to(device)

# test
test(model, processor, best_head, criterion, test_dataloader)

Test:  12%|█▎        | 1/8 [00:03<00:24,  3.52s/it]