In [2]:
# dependencies
from datasets import load_dataset
from transformers import AutoModel
import torchvision.transforms as transforms
from torchvision.io import read_image
from transformers import BeitFeatureExtractor, BeitForImageClassification, Trainer
import numpy as np
from datasets import Features, ClassLabel, Array3D, Image
import torch
from torch import nn, tensor
import os
from torchmetrics import Accuracy, MatthewsCorrCoef
import matplotlib.pyplot as plt


os.environ["HF_ENDPOINT"] = "https://huggingface.co"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoTokenizer, AutoModelForPreTraining
model = AutoModelForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

"""
@article{DBLP:journals/corr/abs-2005-04790,
  author    = {Douwe Kiela and
               Hamed Firooz and
               Aravind Mohan and
               Vedanuj Goswami and
               Amanpreet Singh and
               Pratik Ringshia and
               Davide Testuggine},
  title     = {The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes},
  journal   = {CoRR},
  volume    = {abs/2005.04790},
  year      = {2020},
  url       = {https://arxiv.org/abs/2005.04790},
  eprinttype = {arXiv},
  eprint    = {2005.04790},
  timestamp = {Thu, 14 May 2020 16:56:02 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2005-04790.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
train_data = load_dataset("Multimodal-Fatima/Hatefulmemes_train")
test_data = load_dataset("Multimodal-Fatima/Hatefulmemes_test")



Found cached dataset parquet (/home/benjamin/.cache/huggingface/datasets/Multimodal-Fatima___parquet/Multimodal-Fatima--Hatefulmemes_train-cf2bb543f5aaeaee/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 443.89it/s]
Found cached dataset parquet (/home/benjamin/.cache/huggingface/datasets/Multimodal-Fatima___parquet/Multimodal-Fatima--Hatefulmemes_test-c1760e361ffe8410/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 857.73it/s]


In [7]:
# transform the image for input
my_transforms = transforms.Compose([                                                                                                                                                                                                
transforms.Resize((3, 224, 224)),                                                                                                  
transforms.ToTensor()                                                                                                           
])

In [6]:
from transformers import ViltProcessor, ViltModel
from PIL import Image
import requests

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")


Some weights of the model checkpoint at dandelin/vilt-b32-mlm were not used when initializing ViltModel: ['mlm_score.transform.LayerNorm.bias', 'mlm_score.decoder.weight', 'mlm_score.transform.LayerNorm.weight', 'mlm_score.transform.dense.bias', 'mlm_score.bias', 'mlm_score.transform.dense.weight']
- This IS expected if you are initializing ViltModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViltModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
image = train_data['train'][0:3]['image']
image = my_transforms(image)
text = train_data['train'][0:3]['text']


TypeError: Unexpected type <class 'list'>

In [11]:
inputs = processor(image, text, return_tensors="pt")
out = model(**inputs)

In [26]:
"""
Here is the code for adding a classification layer to the VilBert model
"""
class classificationVILT(torch.nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.projection = nn.Linear(768, 2)
        self.classification = nn.Softmax(dim=1)
    
    def forward(self, input_ids, token_type_ids, attention_mask, pixel_values, pixel_mask):
        outputs = self.bert(input_ids, token_type_ids, attention_mask, pixel_values, pixel_mask)
        pooled_output = outputs[1]
        to_feed = self.projection(pooled_output)
        logits = self.classification(to_feed)
        return logits
        

print(outputs[0].shape)
print()

In [None]:

class CustomTrainer(Trainer):
    def __init__(self, epochs, lr, train_data, test_data, model, processor):
        self.epochs = epochs
        self.lr = lr
        self.train_data = train_data
        self.test_data = test_data
        self.model = model
        self.processor = processor
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.BCELoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

    def plot(self, loss):
        timesteps = np.arange(1, loss.shape[0] + 1)
        # Plot the MSE vs timesteps
        plt.plot(timesteps, loss)
        # Add axis labels and a title
        plt.xlabel('Timestep')
        plt.ylabel('Mean Squared Error')
        plt.title('Mean Squared Error over Timesteps')
        # Show the plot
        plt.show()

    def train(self, batch_size):
        adam = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        cosine = torch.optim.lr_scheduler.CosineAnnealingLR(adam, self.epochs)
        loss_fct = nn.BCELoss()
        accuracy = Accuracy(task='multiclass', num_classes=2).to(device)
        mcc = MatthewsCorrCoef(task='binary').to(device)
        counter = 0

        training_loss_over_epochs = []
        for epoch in range(self.epochs):
            training_loss = []
            total_acc = 0
            num_train_steps = train_data['train'].num_rows
            for train_index in range(num_train_steps):
                model.zero_grad()
                image = train_data['train'][train_index]['image']
                text = train_data['train'][train_index]['text']
                inputs = processor(image, text, return_tensors="pt")
                out = model(**inputs)
                if(train_data['train'][train_index]['label'] == 0):
                    truth = torch.tensor([0, train_data['train'][train_index]['label']], dtype=float)
                else: 
                    truth = torch.tensor([1, train_data['train'][train_index]['label']], dtype=float)
                loss = loss_fct(out.float(), truth.view(1, 2).float())
                training_loss.append(loss.item())
                maximums = torch.argmax(out)
                truth_max = torch.argmax(truth)
                # here is the accuracy measurement
                #acc = accuracy(maximums, truth_max)
                total_acc += (maximums == truth_max)
                adam.zero_grad()
                loss.backward()
                adam.step()
            self.plot(training_loss)
            print('\n')
            print('epoch: ', counter)
            counter += 1
            print('training set accuracy: ', total_acc/train_index)
            # the average matthews correlation coefficient?
            #print('matthews correlation coefficient', total_mc/train_index)
                # the total matthews correlation coefficient
            #print('total matthews correlation coefficient', total_mc)
            print('loss total: ', sum(training_loss))
            print('\n')
            training_loss_over_epochs.append(training_loss)
            #exponential.step()
            cosine.step()

In [71]:
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
vilt = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")


Some weights of the model checkpoint at dandelin/vilt-b32-mlm were not used when initializing ViltModel: ['mlm_score.transform.dense.weight', 'mlm_score.bias', 'mlm_score.transform.LayerNorm.weight', 'mlm_score.transform.LayerNorm.bias', 'mlm_score.transform.dense.bias', 'mlm_score.decoder.weight']
- This IS expected if you are initializing ViltModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViltModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
model =  classificationVILT(vilt)
inputs = processor(image, text, return_tensors="pt")
out = model(**inputs)

NameError: name 'classificationVILT' is not defined

In [None]:
train = CustomTrainer(10, 1e-3, train_data, test_data, model, processor)
train.train(1)