In [1]:
!nvidia-smi

Mon Jul 10 14:05:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

! unzip -q '/content/drive/MyDrive/hateful_memes.zip' -d '/content/data'

Mounted at /content/drive


In [3]:
! pip install --quiet ftfy regex tqdm
! pip install --quiet git+https://github.com/openai/CLIP.git
! pip install --quiet transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.image as img

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision
from torchvision import transforms as T
import torchvision.transforms.functional as F

import cv2
import clip
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
from transformers import AutoTokenizer, AutoModel

In [5]:
class ScaleMaxSideToSize(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    sample = cv2.resize(sample, (self.size, self.size), interpolation=cv2.INTER_AREA)

    return sample


class CropCenter(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    img = sample
    h, w, _ = img.shape
    margin_h = (h - self.size) // 2
    margin_w = (w - self.size) // 2
    sample = img[margin_h:margin_h + self.size, margin_w:margin_w + self.size]

    return sample

## 1. Load dataset

In [6]:
class Load_Dataset(torch.utils.data.Dataset):
  def __init__(self, data_path, transforms):
    self.data = [json.loads(l) for l in open(data_path)]
    self.data_dir = os.path.dirname(data_path)
    self.transforms = transforms

  def __getitem__(self, index: int):
    path = os.path.join(self.data_dir, self.data[index]["img"])
    image = cv2.imread(path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    text = self.data[index]["text"]
    label = self.data[index]["label"]

    if self.transforms is not None:
        image = self.transforms(image)

    return image, text, label

  def __len__(self):
    return len(self.data)

In [7]:
CROP_SIZE = 224
MEAN = torch.tensor([0.485, 0.456, 0.406])
STD = torch.tensor([0.229, 0.224, 0.225])

transforms = T.Compose([
    ScaleMaxSideToSize(CROP_SIZE),
    CropCenter(CROP_SIZE),
    T.ToTensor(),
    T.Normalize(mean=MEAN, std=STD)])


train_path = '/content/data/hateful_memes/train.jsonl'
train_dataset = Load_Dataset(train_path, transforms)

dev_path = '/content/data/hateful_memes/dev_seen.jsonl'
dev_dataset = Load_Dataset(dev_path, transforms)

test_path = '/content/data/hateful_memes/test_seen.jsonl'
test_dataset = Load_Dataset(test_path, transforms)

## 2. Use CLIP to encode each modality to get image and text features

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
CLIP_model, preprocess = clip.load('ViT-L/14', device=device)

# Initialize RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-offensive')
roberta_model = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-offensive').to(device)

100%|████████████████████████████████████████| 890M/890M [00:07<00:00, 133MiB/s]


Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-offensive were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-offensive and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

In [9]:
def encode_features(clip_model, roberta_model, tokenizer, dataset):
  all_features = []
  all_labels = []

  with torch.no_grad():
    for images, texts, labels in tqdm(DataLoader(dataset, batch_size=100)):
      # Encode images with CLIP
      image_input = torch.tensor(np.stack(images)).to(device)
      image_features = clip_model.encode_image(image_input).type(torch.float).to(device)
      # Encode text with RoBERTa
      text_tokens = tokenizer.batch_encode_plus(texts, padding=True, return_tensors='pt').input_ids.to(device)
      text_features = roberta_model(text_tokens)[0][:, 0, :].type(torch.float).to(device)
      # Concatenate the features from different modalities into one shared vector space
      features = torch.cat([image_features, text_features], dim=1)
      all_features.extend(features)
      all_labels.extend(labels)

  return all_features, all_labels


# Get image & text features from train/dev/test set
features_train, labels_train = encode_features(CLIP_model, roberta_model, tokenizer, train_dataset)
features_dev, labels_dev = encode_features(CLIP_model, roberta_model, tokenizer, dev_dataset)
features_test, labels_test = encode_features(CLIP_model, roberta_model, tokenizer, test_dataset)

100%|██████████| 85/85 [02:20<00:00,  1.65s/it]
100%|██████████| 5/5 [00:08<00:00,  1.64s/it]
100%|██████████| 10/10 [00:15<00:00,  1.59s/it]


In [10]:
class DefineDataset(torch.utils.data.Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __getitem__(self, index: int):
    return self.features[index], self.labels[index]

  def __len__(self):
    return len(self.features)


# Define train/dev/test set using image & text features and gold labels
train_set = DefineDataset(features_train, labels_train)
dev_set = DefineDataset(features_dev, labels_dev)
test_set = DefineDataset(features_test, labels_test)

## 3. Construct a neural network for classification

In [11]:
torch.manual_seed(515)
input_shape = features_train[0].shape[0]
num_classes = 2
shape = 256

nn_model = nn.Sequential(
    nn.Linear(input_shape, shape),
    nn.Dropout(0.66),
    nn.BatchNorm1d(shape),
    nn.ReLU(inplace=True),

    nn.Linear(shape, shape),
    nn.Dropout(0.66),
    nn.BatchNorm1d(shape),
    nn.ReLU(inplace=True),

    nn.Linear(shape, num_classes),)

nn_model = nn_model.to(device)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)

## 4. Train and validate the classification model and save the best model after 100 epochs of training

In [12]:
def get_lr(optimizer):
  for param_group in optimizer.param_groups:
    return param_group['lr']  # Retrieve the learning rate value from optimizer

def compute_auroc(model, loader):
  """
  Compute AUROC on the dataset wrapped in a loader
  Return: AUROC score as a float value between 0 and 1
  """
  model.eval()
  real_labels = []
  probabilities = []

  for i_step, (x, y) in enumerate(loader):
    x = x.to(device)
    y = y.to(device)
    prediction = model(x)
    # select probabilities corresponding to the positive class
    prediction = prediction[:, 1]  # positive class in the second column
    probabilities.extend(prediction.detach().cpu().numpy())
    real_labels.extend(y.detach().cpu().numpy())

  auroc = roc_auc_score(real_labels, probabilities)*100
  return auroc

In [13]:
train_loader = DataLoader(train_set, batch_size=500)
val_loader = DataLoader(dev_set, batch_size=500)
top_val_AUROC = 70

for epoch in range(100):
  nn_model.train()
  loss_accum = 0
  for i_step, (x, y) in enumerate(train_loader):
    x = x.to(device)
    y = y.to(device)
    prediction = nn_model(x)
    loss_value = loss(prediction, y.type(torch.long))
    optimizer.zero_grad()
    loss_value.backward()
    optimizer.step()
    loss_accum += loss_value

  ave_loss = loss_accum / (i_step + 1)
  val_AUROC = compute_auroc(nn_model, val_loader)
  print("Epoch: %i lr: %f; Average loss: %f, Val AUROC: %f" % (epoch, get_lr(optimizer), ave_loss, val_AUROC))

  if scheduler != None:
    scheduler.step()

  if val_AUROC > top_val_AUROC:
    top_val_AUROC = val_AUROC
    best_model_name = f'Best_model_{round(val_AUROC, 2)}.ckpt'
    torch.save(nn_model, open(best_model_name, 'wb'))
    print("saved", best_model_name)

Epoch: 0 lr: 0.010000; Average loss: 0.626360, Val AUROC: 72.186395
saved Best_model_72.19.ckpt
Epoch: 1 lr: 0.010000; Average loss: 0.515817, Val AUROC: 74.117873
saved Best_model_74.12.ckpt
Epoch: 2 lr: 0.010000; Average loss: 0.463549, Val AUROC: 74.941992
saved Best_model_74.94.ckpt
Epoch: 3 lr: 0.010000; Average loss: 0.440608, Val AUROC: 75.513274
saved Best_model_75.51.ckpt
Epoch: 4 lr: 0.010000; Average loss: 0.427047, Val AUROC: 76.375798
saved Best_model_76.38.ckpt
Epoch: 5 lr: 0.010000; Average loss: 0.418293, Val AUROC: 76.846266
saved Best_model_76.85.ckpt
Epoch: 6 lr: 0.010000; Average loss: 0.411692, Val AUROC: 76.151766
Epoch: 7 lr: 0.010000; Average loss: 0.394203, Val AUROC: 76.892673
saved Best_model_76.89.ckpt
Epoch: 8 lr: 0.010000; Average loss: 0.382214, Val AUROC: 77.551967
saved Best_model_77.55.ckpt
Epoch: 9 lr: 0.010000; Average loss: 0.377035, Val AUROC: 77.580772
saved Best_model_77.58.ckpt
Epoch: 10 lr: 0.008000; Average loss: 0.349952, Val AUROC: 77.659183

## 5. test the best model

In [14]:
best_model = torch.load(open(best_model_name, 'rb'))
print(best_model)

Sequential(
  (0): Linear(in_features=1536, out_features=256, bias=True)
  (1): Dropout(p=0.66, inplace=False)
  (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): ReLU(inplace=True)
  (4): Linear(in_features=256, out_features=256, bias=True)
  (5): Dropout(p=0.66, inplace=False)
  (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): ReLU(inplace=True)
  (8): Linear(in_features=256, out_features=2, bias=True)
)


In [15]:
best_model = best_model.to(device)
best_model.eval()

test_loader = DataLoader(test_set, batch_size=500)
real_labels = []
predictions = []

for i_step, (x, y) in enumerate(test_loader):
  x = x.to(device)
  y = y.to(device)
  prediction = best_model(x)
  # select probabilities corresponding to the positive class
  prediction = prediction[:, 1]  # positive class in the second column
  predictions.extend(prediction.detach().cpu().numpy())
  real_labels.extend(y.detach().cpu().numpy())

auroc_score = roc_auc_score(real_labels, predictions)*100
print('AUROC:', round(auroc_score, 2))

AUROC: 81.92
