In [1]:
!nvidia-smi

Fri Jul  7 12:55:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    45W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

! unzip -q '/content/drive/MyDrive/hateful_memes.zip' -d '/content/data'

Mounted at /content/drive


In [3]:
! pip install --quiet ftfy regex tqdm
! pip install --quiet git+https://github.com/openai/CLIP.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone


In [4]:
import os
import csv
import json
import pandas as pd
import numpy as np
import matplotlib.image as img

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision
from torchvision import transforms as T
import torchvision.transforms.functional as F

import cv2
import clip
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score

## 1. Integrate image captioning features as another column to the dataset

In [5]:
# Read the caption information and store them in a dictionary using img path as key
caption = {}
with open('/content/drive/MyDrive/clipcap_caption.csv', 'r') as csvfile:
  csvreader = csv.reader(csvfile)
  for row in csvreader:
    caption['img/'+row[0]] = row[1]

file_path = ['/content/data/hateful_memes/train.jsonl',
             '/content/data/hateful_memes/dev_seen.jsonl',
             '/content/data/hateful_memes/test_seen.jsonl']

for path in file_path:
  # Add caption information as another column to the dataset
  data = []
  with open(path, 'r') as jsonfile:
    for line in jsonfile:
      data.append(json.loads(line))

  combined_data = []
  for item in data:
    img = item['img']
    combined_data.append({'id': item['id'], 'img': img, 'label': item['label'], 'text': item['text'], 'caption': caption[img]})

  os.remove(path)
  with open(path, 'w') as file:
    for i in combined_data:
      file.write(json.dumps(i) + '\n')

## 2. Load dataset

In [6]:
class ScaleMaxSideToSize(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    sample = cv2.resize(sample, (self.size, self.size), interpolation=cv2.INTER_AREA)

    return sample


class CropCenter(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    img = sample
    h, w, _ = img.shape
    margin_h = (h - self.size) // 2
    margin_w = (w - self.size) // 2
    sample = img[margin_h:margin_h + self.size, margin_w:margin_w + self.size]

    return sample

In [7]:
class Load_Dataset(torch.utils.data.Dataset):
  def __init__(self, data_path, transforms):
    self.data = [json.loads(l) for l in open(data_path)]
    self.data_dir = os.path.dirname(data_path)
    self.transforms = transforms

  def __getitem__(self, index: int):
    path = os.path.join(self.data_dir, self.data[index]["img"])
    image = cv2.imread(path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    text = self.data[index]["text"]
    label = self.data[index]["label"]
    caption = self.data[index]["caption"]

    if self.transforms is not None:
        image = self.transforms(image)

    return image, text, label, path, caption

  def __len__(self):
    return len(self.data)

In [8]:
CROP_SIZE = 336
MEAN = torch.tensor([0.485, 0.456, 0.406])
STD = torch.tensor([0.229, 0.224, 0.225])

transforms = T.Compose([
    ScaleMaxSideToSize(CROP_SIZE),
    CropCenter(CROP_SIZE),
    T.ToTensor(),
    T.Normalize(mean=MEAN, std=STD)])


train_path = '/content/data/hateful_memes/train.jsonl'
train_dataset = Load_Dataset(train_path, transforms)

dev_path = '/content/data/hateful_memes/dev_seen.jsonl'
dev_dataset = Load_Dataset(dev_path, transforms)

test_path = '/content/data/hateful_memes/test_seen.jsonl'
test_dataset = Load_Dataset(test_path, transforms)

## 3. Use CLIP to encode each modality to get image, text and caption features

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
CLIP_model, preprocess = clip.load('ViT-L/14@336px', device=device)

100%|████████████████████████████████████████| 891M/891M [00:08<00:00, 104MiB/s]


In [10]:
def encode_features(model, dataset):
  all_features = []
  all_labels = []

  with torch.no_grad():
    for images, texts, labels, path, caption in tqdm(DataLoader(dataset, batch_size=100)):
      image_input = torch.tensor(np.stack(images)).to(device)
      text_tokens = clip.tokenize([desc[:77] for desc in texts]).to(device)
      caption_tokens = clip.tokenize(desc for desc in caption).to(device)

      image_features = model.encode_image(image_input).type(torch.float).to(device)
      text_features = model.encode_text(text_tokens).type(torch.float).to(device)
      caption_features = model.encode_text(caption_tokens).type(torch.float).to(device)

      features = torch.cat([image_features, text_features, caption_features], dim=1)
      all_features.extend(features)
      all_labels.extend(labels)

  return all_features, all_labels


# Use CLIP to encode each modality to get image, text and caption features
features_train, labels_train = encode_features(CLIP_model, train_dataset)
features_dev, labels_dev = encode_features(CLIP_model, dev_dataset)
features_test, labels_test = encode_features(CLIP_model, test_dataset)

100%|██████████| 85/85 [02:45<00:00,  1.95s/it]
100%|██████████| 5/5 [00:09<00:00,  1.92s/it]
100%|██████████| 10/10 [00:19<00:00,  1.90s/it]


In [11]:
class DefineDataset(torch.utils.data.Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __getitem__(self, index: int):
    return self.features[index], self.labels[index]

  def __len__(self):
    return len(self.features)


# Define train/dev/test set using image/text/caption features and gold labels
train_set = DefineDataset(features_train, labels_train)
dev_set = DefineDataset(features_dev, labels_dev)
test_set = DefineDataset(features_test, labels_test)

## 4. Construct a neural network for classification

In [12]:
torch.manual_seed(515)
input_shape = features_train[0].shape[0]
num_classes = 2
shape = 256

nn_model = nn.Sequential(
    nn.Linear(input_shape, shape),
    nn.Dropout(0.66),
    nn.BatchNorm1d(shape),
    nn.ReLU(inplace=True),

    nn.Linear(shape, shape),
    nn.Dropout(0.66),
    nn.BatchNorm1d(shape),
    nn.ReLU(inplace=True),

    nn.Linear(shape, num_classes),)

nn_model = nn_model.to(device)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)

## 5. Train and validate the classification model and save the best model after 100 epochs of training

In [13]:
def get_lr(optimizer):
  for param_group in optimizer.param_groups:
    return param_group['lr']  # Retrieve the learning rate value from optimizer

def compute_auroc(model, loader):
  """
  Compute AUROC on the dataset wrapped in a loader
  Return: AUROC score as a float value between 0 and 1
  """
  model.eval()
  real_labels = []
  probabilities = []

  for i_step, (x, y) in enumerate(loader):
    x = x.to(device)
    y = y.to(device)
    prediction = model(x)
    # select probabilities corresponding to the positive class
    prediction = prediction[:, 1]  # positive class in the second column
    probabilities.extend(prediction.detach().cpu().numpy())
    real_labels.extend(y.detach().cpu().numpy())

  auroc = roc_auc_score(real_labels, probabilities)*100
  return auroc

In [14]:
train_loader = DataLoader(train_set, batch_size=500)
val_loader = DataLoader(dev_set, batch_size=500)
top_val_AUROC = 70

for epoch in range(100):
  nn_model.train()
  loss_accum = 0
  for i_step, (x, y) in enumerate(train_loader):
    x = x.to(device)
    y = y.to(device)
    prediction = nn_model(x)
    loss_value = loss(prediction, y.type(torch.long))
    optimizer.zero_grad()
    loss_value.backward()
    optimizer.step()
    loss_accum += loss_value

  ave_loss = loss_accum / (i_step + 1)
  val_AUROC = compute_auroc(nn_model, val_loader)
  print("Epoch: %i lr: %f; Average loss: %f, Val AUROC: %f" % (epoch, get_lr(optimizer), ave_loss, val_AUROC))

  if scheduler != None:
    scheduler.step()

  if val_AUROC > top_val_AUROC:
    top_val_AUROC = val_AUROC
    best_model_name = f'Best_model_{round(val_AUROC, 2)}.ckpt'
    torch.save(nn_model, open(best_model_name, 'wb'))
    print("saved", best_model_name)

Epoch: 0 lr: 0.010000; Average loss: 0.605628, Val AUROC: 71.952761
saved Best_model_71.95.ckpt
Epoch: 1 lr: 0.010000; Average loss: 0.495799, Val AUROC: 74.236290
saved Best_model_74.24.ckpt
Epoch: 2 lr: 0.010000; Average loss: 0.429401, Val AUROC: 75.466867
saved Best_model_75.47.ckpt
Epoch: 3 lr: 0.010000; Average loss: 0.401561, Val AUROC: 76.555024
saved Best_model_76.56.ckpt
Epoch: 4 lr: 0.010000; Average loss: 0.373654, Val AUROC: 77.619177
saved Best_model_77.62.ckpt
Epoch: 5 lr: 0.010000; Average loss: 0.356107, Val AUROC: 77.719992
saved Best_model_77.72.ckpt
Epoch: 6 lr: 0.010000; Average loss: 0.332510, Val AUROC: 78.009633
saved Best_model_78.01.ckpt
Epoch: 7 lr: 0.010000; Average loss: 0.318210, Val AUROC: 78.360084
saved Best_model_78.36.ckpt
Epoch: 8 lr: 0.010000; Average loss: 0.285812, Val AUROC: 78.284873
Epoch: 9 lr: 0.010000; Average loss: 0.283041, Val AUROC: 77.848010
Epoch: 10 lr: 0.008000; Average loss: 0.244952, Val AUROC: 78.230465
Epoch: 11 lr: 0.008000; Ave

## 6. test the best model

In [15]:
best_model = torch.load(open(best_model_name, 'rb'))
print(best_model)

Sequential(
  (0): Linear(in_features=2304, out_features=256, bias=True)
  (1): Dropout(p=0.66, inplace=False)
  (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): ReLU(inplace=True)
  (4): Linear(in_features=256, out_features=256, bias=True)
  (5): Dropout(p=0.66, inplace=False)
  (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): ReLU(inplace=True)
  (8): Linear(in_features=256, out_features=2, bias=True)
)


In [16]:
best_model = best_model.to(device)
best_model.eval()

test_loader = DataLoader(test_set, batch_size=500)
real_labels = []
predictions = []

for i_step, (x, y) in enumerate(test_loader):
  x = x.to(device)
  y = y.to(device)
  prediction = best_model(x)
  # select probabilities corresponding to the positive class
  prediction = prediction[:, 1]  # positive class in the second column
  predictions.extend(prediction.detach().cpu().numpy())
  real_labels.extend(y.detach().cpu().numpy())

auroc_score = roc_auc_score(real_labels, predictions)*100
print('AUROC:', round(auroc_score, 2))

AUROC: 80.97
