In [1]:
!nvidia-smi

Mon Jul 10 14:44:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    48W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

! unzip -q '/content/drive/MyDrive/hateful_memes.zip' -d '/content/data'

Mounted at /content/drive


In [3]:
! pip install --quiet ftfy regex tqdm
! pip install --quiet git+https://github.com/openai/CLIP.git

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone


In [4]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.image as img

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR, LinearLR, CosineAnnealingLR

import torchvision
from torchvision import transforms as T
import torchvision.transforms.functional as F

import cv2
import clip
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score

In [5]:
class ScaleMaxSideToSize(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    sample = cv2.resize(sample, (self.size, self.size), interpolation=cv2.INTER_AREA)

    return sample


class CropCenter(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    img = sample
    h, w, _ = img.shape
    margin_h = (h - self.size) // 2
    margin_w = (w - self.size) // 2
    sample = img[margin_h:margin_h + self.size, margin_w:margin_w + self.size]

    return sample

## 1. Load dataset

In [6]:
class Load_Dataset(torch.utils.data.Dataset):
  def __init__(self, data_path, transforms):
    self.data = [json.loads(l) for l in open(data_path)]
    self.data_dir = os.path.dirname(data_path)
    self.transforms = transforms

  def __getitem__(self, index: int):
    path = os.path.join(self.data_dir, self.data[index]["img"])
    image = cv2.imread(path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    text = self.data[index]["text"]
    label = self.data[index]["label"]

    if self.transforms is not None:
        image = self.transforms(image)

    return image, text, label

  def __len__(self):
    return len(self.data)

In [7]:
CROP_SIZE = 224
MEAN = torch.tensor([0.485, 0.456, 0.406])
STD = torch.tensor([0.229, 0.224, 0.225])

transforms = T.Compose([
    ScaleMaxSideToSize(CROP_SIZE),
    CropCenter(CROP_SIZE),
    T.ToTensor(),
    T.Normalize(mean=MEAN, std=STD)])


train_path = '/content/data/hateful_memes/train.jsonl'
train_dataset = Load_Dataset(train_path, transforms)

dev_path = '/content/data/hateful_memes/dev_seen.jsonl'
dev_dataset = Load_Dataset(dev_path, transforms)

test_path = '/content/data/hateful_memes/test_seen.jsonl'
test_dataset = Load_Dataset(test_path, transforms)

## 2. Use CLIP to encode each modality to get image and text features

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
CLIP_model, preprocess = clip.load('ViT-L/14', device=device)

100%|████████████████████████████████████████| 890M/890M [00:08<00:00, 115MiB/s]


In [9]:
def encode_features(model, dataset):
  all_features = []
  all_labels = []

  with torch.no_grad():
    for images, texts, labels in tqdm(DataLoader(dataset, batch_size=100)):
      image_input = torch.tensor(np.stack(images)).to(device)
      text_tokens = clip.tokenize([desc[:77] for desc in texts]).to(device)

      image_features = model.encode_image(image_input).type(torch.float).to(device)
      text_features = model.encode_text(text_tokens).type(torch.float).to(device)

      features = torch.cat([image_features, text_features], dim=1)

      all_features.extend(features)
      all_labels.extend(labels)

  return all_features, all_labels


# Use CLIP to encode each modality to get image & text features
features_train, labels_train = encode_features(CLIP_model, train_dataset)
features_dev, labels_dev = encode_features(CLIP_model, dev_dataset)
features_test, labels_test = encode_features(CLIP_model, test_dataset)

100%|██████████| 85/85 [02:20<00:00,  1.65s/it]
100%|██████████| 5/5 [00:08<00:00,  1.64s/it]
100%|██████████| 10/10 [00:16<00:00,  1.61s/it]


In [10]:
class DefineDataset(torch.utils.data.Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __getitem__(self, index: int):
    return self.features[index], self.labels[index]

  def __len__(self):
    return len(self.features)


# Define train/dev/test set using image & text features and gold labels
train_set = DefineDataset(features_train, labels_train)
dev_set = DefineDataset(features_dev, labels_dev)
test_set = DefineDataset(features_test, labels_test)

## 3. Hhyper-parameters tuning for learning rate, batch size, scheduler type, and save all best performing models based on the validation AUROC score

In [11]:
def compute_auroc(model, loader):
  """
  Compute AUROC on the dataset wrapped in a loader
  Return: AUROC score as a float value between 0 and 1
  """
  model.eval()
  real_labels = []
  probabilities = []

  for i_step, (x, y) in enumerate(loader):
    x = x.to(device)
    y = y.to(device)
    prediction = model(x)
    # select probabilities corresponding to the positive class
    prediction = prediction[:, 1]  # positive class in the second column
    probabilities.extend(prediction.detach().cpu().numpy())
    real_labels.extend(y.detach().cpu().numpy())

  auroc = roc_auc_score(real_labels, probabilities)*100

  return auroc

In [12]:
input_shape = features_train[0].shape[0]
num_classes = 2
torch.manual_seed(515)
shape = 256

# Define hyperparameters
maximum_epochs = [50, 100]
learning_rates = [1e-2, 1e-3, 1e-4, 1e-5]
batch_sizes = [500, 900]
schedulers = ['StepLR', 'LinearLR', 'CosineAnnealingLR']
top_val_AUROC = 81.62

best_models = []

for epochs in maximum_epochs:
  for lr in learning_rates:
    for bs in batch_sizes:
      for sched in schedulers:
        # Construct a neural network for classification
        nn_model = nn.Sequential(
            nn.Linear(input_shape, shape),
            nn.Dropout(0.66),
            nn.BatchNorm1d(shape),
            nn.ReLU(inplace=True),

            nn.Linear(shape, shape),
            nn.Dropout(0.66),
            nn.BatchNorm1d(shape),
            nn.ReLU(inplace=True),

            nn.Linear(shape, num_classes),)

        nn_model = nn_model.to(device)
        loss = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(nn_model.parameters(), lr=lr)

        if sched == 'StepLR':
          scheduler = StepLR(optimizer, step_size=10, gamma=0.8)
        if sched == 'LinearLR':
          scheduler = LinearLR(optimizer)
        if sched == 'CosineAnnealingLR':
          scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)

        train_loader = DataLoader(train_set, batch_size=bs)
        val_loader = DataLoader(dev_set, batch_size=bs)

        for epoch in range(epochs):
          nn_model.train()
          loss_accum = 0
          for i_step, (x, y) in enumerate(train_loader):
            x = x.to(device)
            y = y.to(device)
            prediction = nn_model(x)
            loss_value = loss(prediction, y.type(torch.long))
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
            loss_accum += loss_value

          ave_loss = loss_accum / (i_step + 1)
          val_AUROC = compute_auroc(nn_model, val_loader)
          if scheduler != None:
            scheduler.step()

          # Save the best models based on validation AUROC
          if val_AUROC > top_val_AUROC:
            top_val_AUROC = val_AUROC
            m_name = f'Epoch_{epoch}_of_{epochs}_Learning_rate_{lr}_Batch_size_{bs}_Scheduler_{sched}.ckpt'
            torch.save(nn_model, open(m_name, 'wb'))
            best_models.append(m_name)
            print('Saved Model:', m_name, ' Validation AUROC:', round(val_AUROC, 2))

Saved Model: Epoch_43_of_50_Learning_rate_0.01_Batch_size_500_Scheduler_StepLR.ckpt  Validation AUROC: 81.62
Saved Model: Epoch_47_of_50_Learning_rate_0.01_Batch_size_500_Scheduler_LinearLR.ckpt  Validation AUROC: 81.99
Saved Model: Epoch_28_of_100_Learning_rate_0.01_Batch_size_500_Scheduler_StepLR.ckpt  Validation AUROC: 82.08
Saved Model: Epoch_61_of_100_Learning_rate_0.01_Batch_size_500_Scheduler_StepLR.ckpt  Validation AUROC: 82.15
Saved Model: Epoch_85_of_100_Learning_rate_0.01_Batch_size_500_Scheduler_LinearLR.ckpt  Validation AUROC: 82.41


## 4. Perform soft voting method for ensemble learning by averaging the predictions of best performing models

In [13]:
val_all_predictions = []

for best_model_name in best_models:
    best_model = torch.load(open(best_model_name, 'rb'))
    best_model = best_model.to(device)
    best_model.eval()

    val_loader = DataLoader(dev_set, batch_size=500)
    val_real_labels = []
    val_predictions = []

    for i_step, (x, y) in enumerate(val_loader):
        x = x.to(device)
        y = y.to(device)
        prediction = best_model(x)
        # select probabilities corresponding to the positive class
        prediction = prediction[:, 1]  # positive class in the second column
        val_predictions.extend(prediction.detach().cpu().numpy())
        val_real_labels.extend(y.detach().cpu().numpy())

    # append predictions to all_predictions
    val_all_predictions.append(val_predictions)

# convert list to numpy arrays for easier manipulation
val_all_predictions = np.array(val_all_predictions)

# perform soft voting by taking the average predicted probabilities
val_final_predictions = val_all_predictions.mean(axis=0)

# compute and print the AUROC for the final prediction
val_auroc_score = roc_auc_score(val_real_labels, val_final_predictions)*100
print('Validation AUROC:', round(val_auroc_score, 2))

Validation AUROC: 82.94


In [14]:
test_all_predictions = []

for best_model_name in best_models:
    best_model = torch.load(open(best_model_name, 'rb'))
    best_model = best_model.to(device)
    best_model.eval()

    test_loader = DataLoader(test_set, batch_size=500)
    test_real_labels = []
    test_predictions = []

    for i_step, (x, y) in enumerate(test_loader):
        x = x.to(device)
        y = y.to(device)
        prediction = best_model(x)
        # select probabilities corresponding to the positive class
        prediction = prediction[:, 1]  # positive class in the second column
        test_predictions.extend(prediction.detach().cpu().numpy())
        test_real_labels.extend(y.detach().cpu().numpy())

    # append predictions to all_predictions
    test_all_predictions.append(test_predictions)

# convert list to numpy arrays for easier manipulation
test_all_predictions = np.array(test_all_predictions)

# perform soft voting by taking the average predicted probabilities
test_final_predictions = test_all_predictions.mean(axis=0)

# compute and print the AUROC for the final prediction
test_auroc_score = roc_auc_score(test_real_labels, test_final_predictions)*100
print('Test AUROC:', round(test_auroc_score, 2))

Test AUROC: 83.82
