In [1]:
!nvidia-smi

Wed Jul 19 21:47:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    48W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

! unzip -q '/content/drive/MyDrive/hateful_memes.zip' -d '/content/data'

Mounted at /content/drive


In [3]:
! pip install --quiet ftfy regex tqdm
! pip install --quiet git+https://github.com/openai/CLIP.git

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone


In [4]:
import os
import csv
import json
import pandas as pd
import numpy as np
import matplotlib.image as img

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR, LinearLR

import torchvision
from torchvision import transforms as T
import torchvision.transforms.functional as F

import cv2
import clip
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score

## 1. Integrate image captioning features as another column to the dataset

In [5]:
# Read the caption information and store them in a dictionary using img path as key
caption = {}
with open('/content/drive/MyDrive/BLIP_2_caption.csv', 'r') as csvfile:
  csvreader = csv.reader(csvfile)
  for row in csvreader:
    caption[row[0]] = row[1]

file_path = ['/content/data/hateful_memes/train.jsonl',
             '/content/data/hateful_memes/dev_seen.jsonl',
             '/content/data/hateful_memes/test_seen.jsonl']

for path in file_path:
  # Add caption information as another column to the dataset
  data = []
  with open(path, 'r') as jsonfile:
    for line in jsonfile:
      data.append(json.loads(line))

  combined_data = []
  for item in data:
    img = item['img']
    combined_data.append({'id': item['id'], 'img': img, 'label': item['label'], 'text': item['text'], 'caption': caption[img]})

  os.remove(path)
  with open(path, 'w') as file:
    for i in combined_data:
      file.write(json.dumps(i) + '\n')

## 2. Load dataset

In [6]:
class ScaleMaxSideToSize(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    sample = cv2.resize(sample, (self.size, self.size), interpolation=cv2.INTER_AREA)

    return sample


class CropCenter(object):
  def __init__(self, size):
    self.size = size

  def __call__(self, sample):
    img = sample
    h, w, _ = img.shape
    margin_h = (h - self.size) // 2
    margin_w = (w - self.size) // 2
    sample = img[margin_h:margin_h + self.size, margin_w:margin_w + self.size]

    return sample

In [7]:
class Load_Dataset(torch.utils.data.Dataset):
  def __init__(self, data_path, transforms):
    self.data = [json.loads(l) for l in open(data_path)]
    self.data_dir = os.path.dirname(data_path)
    self.transforms = transforms

  def __getitem__(self, index: int):
    path = os.path.join(self.data_dir, self.data[index]["img"])
    image = cv2.imread(path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    id = self.data[index]["id"]
    text = self.data[index]["text"]
    label = self.data[index]["label"]
    caption = self.data[index]["caption"]

    if self.transforms is not None:
        image = self.transforms(image)

    return id, image, text, label, caption

  def __len__(self):
    return len(self.data)

In [8]:
CROP_SIZE = 224
MEAN = torch.tensor([0.485, 0.456, 0.406])
STD = torch.tensor([0.229, 0.224, 0.225])

transforms = T.Compose([
    ScaleMaxSideToSize(CROP_SIZE),
    CropCenter(CROP_SIZE),
    T.ToTensor(),
    T.Normalize(mean=MEAN, std=STD)])


train_path = '/content/data/hateful_memes/train.jsonl'
train_dataset = Load_Dataset(train_path, transforms)

dev_path = '/content/data/hateful_memes/dev_seen.jsonl'
dev_dataset = Load_Dataset(dev_path, transforms)

test_path = '/content/data/hateful_memes/test_seen.jsonl'
test_dataset = Load_Dataset(test_path, transforms)

## 3. Use CLIP to encode each modality to get image, text and caption features

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
CLIP_model, preprocess = clip.load('ViT-L/14', device=device)

100%|████████████████████████████████████████| 890M/890M [00:06<00:00, 137MiB/s]


In [10]:
def encode_features(model, dataset):
  all_ids = []
  all_features = []
  all_labels = []

  with torch.no_grad():
    for ids, images, texts, labels, caption in tqdm(DataLoader(dataset, batch_size=100)):
      image_input = torch.tensor(np.stack(images)).to(device)
      text_tokens = clip.tokenize([desc[:77] for desc in texts]).to(device)
      caption_tokens = clip.tokenize([desc for desc in caption]).to(device)

      image_features = model.encode_image(image_input).type(torch.float).to(device)
      text_features = model.encode_text(text_tokens).type(torch.float).to(device)
      caption_features = model.encode_text(caption_tokens).type(torch.float).to(device)

      features = torch.cat([image_features, text_features, caption_features], dim=1)
      all_ids.extend(ids)
      all_features.extend(features)
      all_labels.extend(labels)

  return all_ids, all_features, all_labels


# Use CLIP to encode each modality to get image, text and caption features
ids_train, features_train, labels_train = encode_features(CLIP_model, train_dataset)
ids_dev, features_dev, labels_dev = encode_features(CLIP_model, dev_dataset)
ids_test, features_test, labels_test = encode_features(CLIP_model, test_dataset)

100%|██████████| 85/85 [02:23<00:00,  1.69s/it]
100%|██████████| 5/5 [00:08<00:00,  1.68s/it]
100%|██████████| 10/10 [00:16<00:00,  1.64s/it]


In [11]:
class DefineDataset(torch.utils.data.Dataset):
  def __init__(self, ids, features, labels):
    self.ids = ids
    self.features = features
    self.labels = labels

  def __getitem__(self, index: int):
    return self.ids[index], self.features[index], self.labels[index]

  def __len__(self):
    return len(self.features)


# Define train/dev/test set using image & text features and gold labels
train_set = DefineDataset(ids_train, features_train, labels_train)
dev_set = DefineDataset(ids_dev, features_dev, labels_dev)
test_set = DefineDataset(ids_test, features_test, labels_test)

## 4. Hhyper-parameters tuning for maximum epochs, learning rate, scheduler type, and save all best performing models based on the validation AUROC score

In [12]:
def get_lr(optimizer):
  for param_group in optimizer.param_groups:
    return param_group['lr']  # Retrieve the learning rate value from optimizer

def compute_auroc(model, loader):
  """
  Compute AUROC on the dataset wrapped in a loader
  Return: AUROC score as a float value between 0 and 1
  """
  model.eval()
  real_labels = []
  probabilities = []

  for i_step, (i, x, y) in enumerate(loader):
    x = x.to(device)
    y = y.to(device)
    prediction = model(x)
    # select probabilities corresponding to the positive class
    prediction = prediction[:, 1]  # positive class in the second column
    probabilities.extend(prediction.detach().cpu().numpy())
    real_labels.extend(y.detach().cpu().numpy())

  auroc = roc_auc_score(real_labels, probabilities)*100
  return auroc

In [13]:
input_shape = features_train[0].shape[0]
num_classes = 2
torch.manual_seed(78)
shape = 256

# Define hyperparameters
dropout_rates = [0.6, 0.7]
activation_functions = ['ReLU', 'LeakyReLU', 'ELU']
optimizers = ['AdamW', 'Adam']
learning_rates = [1e-2, 1e-3]
schedulers = ['StepLR', 'LinearLR']
top_val_AUROC = 81.79

best_models = []

for dropout_rate in dropout_rates:
  for act_func in activation_functions:
    for optimizer_name in optimizers:
      for lr in learning_rates:
        for sched in schedulers:

          # Construct a neural network for classification
          nn_model = nn.Sequential(
              nn.Linear(input_shape, shape),
              nn.Dropout(dropout_rate),
              nn.BatchNorm1d(shape),
              getattr(nn, act_func)(inplace=True),

              nn.Linear(shape, shape),
              nn.Dropout(dropout_rate),
              nn.BatchNorm1d(shape),
              getattr(nn, act_func)(inplace=True),

              nn.Linear(shape, num_classes),)

          nn_model = nn_model.to(device)
          loss = nn.CrossEntropyLoss()
          optimizer = torch.optim.Adam(nn_model.parameters(), lr=lr)

          if optimizer_name == 'Adam':
            optimizer = torch.optim.Adam(nn_model.parameters(), lr=lr)
          if optimizer_name == 'AdamW':
            optimizer = torch.optim.AdamW(nn_model.parameters(), lr=lr)

          if sched == 'StepLR':
            scheduler = StepLR(optimizer, step_size=10, gamma=0.8)
          if sched == 'LinearLR':
            scheduler = LinearLR(optimizer)

          train_loader = DataLoader(train_set, batch_size=500)
          val_loader = DataLoader(dev_set, batch_size=500)

          for epoch in range(100):
            nn_model.train()
            loss_accum = 0
            for i_step, (i, x, y) in enumerate(train_loader):
              x = x.to(device)
              y = y.to(device)
              prediction = nn_model(x)
              loss_value = loss(prediction, y.type(torch.long))
              optimizer.zero_grad()
              loss_value.backward()
              optimizer.step()
              loss_accum += loss_value

            ave_loss = loss_accum / (i_step + 1)
            val_AUROC = compute_auroc(nn_model, val_loader)
            if scheduler != None:
              scheduler.step()

            # Save the best models based on validation AUROC
            if val_AUROC > top_val_AUROC:
              top_val_AUROC = val_AUROC
              m_name = f'Epoch_{epoch}_Dropout_{dropout_rate}_Activation_{act_func}_Optimizer_{optimizer_name}_Learning_rate_{lr}_Scheduler_{sched}.ckpt'
              torch.save(nn_model, open(m_name, 'wb'))
              best_models.append(m_name)
              print('Saved Model:', m_name, ' Validation AUROC:', round(val_AUROC, 2))

Saved Model: Epoch_57_Dropout_0.6_Activation_ReLU_Optimizer_AdamW_Learning_rate_0.01_Scheduler_StepLR.ckpt  Validation AUROC: 81.79
Saved Model: Epoch_15_Dropout_0.6_Activation_ReLU_Optimizer_Adam_Learning_rate_0.01_Scheduler_StepLR.ckpt  Validation AUROC: 82.09
Saved Model: Epoch_50_Dropout_0.6_Activation_ReLU_Optimizer_Adam_Learning_rate_0.01_Scheduler_LinearLR.ckpt  Validation AUROC: 82.16
Saved Model: Epoch_83_Dropout_0.6_Activation_LeakyReLU_Optimizer_AdamW_Learning_rate_0.01_Scheduler_LinearLR.ckpt  Validation AUROC: 82.2
Saved Model: Epoch_87_Dropout_0.6_Activation_LeakyReLU_Optimizer_AdamW_Learning_rate_0.01_Scheduler_LinearLR.ckpt  Validation AUROC: 82.33
Saved Model: Epoch_68_Dropout_0.7_Activation_ReLU_Optimizer_AdamW_Learning_rate_0.01_Scheduler_LinearLR.ckpt  Validation AUROC: 82.45
Saved Model: Epoch_50_Dropout_0.7_Activation_LeakyReLU_Optimizer_AdamW_Learning_rate_0.01_Scheduler_LinearLR.ckpt  Validation AUROC: 82.47
Saved Model: Epoch_88_Dropout_0.7_Activation_LeakyReLU

## 5. Perform soft voting method for ensemble learning by averaging the predictions of best performing models

In [14]:
# Define functions to calculate average predicted probabilities and labels from all models

def calculate_average_proba(arrays_list):
    num_arrays = len(arrays_list)
    array_size = len(arrays_list[0])
    averaged_proba = []

    for i in range(array_size):
        total = sum(arr[i] for arr in arrays_list)
        avg = total / num_arrays
        averaged_proba.append(avg)

    return averaged_proba

def calculate_average_label(arrays_list):
    num_sublists = len(arrays_list)
    sublist_size = len(arrays_list[0])
    averaged_labels = []

    for i in range(sublist_size):
        total = sum(sublist[i] for sublist in arrays_list)
        avg = total / num_sublists
        avg_label = 1 if avg >= 0.5 else 0
        averaged_labels.append(avg_label)

    return averaged_labels

In [15]:
models = [] # Load best performing models and store them in a list
for best_model_name in best_models:
  best_model = torch.load(open(best_model_name, 'rb'))
  best_model = best_model.to(device)
  best_model.eval()
  models.append(best_model)

In [16]:
val_loader = DataLoader(dev_set, batch_size=500)
val_real_label = []
val_pred_label = []
val_pred_proba = []

for i_step, (i, x, y) in enumerate(val_loader):
  x = x.to(device)
  y = y.to(device)
  val_real_label.extend(y.detach().cpu().numpy())

  # Get predicted probabilities and labels from each model
  for model in models:
    prediction = model(x)
    val_pred_proba.append(prediction[:, 1].detach().cpu().numpy())
    val_pred_label.append(torch.max(prediction.cpu(), 1)[1])

# Calculate average predicted probabilities and labels from all models
val_ensemble_proba = calculate_average_proba(val_pred_proba)
val_ensemble_label = calculate_average_label(val_pred_label)

# Compute the AUROC score for the ensemble predictions on validation set
auroc_score = roc_auc_score(val_real_label, val_ensemble_proba)*100
print('Validation AUROC:', round(auroc_score, 2))

Validation AUROC: 83.46


In [17]:
test_loader = DataLoader(test_set, batch_size=1000)
test_real_label = []
test_pred_label = []
test_pred_proba = []

for i_step, (i, x, y) in enumerate(test_loader):
  x = x.to(device)
  y = y.to(device)
  test_real_label.extend(y.detach().cpu().numpy())

  # Get predicted probabilities and labels from each model
  for model in models:
    prediction = model(x)
    test_pred_proba.append(prediction[:, 1].detach().cpu().numpy())
    test_pred_label.append(torch.max(prediction.cpu(), 1)[1])

# Calculate average predicted probabilities and labels from all models
test_ensemble_proba = calculate_average_proba(test_pred_proba)
test_ensemble_label = calculate_average_label(test_pred_label)

# Compute the AUROC score for the ensemble predictions on test set
auroc_score = roc_auc_score(test_real_label, test_ensemble_proba)*100
print('Test AUROC:', round(auroc_score, 2))

Test AUROC: 83.23


In [18]:
# Save the prediction results as a csv file
data = {'id': ids_test, 'proba': test_ensemble_proba, 'label': test_ensemble_label}
results = pd.DataFrame(data)
file_path = "/content/data/CLIP_ensemble.csv"
results.to_csv(file_path, index=False)

# Save the csv file in Google Drive
!cp -r /content/data/CLIP_ensemble.csv /content/drive/MyDrive/results