In [37]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import os

os.chdir("/content/drive/My Drive/Colab Notebooks/cs7643")
print("Current working directory:", os.getcwd())

Current working directory: /content/drive/My Drive/Colab Notebooks/cs7643


In [39]:
import os
import urllib.request
import tarfile

# URL of the Hateful Memes features archive
URL = "https://dl.fbaipublicfiles.com/mmf/data/datasets/hateful_memes/defaults/features/features_2020_10_01.tar.gz"
TAR_PATH = "features_2020_10_01.tar.gz"
EXTRACT_DIR = "detectron.lmdb"

# 1. Download the tar.gz file (if not already downloaded)
if not os.path.exists(TAR_PATH):
    print(f"Downloading from {URL} ...")
    urllib.request.urlretrieve(URL, TAR_PATH)
    print(f"Downloaded to {TAR_PATH}")
else:
    print(f"{TAR_PATH} already exists, skipping download.")

# 2. Extract the tar.gz file (if not already extracted)
if not os.path.exists(EXTRACT_DIR):
    print(f"Extracting {TAR_PATH} ...")
    with tarfile.open(TAR_PATH, "r:gz") as tar:
        tar.extractall()
    print(f"Extraction complete. Files are in: {EXTRACT_DIR}/")
else:
    print(f"{EXTRACT_DIR}/ already exists, skipping extraction.")

features_2020_10_01.tar.gz already exists, skipping download.
detectron.lmdb/ already exists, skipping extraction.


In [40]:
!pip install lmdb



In [41]:
import lmdb
import torch
import pickle
from torch.utils.data import Dataset


class HatefulMemesDataset(Dataset):
    def __init__(self, hf_split, lmdb_path, tokenizer):
        """
        hf_split: one split from the HF DatasetDict (e.g. hf_ds['train'])
        lmdb_env: opened lmdb.Environment
        tokenizer: HuggingFace tokenizer (optional)
        """
        self.data = hf_split
        self.lmdb_path = lmdb_path
        self.env = None
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def _get_image_id(self, img_path):
        # "img/40259.png" -> "40259"
        return img_path.split("/")[-1].split(".")[0]

    def _load_visual_feats(self, img_id):
        if self.env is None:  # opened separately in each worker
            self.env = lmdb.open(
                self.lmdb_path,
                readonly=True,
                lock=False,
                readahead=False,
                meminit=False,
            )
        with self.env.begin(write=False) as txn:
            buf = txn.get(img_id.encode("utf-8"))
            sample = pickle.loads(buf)
        feats = torch.tensor(
            sample["features"], dtype=torch.float32
        )  # (num_boxes, 2048)
        bbox = torch.tensor(sample["bbox"], dtype=torch.float32)  # (num_boxes, 4)
        return feats, bbox

    def __getitem__(self, idx):
        row = self.data[idx]
        text = row["text"]
        img_path = row["img"]
        label = row["label"]

        img_id = self._get_image_id(img_path)
        visual_embeds, _ = self._load_visual_feats(img_id)

        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=48,
            return_tensors="pt",
        )

        item = {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "token_type_ids": encoded["token_type_ids"].squeeze(0),
            "visual_embeds": visual_embeds,
            "visual_attention_mask": torch.ones(
                visual_embeds.size(0), dtype=torch.long
            ),
            "visual_token_type_ids": torch.zeros(
                visual_embeds.size(0), dtype=torch.long
            ),
            "label": torch.tensor(label, dtype=torch.long),
        }

        return item

In [42]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import VisualBertModel, VisualBertConfig, BertTokenizer

lmdb_path = "detectron.lmdb"
dataset = load_dataset("neuralcatcher/hateful_memes")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

loader_train = DataLoader(
    HatefulMemesDataset(dataset["train"], lmdb_path, tokenizer),
    batch_size=128,
    shuffle=True,
    num_workers=16,
)

loader_validation = DataLoader(
    HatefulMemesDataset(dataset["validation"], lmdb_path, tokenizer),
    batch_size=265,
    shuffle=False,
    num_workers=16,
)

loader_test = DataLoader(
    HatefulMemesDataset(dataset["test"], lmdb_path, tokenizer),
    batch_size=265,
    shuffle=False,
    num_workers=16,
)

Repo card metadata block was not found. Setting CardData to empty.


In [43]:
import torch.nn as nn
from transformers import VisualBertModel


class VisualBertForClassification(nn.Module):
    def __init__(self, num_labels=2):
        super().__init__()
        self.visualbert = VisualBertModel.from_pretrained(
            "uclanlp/visualbert-vqa-coco-pre"
        )
        hidden_size = self.visualbert.config.hidden_size  # usually 768
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        visual_embeds,
        visual_attention_mask,
        visual_token_type_ids,
    ):
        outputs = self.visualbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            visual_embeds=visual_embeds,
            visual_attention_mask=visual_attention_mask,
            visual_token_type_ids=visual_token_type_ids,
        )

        pooled = outputs.pooler_output  # (B, 768)
        logits = self.classifier(pooled)
        return logits

In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

model = VisualBertForClassification().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

Using cuda


In [45]:
keys2pass = [
    "input_ids",
    "attention_mask",
    "token_type_ids",
    "visual_embeds",
    "visual_attention_mask",
    "visual_token_type_ids",
]

for epoch in range(10):
    model.train()
    total_loss = 0
    print(f"Epoch {epoch+1} | ",  end='')
    for batch in loader_train:
        batch = {k: v.to(device) for k, v in batch.items()}

        logits = model(**{k: batch[k] for k in keys2pass})
        loss = criterion(logits, batch["label"])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Train Loss: {total_loss:.4f} | ",  end='')
    # ---- VALIDATION ----
    model.eval()
    val_loss = 0
    with torch.inference_mode():
        for batch in loader_validation:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**{k: batch[k] for k in keys2pass})
            loss = criterion(logits, batch["label"])
            val_loss += loss.item()
    print(f"Val Loss: {val_loss:.4f}")

Epoch 1 | Train Loss: 41.1347 | Val Loss: 2.8829
Epoch 2 | Train Loss: 33.2356 | Val Loss: 3.1917
Epoch 3 | Train Loss: 26.9544 | Val Loss: 3.5634
Epoch 4 | Train Loss: 22.4512 | Val Loss: 3.3632
Epoch 5 | Train Loss: 17.9093 | Val Loss: 3.8379
Epoch 6 | Train Loss: 14.8730 | Val Loss: 4.5083
Epoch 7 | Train Loss: 12.5656 | Val Loss: 4.6028
Epoch 8 | Train Loss: 9.3023 | Val Loss: 4.5283
Epoch 9 | Train Loss: 7.5412 | Val Loss: 6.1034
Epoch 10 | Train Loss: 6.7044 | Val Loss: 5.5882


In [46]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

model.eval()
all_preds = []
all_probs = []
all_labels = []
test_loss = 0

with torch.inference_mode():
    for batch in loader_test:
        batch = {k: v.to(device) for k, v in batch.items()}

        logits = model(**{k: batch[k] for k in keys2pass})
        loss = criterion(logits, batch["label"])
        test_loss += loss.item()

        probs = logits.softmax(dim=1)[:, 1].cpu().numpy()
        preds = logits.argmax(dim=1).cpu().numpy()
        labels = batch["label"].cpu().numpy()

        all_probs.extend(probs)
        all_preds.extend(preds)
        all_labels.extend(labels)

acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
auc = roc_auc_score(all_labels, all_probs)

print("Test Loss:", test_loss)
print("Test Acc:", acc)
print("Test F1:", f1)
print("Test AUC:", auc)

Test Loss: 14.268611058592796
Test Acc: 0.6753333333333333
Test F1: 0.508577194752775
Test AUC: 0.7156644061583577
