In [1]:
import json
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision.models import resnet50
from torchvision import models
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

import torch.nn as nn
import torch.optim as optim


In [2]:
torch.cuda.empty_cache()

In [3]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)

class HatefulMemeDataset(Dataset):
    def __init__(self, json_file, tokenizer=None):
        self.tokenizer = tokenizer
        if tokenizer is None:
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-uncased', do_lower_case=True)

        with open(json_file, "r") as f:
            self.data = [json.loads(line) for line in f]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # img_path = os.path.join(self.img_dir, self.data[index]["img"])
        # image = Image.open(img_path).convert("RGB")

        # if self.transform is not None:
        #     image = self.transform(image)
        text = self.data[index]["text"]
        inputs = self.tokenizer.encode_plus(
            text,                      # Input sentence
            
            max_length=140,                # Pad or truncate all sentences
            return_tensors='pt',           # Return PyTorch tensors
            padding='max_length',
            truncation=True,
        )

        label = self.data[index]["label"]

        return inputs['input_ids'][0], inputs['attention_mask'][0], label


In [4]:
# Load the datasets
train_dataset = HatefulMemeDataset(
    json_file="data/train.jsonl",  tokenizer=tokenizer)
val_dataset = HatefulMemeDataset(
    json_file="data/dev_seen.jsonl",  tokenizer=tokenizer)
valu_dataset = HatefulMemeDataset(
    json_file="data/dev_unseen.jsonl",  tokenizer=tokenizer)
testA_dataset = HatefulMemeDataset(
    json_file="data/test_seen.jsonl", tokenizer=tokenizer)
testB_dataset = HatefulMemeDataset(
    json_file="data/test_unseen.jsonl", tokenizer=tokenizer)

# Define the dataloaders
train_loader = DataLoader(train_dataset, batch_size=32,
                          shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32,
                        shuffle=False, num_workers=4)
valu_loader = DataLoader(valu_dataset, batch_size=32,
                        shuffle=False, num_workers=4)
testA_loader = DataLoader(testA_dataset, batch_size=32,
                         shuffle=False, num_workers=4)
testB_loader = DataLoader(testB_dataset, batch_size=32,
                          shuffle=False, num_workers=4)


In [5]:
for inputs, masks, labels in train_loader:
    print(inputs.shape)
    print(masks.shape)
    print(labels.shape)
    break

torch.Size([32, 140])
torch.Size([32, 140])
torch.Size([32])


In [8]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2)
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_batch_size = 32
val_batch_size = 32
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_acc = 0
    num_train_steps = 0
    for inputs, attention_masks, labels in train_loader:
        optimizer.zero_grad()
        inputs, attention_masks, labels = inputs.to(device), attention_masks.to(device), labels.to(device)

        
        outputs = model(
            inputs, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        train_acc += (logits.argmax(1) == labels).sum().item()
        num_train_steps += 1

    # Evaluate the model on the validation set after each epoch
    model.eval()
    val_loss = 0
    val_acc = 0
    num_val_steps = 0

    with torch.no_grad():
        for inputs, attention_masks, labels in val_loader:
            inputs, attention_masks, labels = inputs.to(
                device), attention_masks.to(device), labels.to(device)

            outputs = model(inputs, attention_mask=attention_masks, labels=labels)
            loss = outputs[0]
            logits = outputs[1]

            val_loss += loss.item()
            val_acc += (logits.argmax(1) == labels).sum().item()
            num_val_steps += 1
        train_loss = train_loss / num_train_steps
        train_acc = train_acc / len(train_dataset)
        val_loss = val_loss / num_val_steps
        val_acc = val_acc / len(val_dataset)

        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('Train loss: {:.4f}, Train accuracy: {:.4f}'.format(
            train_loss, train_acc))
        print('Val loss: {:.4f}, Val accuracy: {:.4f}'.format(val_loss, val_acc))
        
    

    


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/10
Train loss: 0.7134, Train accuracy: 0.4320
Val loss: 0.6996, Val accuracy: 0.4860
Epoch 2/10
Train loss: 0.7137, Train accuracy: 0.4264
Val loss: 0.6996, Val accuracy: 0.4860
Epoch 3/10
Train loss: 0.7135, Train accuracy: 0.4334
Val loss: 0.6996, Val accuracy: 0.4860
Epoch 4/10
Train loss: 0.7137, Train accuracy: 0.4356
Val loss: 0.6996, Val accuracy: 0.4860


KeyboardInterrupt: 

In [7]:
model.eval()
test_loss = 0
test_acc = 0
num_test_steps = 0

with torch.no_grad():
    for inputs, attention_masks, labels in testA_loader:
        inputs, attention_masks, labels = inputs.to(
            device), attention_masks.to(device), labels.to(device)

        outputs = model(
            inputs, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        logits = outputs[1]

        test_loss += loss.item()
        test_acc += (logits.argmax(1) == labels).sum().item()
        num_test_steps += 1
        
    test_loss = test_loss / num_test_steps
    test_acc = test_acc / len(testA_dataset)

    print('Test loss: {:.4f}, Test accuracy: {:.4f}'.format(test_loss, test_acc))


Test loss: 0.6985, Test accuracy: 0.4990
