In [6]:
import jsonlines

def _read_jsonl(path):
    """Return every record of the JSON Lines file at ``path`` as a list."""
    with jsonlines.open(path) as reader:
        return list(reader)


# One list of records per split; the file names are the ones shipped under data/.
train_data = _read_jsonl('data/train.jsonl')
val_data = _read_jsonl('data/dev_seen.jsonl')
test_data = _read_jsonl('data/test_seen.jsonl')


In [7]:
import torch
# Ask PyTorch to release cached GPU memory it is not currently using
# (presumably left over from an earlier run in this kernel -- confirm intent).
torch.cuda.empty_cache()

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training hyperparameters shared by the data loaders, optimizer and epoch loop.
batch_size = 32
learning_rate = 1e-4
num_epochs = 10

# Define the dataset class


class MemesDataset(Dataset):
    """Dataset pairing each image with its tokenized text and its label.

    Every record in ``data`` must provide the keys ``'img'`` (image path
    relative to ``img_dir``), ``'text'`` and ``'label'``.

    Args:
        data: Indexable, sized collection of records (e.g. a list of dicts).
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        image_transform: Callable applied to the RGB PIL image.
        img_dir: Directory joined in front of each record's ``'img'`` path.
        max_length: Length every token sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, image_transform, img_dir="data/", max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.img_dir = img_dir
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors.

        Returns:
            dict with keys ``'image'`` (output of ``image_transform``),
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        """
        item = self.data[idx]
        img_path = os.path.join(self.img_dir, item['img'])
        # The context manager closes the underlying file deterministically;
        # convert() hands back an independent image that outlives the handle.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        image = self.image_transform(image)

        tokenized_text = self.tokenizer.encode_plus(
            item['text'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        input_ids = torch.tensor(tokenized_text['input_ids'])
        attention_mask = torch.tensor(tokenized_text['attention_mask'])
        # Class-index targets for CrossEntropyLoss must be integer (long) tensors.
        label = torch.tensor(item['label'], dtype=torch.long)
        return {'image': image, 'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

# Define the image encoder


class ImageEncoder(nn.Module):
    """ResNet-50 backbone whose final layer is replaced by a linear projection.

    Args:
        embedding_dim: Width of the image embedding produced by ``forward``.
        pretrained: Load ImageNet weights into the backbone when True.
    """

    def __init__(self, embedding_dim=1024, pretrained=True):
        super().__init__()
        self.cnn = self._build_backbone(pretrained)
        # Swap the stock classification head for a projection to embedding_dim.
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, embedding_dim)

    @staticmethod
    def _build_backbone(pretrained):
        """Create ResNet-50 via the ``weights=`` API, or ``pretrained=`` on old torchvision."""
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is None:
            # torchvision releases that predate the weights enum.
            return models.resnet50(pretrained=pretrained)
        # IMAGENET1K_V1 is the checkpoint the legacy pretrained=True flag selects.
        return models.resnet50(weights=weights_enum.IMAGENET1K_V1 if pretrained else None)

    def forward(self, x):
        """Return the ReLU-activated image embedding for a batch of images ``x``."""
        return nn.functional.relu(self.cnn(x))

# Define the text encoder


class TextEncoder(nn.Module):
    """Wraps the pretrained ``bert-base-uncased`` model as a sentence encoder."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask):
        """Return the second element of BERT's output (its pooled output) for the batch."""
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return bert_outputs[1]

# Define the image preprocessing, datasets and data loaders
# (the multimodal classifier itself is defined in the next cell)




image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Per-channel statistics conventionally used with torchvision's ImageNet-pretrained models.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
# Load the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _as_memes_dataset(records):
    """Wrap raw records in a MemesDataset; return an existing Dataset unchanged.

    This cell rebinds train_data/val_data/test_data from raw record lists to
    datasets. Without this guard, re-running the cell would wrap a dataset in
    another dataset, and indexing would fail because samples have no 'img' key.
    """
    if isinstance(records, Dataset):
        return records
    return MemesDataset(records, tokenizer, image_transform)


train_data = _as_memes_dataset(train_data)
val_data = _as_memes_dataset(val_data)
test_data = _as_memes_dataset(test_data)

# Only the training split is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)




In [9]:
class MultimodalClassifier(nn.Module):
    """Classifier over concatenated image and text embeddings.

    Args:
        hidden_dim: Width of the hidden fully connected layer.
        num_classes: Number of output logits.
        dropout_p: Dropout probability applied after the hidden layer.
    """

    def __init__(self, hidden_dim=512, num_classes=2, dropout_p=0.2):
        super().__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        # Derive the fused width from the encoders instead of hard-coding their
        # sum, so the head stays correct if either encoder's output size changes.
        image_dim = self.image_encoder.cnn.fc.out_features
        text_dim = self.text_encoder.bert.config.hidden_size
        self.fc1 = nn.Linear(image_dim + text_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, image, input_ids, attention_mask):
        """Return unnormalized class logits, one row per sample in the batch."""
        image_embedding = self.image_encoder(image)
        text_embedding = self.text_encoder(input_ids, attention_mask)
        multimodal_embedding = torch.cat(
            (image_embedding, text_embedding), dim=1)
        x = self.fc1(multimodal_embedding)
        x = nn.functional.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

model = MultimodalClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one it is put in evaluation mode and gradient tracking
    is disabled. Both metrics are averaged over the samples actually processed,
    so they do not depend on any module-level dataset variable.
    """
    is_training = optimizer is not None
    model.train(is_training)
    loss_sum = 0.0
    num_correct = 0
    num_total = 0
    with torch.set_grad_enabled(is_training):
        for batch in loader:
            image = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            label = batch['label'].to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(image, input_ids, attention_mask)
            loss = criterion(outputs, label)
            if is_training:
                loss.backward()
                optimizer.step()

            # criterion returns the batch mean; re-weight by batch size so the
            # last (possibly smaller) batch is not over-counted.
            batch_count = label.size(0)
            loss_sum += loss.item() * batch_count
            num_correct += (outputs.argmax(dim=1) == label).sum().item()
            num_total += batch_count

    num_seen = max(num_total, 1)  # an empty loader yields 0.0 rather than dividing by zero
    return loss_sum / num_seen, num_correct / num_seen


for epoch in range(num_epochs):
    epoch_loss, _ = _run_epoch(model, train_loader, criterion, device, optimizer)
    print('Epoch [{}/{}], Train Loss: {:.4f}'.format(epoch +
        1, num_epochs, epoch_loss))

    # Evaluate the model on the validation set
    val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)
    print('Epoch [{}/{}], Val Loss: {:.4f}, Val Acc: {:.4f}'.format(epoch +
        1, num_epochs, val_loss, val_acc))
        
    
        


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch [1/10], Train Loss: 0.5581
Epoch [1/10], Val Loss: 0.7545, Val Acc: 0.5620
Epoch [2/10], Train Loss: 0.4602
Epoch [2/10], Val Loss: 1.0389, Val Acc: 0.5820
Epoch [3/10], Train Loss: 0.3812
Epoch [3/10], Val Loss: 0.8717, Val Acc: 0.5700
Epoch [4/10], Train Loss: 0.3114
Epoch [4/10], Val Loss: 1.2448, Val Acc: 0.5960
Epoch [5/10], Train Loss: 0.2129
Epoch [5/10], Val Loss: 1.5333, Val Acc: 0.5920
Epoch [6/10], Train Loss: 0.1424
Epoch [6/10], Val Loss: 1.4793, Val Acc: 0.5880
Epoch [7/10], Train Loss: 0.1159
Epoch [7/10], Val Loss: 1.8777, Val Acc: 0.5920
Epoch [8/10], Train Loss: 0.0839
Epoch [8/10], Val Loss: 2.2402, Val Acc: 0.5600
Epoch [9/10], Train Loss: 0.0754
Epoch [9/10], Val Loss: 2.1522, Val Acc: 0.5920
Epoch [10/10], Train Loss: 0.0848
Epoch [10/10], Val Loss: 2.2378, Val Acc: 0.5700
