In [1]:
!pip install pandas_path
!pip install fasttext
!pip install pytorch_lightning
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas_path
  Downloading pandas_path-0.3.0-py3-none-any.whl (8.4 kB)
Installing collected packages: pandas_path
Successfully installed pandas_path-0.3.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4393315 sha256=e34b174fcf297c5ae8e2a499af5b0084f4e73f679c1eb97d2452899858c6e438
  Stored in directory: /root/.cac

In [2]:
%matplotlib inline

import json
import logging
from pathlib import Path
import random
import tarfile
import tempfile
import warnings
import os
from PIL import Image

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_path  # Path style access for pandas
from tqdm import tqdm

import torch                    
import torchvision
import fasttext
import pytorch_lightning
from torch.utils.data import DataLoader, random_split
import torchvision.models as models
from transformers import BertModel
from transformers import BertTokenizer
from transformers import RobertaModel
from transformers import DistilBertModel
from transformers import RobertaTokenizer
from transformers import DistilBertTokenizer
import torch.optim as optim


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
project_directory = '/content/drive/MyDrive/HatefulMemes'
os.chdir(project_directory)

In [5]:
data_dir = Path.cwd()

train_path = "train.jsonl"
dev_path = "dev.jsonl"
test_path = "test.jsonl"

## Baseline:

This first experiment performs a mid-level concat fusion between a RESNET50 and a FCC over a fasttext embedding



*   Lr = 1e-4 Train Loss: 0.4472, Train Accuracy: 0.8631, Val Loss: 0.7825, Val Accuracy: 0.5100
*   Lr = 1e-5 Train Loss: 0.3442, Train Accuracy: 0.9691, Val Loss: 0.7883, Val Accuracy: 0.5060



In [None]:
class HatefulMemesDataset(torch.utils.data.Dataset):
    """Uses jsonl data to preprocess and serve 
    dictionary of multimodal tensors for model input.
    """

    def __init__(
        self,
        data_path,
        img_dir,
        image_transform,
        text_transform,
        balance=False,
        dev_limit=None,
        random_state=0,
    ):

        self.samples_frame = pd.read_json(
            data_path, lines=True
        )
        self.dev_limit = dev_limit
        if balance:
            neg = self.samples_frame[
                self.samples_frame.label.eq(0)
            ]
            pos = self.samples_frame[
                self.samples_frame.label.eq(1)
            ]
            self.samples_frame = pd.concat(
                [
                    neg.sample(
                        pos.shape[0], 
                        random_state=random_state
                    ), 
                    pos
                ]
            )
        if self.dev_limit:
            if self.samples_frame.shape[0] > self.dev_limit:
                self.samples_frame = self.samples_frame.sample(
                    dev_limit, random_state=random_state
                )
        self.samples_frame = self.samples_frame.reset_index(
            drop=True
        )
        self.samples_frame.img = self.samples_frame.apply(
            lambda row: (img_dir / row.img), axis=1
        )

        # https://github.com/drivendataorg/pandas-path
        for path in self.samples_frame.img:
            if not path.exists():
                raise FileNotFoundError(f'{path} doesnt seem to exist')
            if not path.is_file():
                raise TypeError(f'{path} doesnt seem to be a file')
            
        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """This method is called when you do len(instance) 
        for an instance of this class.
        """
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """This method is called when you do instance[key] 
        for an instance of this class.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = self.samples_frame.loc[idx, "id"]

        image = Image.open(
            self.samples_frame.loc[idx, "img"]
        ).convert("RGB")
        image = self.image_transform(image)

        text = torch.Tensor(
            self.text_transform.get_sentence_vector(
                self.samples_frame.loc[idx, "text"]
            )
        ).squeeze()

        if "label" in self.samples_frame.columns:
            label = torch.Tensor(
                [self.samples_frame.loc[idx, "label"]]
            ).long().squeeze()
            sample = {
                "id": img_id, 
                "image": image, 
                "text": text, 
                "label": label
            }
        else:
            sample = {
                "id": img_id, 
                "image": image, 
                "text": text
            }

        return sample

In [None]:
def _build_text_transform(train_path, fasttext_model='cbow', embedding_dim=100):
    with tempfile.NamedTemporaryFile() as ft_training_data:
        ft_path = Path(ft_training_data.name)
        with ft_path.open("w") as ft:
            training_data = [
                json.loads(line)["text"] + "/n" 
                for line in open(train_path).read().splitlines()
            ]
            for line in training_data:
                ft.write(line + "\n")
            language_transform = fasttext.train_unsupervised(
                str(ft_path),
                model=fasttext_model,
                dim=embedding_dim
            )
    return language_transform

def _build_image_transform(image_dim=224):
    image_transform = torchvision.transforms.Compose(
        [
            torchvision.transforms.Resize(
                size=(image_dim, image_dim)
            ),        
            torchvision.transforms.ToTensor(),
            # all torchvision models expect the same
            # normalization mean and std
            # https://pytorch.org/docs/stable/torchvision/models.html
            torchvision.transforms.Normalize(
                mean=(0.485, 0.456, 0.406), 
                std=(0.229, 0.224, 0.225)
            ),
        ]
    )
    return image_transform

In [None]:
text_transform = _build_text_transform('train.jsonl')
image_transform = _build_image_transform()

In [None]:
dataset = HatefulMemesDataset(
    data_path='train.jsonl',
    img_dir=Path('.'),
    image_transform=image_transform,
    text_transform=text_transform,
    balance=False,
    dev_limit=None,
    random_state=0,
)

In [None]:
class LanguageAndVisionConcat(torch.nn.Module):
    def __init__(
        self,
        num_classes,
        loss_fn,
        language_module,
        vision_module,
        language_feature_dim,
        vision_feature_dim,
        fusion_output_size,
        dropout_p,
        
    ):
        super(LanguageAndVisionConcat, self).__init__()
        self.language_module = language_module
        self.vision_module = vision_module
        self.fusion = torch.nn.Linear(
            in_features=(language_feature_dim + vision_feature_dim), 
            out_features=fusion_output_size
        )
        self.fc = torch.nn.Linear(
            in_features=fusion_output_size, 
            out_features=num_classes
        )
        self.loss_fn = loss_fn
        self.dropout = torch.nn.Dropout(dropout_p)
        
    def forward(self, text, image, label=None):
        text_features = torch.nn.functional.relu(
            self.language_module(text)
        )
        image_features = torch.nn.functional.relu(
            self.vision_module(image)
        )
        combined = torch.cat(
            [text_features, image_features], dim=1
        )

        fused = self.dropout(
            torch.nn.functional.relu(
            self.fusion(combined)
            )
        )
        logits = self.fc(fused)
        pred = torch.nn.functional.softmax(logits)
        loss = (
            self.loss_fn(pred, label) 
            if label is not None else label
        )
        return (pred, loss)

In [None]:
params = {
    "lr": 1e-5, 
    "batch_size": 32,
    "num_epochs": 10,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "balance": False,
    "dev_limit": None,
    "random_state": 0,
    "dropout": 0.2}


balance = params["balance"]
dev_limit = params["dev_limit"]
random_state = params["random_state"]
batch_size = params["batch_size"]
num_epochs = params["num_epochs"]
lr = params["lr"]
device = params["device"]
dropout = params["dropout"]
language_feature_dim = 300

training_data = HatefulMemesDataset(data_path=train_path,
    img_dir=Path('.'),
    image_transform=image_transform,
    text_transform=text_transform,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

validation_data = HatefulMemesDataset(data_path=dev_path,
    img_dir=Path('.'),
    image_transform=image_transform,
    text_transform=text_transform,
    balance = balance,
    dev_limit= dev_limit,
    random_state= random_state)

train_loader = DataLoader(training_data,
                          batch_size = batch_size,
                          shuffle= True,
                          num_workers = 2)

test_loader = DataLoader(validation_data,
                          batch_size = batch_size,
                          shuffle=True,
                          num_workers = 2)

#text_model = BertModel.from_pretrained('bert-base-uncased')
text_model = torch.nn.Linear(
                in_features= 100,
                out_features= language_feature_dim
        )
vision_model = models.resnet50(pretrained=True)

model = LanguageAndVisionConcat(
    num_classes = 2,
    loss_fn = torch.nn.CrossEntropyLoss(),
    language_module = text_model,
    vision_module = vision_model,
    language_feature_dim = language_feature_dim,
    vision_feature_dim = 1000,
    fusion_output_size = 512,
    dropout_p = 0.1
).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2, verbose=True
)

loss_fn = torch.nn.CrossEntropyLoss()

best_val_acc = 0.0
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    for idx, batch in tqdm(enumerate(train_loader), total = len(train_loader)):
        images = batch["image"].to(device)
        texts = batch["text"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs, loss = model(texts, images, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)
        train_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    train_loss /= len(train_loader.dataset)
    train_accuracy = train_correct.float() / len(train_loader.dataset)

    model.eval()

    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in test_loader:
            images = batch["image"].to(device)
            texts = batch["text"].to(device)
            labels = batch["label"].to(device)
            outputs, loss = model(texts, images, labels)
            val_loss += loss.item() * images.size(0)
            val_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    val_loss /= len(test_loader.dataset)
    val_accuracy = val_correct.float() / len(test_loader.dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
  



  pred = torch.nn.functional.softmax(logits)
  8%|▊         | 20/266 [01:23<17:01,  4.15s/it]


KeyboardInterrupt: ignored

## Experiment 1:

Let's use a BERT model to process text while keeping the same resnet50

- Lr = 1e-5 dropout = 0.2 FC_length = 250, Train Accuracy: 0.9682, Val Loss: 0.7317, Val Accuracy: 0.5700
- Lr = 1e-5 dropout = 0.2 FC_length = 50, Train Loss: 0.3419, Train Accuracy: 0.9740, Val Loss: 0.7518, Val Accuracy: 0.5400
- Lr = 1e-7 dropout = 0.2 FC_length = 50, Train Loss: 0.6517, Train Accuracy: 0.6382, Val Loss: 0.7157, Val Accuracy: 0.5040
- Lr = 1e-5 dropout = 0.2 FC_length = 50, batch_size = 64, +normalization_layer Train Loss: 0.3574, Train Accuracy: 0.9818, Val Loss: 0.7028, Val Accuracy: 0.5740

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def _build_image_transform(image_dim=224):
    image_transform = torchvision.transforms.Compose(
        [
            torchvision.transforms.Resize(
                size=(image_dim, image_dim)
            ),        
            torchvision.transforms.ToTensor(),
            # all torchvision models expect the same
            # normalization mean and std
            # https://pytorch.org/docs/stable/torchvision/models.html
            torchvision.transforms.Normalize(
                mean=(0.485, 0.456, 0.406), 
                std=(0.229, 0.224, 0.225)
            ),
        ]
    )
    return image_transform


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
text_transform = tokenizer
image_transform = _build_image_transform()

In [None]:
class HatefulMemesDataset(torch.utils.data.Dataset):
    """Uses jsonl data to preprocess and serve 
    dictionary of multimodal tensors for model input.
    """

    def __init__(
        self,
        data_path,
        img_dir,
        image_transform,
        text_transform,
        balance=False,
        dev_limit=None,
        random_state=0,
    ):

        self.samples_frame = pd.read_json(
            data_path, lines=True
        )
        self.dev_limit = dev_limit
        if balance:
            neg = self.samples_frame[
                self.samples_frame.label.eq(0)
            ]
            pos = self.samples_frame[
                self.samples_frame.label.eq(1)
            ]
            self.samples_frame = pd.concat(
                [
                    neg.sample(
                        pos.shape[0], 
                        random_state=random_state
                    ), 
                    pos
                ]
            )
        if self.dev_limit:
            if self.samples_frame.shape[0] > self.dev_limit:
                self.samples_frame = self.samples_frame.sample(
                    dev_limit, random_state=random_state
                )
        self.samples_frame = self.samples_frame.reset_index(
            drop=True
        )
        self.samples_frame.img = self.samples_frame.apply(
            lambda row: (img_dir / row.img), axis=1
        )

        # https://github.com/drivendataorg/pandas-path
        for path in self.samples_frame.img:
            if not path.exists():
                raise FileNotFoundError(f'{path} doesnt seem to exist')
            if not path.is_file():
                raise TypeError(f'{path} doesnt seem to be a file')
            
        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """This method is called when you do len(instance) 
        for an instance of this class.
        """
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """This method is called when you do instance[key] 
        for an instance of this class.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = self.samples_frame.loc[idx, "id"]

        image = Image.open(
            self.samples_frame.loc[idx, "img"]
        ).convert("RGB")
        image = self.image_transform(image)

        text = torch.Tensor(
            self.text_transform(
                self.samples_frame.loc[idx, "text"],
                max_length=20,
                padding='max_length',
                truncation=True, 
                return_tensors="pt"
            ).input_ids
        ).squeeze()

        if "label" in self.samples_frame.columns:
            label = torch.Tensor(
                [self.samples_frame.loc[idx, "label"]]
            ).long().squeeze()
            sample = {
                "id": img_id, 
                "image": image, 
                "text": text, 
                "label": label
            }
        else:
            sample = {
                "id": img_id, 
                "image": image, 
                "text": text
            }

        return sample

In [None]:
class LanguageAndVisionConcat(torch.nn.Module):
    def __init__(
        self,
        num_classes,
        loss_fn,
        language_module,
        vision_module,
        language_feature_dim,
        vision_feature_dim,
        fusion_output_size,
        dropout_p,
        
    ):
        super(LanguageAndVisionConcat, self).__init__()
        self.language_module = language_module
        self.vision_module = vision_module
        self.fusion = torch.nn.Linear(
            in_features=(language_feature_dim + vision_feature_dim), 
            out_features=fusion_output_size
        )
        self.norm = torch.nn.BatchNorm1d(fusion_output_size)
        self.fc = torch.nn.Linear(
            in_features=fusion_output_size, 
            out_features=num_classes
        )
        self.loss_fn = loss_fn
        self.dropout = torch.nn.Dropout(dropout_p)
        
    def forward(self, text, image, label=None):

        text_features = torch.nn.functional.relu(
            self.language_module(text).last_hidden_state.mean(dim=1)
        )
        image_features = torch.nn.functional.relu(
            self.vision_module(image)
        )
        combined = torch.cat(
            [text_features, image_features], dim=1
        )

        fused = self.dropout(
            torch.nn.functional.relu(
            self.fusion(combined)
            )
        )
        normalized = self.norm(fused)
        logits = self.fc(normalized)
        pred = torch.nn.functional.softmax(logits)
        loss = (
            self.loss_fn(pred, label) 
            if label is not None else label
        )
        return (pred, loss)

In [None]:
params = {
    "lr": 1e-5, 
    "batch_size": 64,
    "num_epochs": 10,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "balance": False,
    "dev_limit": None,
    "random_state": 0,
    "dropout": 0.2,
    "fusion_output_size": 50}


balance = params["balance"]
dev_limit = params["dev_limit"]
random_state = params["random_state"]
batch_size = params["batch_size"]
num_epochs = params["num_epochs"]
lr = params["lr"]
device = params["device"]
dropout = params["dropout"]
fusion_output_size = params["fusion_output_size"]
language_feature_dim = 768


training_data = HatefulMemesDataset(data_path=train_path,
    img_dir=Path('.'),
    image_transform=image_transform,
    text_transform=text_transform,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

validation_data = HatefulMemesDataset(data_path=dev_path,
    img_dir=Path('.'),
    image_transform=image_transform,
    text_transform=text_transform,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

train_loader = DataLoader(training_data,
                          batch_size = batch_size,
                          shuffle= True,
                          num_workers = 2)

test_loader = DataLoader(validation_data,
                          batch_size = batch_size,
                          shuffle=True,
                          num_workers = 2)

text_model = BertModel.from_pretrained('bert-base-uncased')
vision_model = models.resnet50(pretrained=True)

model = LanguageAndVisionConcat(
    num_classes = 2,
    loss_fn = torch.nn.CrossEntropyLoss(),
    language_module = text_model,
    vision_module = vision_model,
    language_feature_dim = language_feature_dim,
    vision_feature_dim = 1000,
    fusion_output_size = fusion_output_size,
    dropout_p = 0.1
).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2, verbose=True
)

loss_fn = torch.nn.CrossEntropyLoss()

best_val_acc = 0.0
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    for idx, batch in tqdm(enumerate(train_loader), total = len(train_loader)):
        images = batch["image"].to(device)
        texts = batch["text"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs, loss = model(texts, images, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)
        train_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    train_loss /= len(train_loader.dataset)
    train_accuracy = train_correct.float() / len(train_loader.dataset)

    model.eval()

    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in test_loader:
            images = batch["image"].to(device)
            texts = batch["text"].to(device)
            labels = batch["label"].to(device)
            outputs, loss = model(texts, images, labels)
            val_loss += loss.item() * images.size(0)
            val_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    val_loss /= len(test_loader.dataset)
    val_accuracy = val_correct.float() / len(test_loader.dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
  

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  pred = torch.nn.functional.softmax(logits)
  2%|▏         | 2/133 [00:27<30:15, 13.86s/it]


KeyboardInterrupt: ignored

## Experiment 2: Only text

hypothesis: Resnet is overfitting the model, let's use only text model

- Lr = 1e-5 dropout = 0.2 FC_length = 50, batch_size = 64, +normalization_layer Train Loss: 0.5001, Train Accuracy: 0.8171, Val Loss: 0.7392, Val Accuracy: 0.5240
- Lr = 1e-5 dropout = 0.2 FC_length = 50, batch_size = 64, -last FCL, Train Loss: 0.5098, Train Accuracy: 0.7995, Val Loss: 0.7550, Val Accuracy: 0.5340
- Lr = 1e-5 dropout = 0.2 FC_length = 50, batch_size = 64, -last FCL, +balanced, Train Loss: 0.4983, Train Accuracy: 0.8031, Val Loss: 0.7113, Val Accuracy: 0.5800

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_transform = tokenizer

In [None]:
class HatefulMemesDataset(torch.utils.data.Dataset):
    """Uses jsonl data to preprocess and serve 
    dictionary of multimodal tensors for model input.
    """

    def __init__(
        self,
        data_path,
        img_dir,
        image_transform,
        text_transform,
        balance=False,
        dev_limit=None,
        random_state=0,
    ):

        self.samples_frame = pd.read_json(
            data_path, lines=True
        )
        self.dev_limit = dev_limit
        if balance:
            neg = self.samples_frame[
                self.samples_frame.label.eq(0)
            ]
            pos = self.samples_frame[
                self.samples_frame.label.eq(1)
            ]
            self.samples_frame = pd.concat(
                [
                    neg.sample(
                        pos.shape[0], 
                        random_state=random_state
                    ), 
                    pos
                ]
            )
        if self.dev_limit:
            if self.samples_frame.shape[0] > self.dev_limit:
                self.samples_frame = self.samples_frame.sample(
                    dev_limit, random_state=random_state
                )
        self.samples_frame = self.samples_frame.reset_index(
            drop=True
        )
        self.samples_frame.img = self.samples_frame.apply(
            lambda row: (img_dir / row.img), axis=1
        )

        # https://github.com/drivendataorg/pandas-path
        for path in self.samples_frame.img:
            if not path.exists():
                raise FileNotFoundError(f'{path} doesnt seem to exist')
            if not path.is_file():
                raise TypeError(f'{path} doesnt seem to be a file')
            
        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """This method is called when you do len(instance) 
        for an instance of this class.
        """
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """This method is called when you do instance[key] 
        for an instance of this class.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = self.samples_frame.loc[idx, "id"]

        text = torch.Tensor(
            self.text_transform(
                self.samples_frame.loc[idx, "text"],
                max_length=20,
                padding='max_length',
                truncation=True, 
                return_tensors="pt"
            ).input_ids
        ).squeeze()

        if "label" in self.samples_frame.columns:
            label = torch.Tensor(
                [self.samples_frame.loc[idx, "label"]]
            ).long().squeeze()
            sample = {
                "id": img_id,
                "text": text, 
                "label": label
            }
        else:
            sample = {
                "id": img_id, 
                "image": image, 
                "text": text
            }

        return sample

In [None]:
class LanguageAndVisionConcat(torch.nn.Module):
    def __init__(
        self,
        num_classes,
        loss_fn,
        language_module,
        vision_module,
        language_feature_dim,
        vision_feature_dim,
        fusion_output_size,
        dropout_p,
        
    ):
        super(LanguageAndVisionConcat, self).__init__()
        self.language_module = language_module
        self.fusion = torch.nn.Linear(
            in_features=(language_feature_dim), 
            out_features=num_classes
        )
        #self.norm = torch.nn.BatchNorm1d(fusion_output_size)
        #self.fc = torch.nn.Linear(
        #    in_features=fusion_output_size, 
        #    out_features=num_classes
        #)
        self.loss_fn = loss_fn
        self.dropout = torch.nn.Dropout(dropout_p)
        
    def forward(self, text, image, label=None):

        text_features = torch.nn.functional.relu(
            self.language_module(text).last_hidden_state.mean(dim=1)
        )

        fused = self.dropout(
            torch.nn.functional.relu(
            self.fusion(text_features)
            )
        )
        #normalized = self.norm(fused)
        #logits = self.fc(text_features)
        pred = torch.nn.functional.softmax(fused)
        loss = (
            self.loss_fn(pred, label) 
            if label is not None else label
        )
        return (pred, loss)

In [None]:
params = {
    "lr": 1e-5, 
    "batch_size": 64,
    "num_epochs": 10,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "balance": True,
    "dev_limit": None,
    "random_state": 0,
    "dropout": 0.2,
    "fusion_output_size": 50}


balance = params["balance"]
dev_limit = params["dev_limit"]
random_state = params["random_state"]
batch_size = params["batch_size"]
num_epochs = params["num_epochs"]
lr = params["lr"]
device = params["device"]
dropout = params["dropout"]
fusion_output_size = params["fusion_output_size"]
language_feature_dim = 768


training_data = HatefulMemesDataset(data_path=train_path,
    img_dir=Path('.'),
    text_transform=text_transform,
    image_transform=None,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

validation_data = HatefulMemesDataset(data_path=dev_path,
    img_dir=Path('.'),
    text_transform=text_transform,
    image_transform=None,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

train_loader = DataLoader(training_data,
                          batch_size = batch_size,
                          shuffle= True,
                          num_workers = 2)

test_loader = DataLoader(validation_data,
                          batch_size = batch_size,
                          shuffle=True,
                          num_workers = 2)

text_model = BertModel.from_pretrained('bert-base-uncased')
vision_model = models.resnet50(pretrained=True)

model = LanguageAndVisionConcat(
    num_classes = 2,
    loss_fn = torch.nn.CrossEntropyLoss(),
    language_module = text_model,
    vision_module = None,
    language_feature_dim = language_feature_dim,
    vision_feature_dim = 1000,
    fusion_output_size = fusion_output_size,
    dropout_p = 0.1
).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2, verbose=True
)

loss_fn = torch.nn.CrossEntropyLoss()

best_val_acc = 0.0
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    for idx, batch in tqdm(enumerate(train_loader), total = len(train_loader)):
        texts = batch["text"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs, loss = model(texts, None, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * texts.size(0)
        train_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    train_loss /= len(train_loader.dataset)
    train_accuracy = train_correct.float() / len(train_loader.dataset)

    model.eval()

    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in test_loader:
            texts = batch["text"].to(device)
            labels = batch["label"].to(device)
            outputs, loss = model(texts, None, labels)
            val_loss += loss.item() * texts.size(0)
            val_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    val_loss /= len(test_loader.dataset)
    val_accuracy = val_correct.float() / len(test_loader.dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
  

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  pred = torch.nn.functional.softmax(fused)
100%|██████████| 96/96 [00:21<00:00,  4.47it/s]


Epoch 1/10, Train Loss: 0.6683, Train Accuracy: 0.5841, Val Loss: 0.6786, Val Accuracy: 0.5840


100%|██████████| 96/96 [00:21<00:00,  4.38it/s]


Epoch 2/10, Train Loss: 0.6160, Train Accuracy: 0.6690, Val Loss: 0.6881, Val Accuracy: 0.5740


100%|██████████| 96/96 [00:23<00:00,  4.11it/s]


Epoch 3/10, Train Loss: 0.5885, Train Accuracy: 0.6982, Val Loss: 0.6919, Val Accuracy: 0.5860


100%|██████████| 96/96 [00:23<00:00,  4.10it/s]


Epoch 4/10, Train Loss: 0.5660, Train Accuracy: 0.7252, Val Loss: 0.6966, Val Accuracy: 0.5780


100%|██████████| 96/96 [00:22<00:00,  4.28it/s]


Epoch 5/10, Train Loss: 0.5470, Train Accuracy: 0.7497, Val Loss: 0.7096, Val Accuracy: 0.5780


100%|██████████| 96/96 [00:22<00:00,  4.31it/s]


Epoch 6/10, Train Loss: 0.5382, Train Accuracy: 0.7559, Val Loss: 0.7067, Val Accuracy: 0.5700


100%|██████████| 96/96 [00:22<00:00,  4.23it/s]


Epoch 7/10, Train Loss: 0.5227, Train Accuracy: 0.7744, Val Loss: 0.7062, Val Accuracy: 0.5860


100%|██████████| 96/96 [00:22<00:00,  4.28it/s]


Epoch 8/10, Train Loss: 0.5102, Train Accuracy: 0.7870, Val Loss: 0.7220, Val Accuracy: 0.5720


100%|██████████| 96/96 [00:22<00:00,  4.26it/s]


Epoch 9/10, Train Loss: 0.5058, Train Accuracy: 0.7956, Val Loss: 0.7224, Val Accuracy: 0.5660


100%|██████████| 96/96 [00:22<00:00,  4.30it/s]


Epoch 10/10, Train Loss: 0.4981, Train Accuracy: 0.7990, Val Loss: 0.7284, Val Accuracy: 0.5700


# Experiment 3:

- RoBERTa instead of BERT

-  "lr": 3e-5, batch_size = 64 Train Loss: 0.6507, Train Accuracy: 0.6279, Val Loss: 0.6948, Val Accuracy: 0.5580

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
text_transform = tokenizer

In [None]:
class HatefulMemesDataset(torch.utils.data.Dataset):
    """Uses jsonl data to preprocess and serve 
    dictionary of multimodal tensors for model input.
    """

    def __init__(
        self,
        data_path,
        img_dir,
        image_transform,
        text_transform,
        balance=False,
        dev_limit=None,
        random_state=0,
    ):

        self.samples_frame = pd.read_json(
            data_path, lines=True
        )
        self.dev_limit = dev_limit
        if balance:
            neg = self.samples_frame[
                self.samples_frame.label.eq(0)
            ]
            pos = self.samples_frame[
                self.samples_frame.label.eq(1)
            ]
            self.samples_frame = pd.concat(
                [
                    neg.sample(
                        pos.shape[0], 
                        random_state=random_state
                    ), 
                    pos
                ]
            )
        if self.dev_limit:
            if self.samples_frame.shape[0] > self.dev_limit:
                self.samples_frame = self.samples_frame.sample(
                    dev_limit, random_state=random_state
                )
        self.samples_frame = self.samples_frame.reset_index(
            drop=True
        )
        self.samples_frame.img = self.samples_frame.apply(
            lambda row: (img_dir / row.img), axis=1
        )

        # https://github.com/drivendataorg/pandas-path
        for path in self.samples_frame.img:
            if not path.exists():
                raise FileNotFoundError(f'{path} doesnt seem to exist')
            if not path.is_file():
                raise TypeError(f'{path} doesnt seem to be a file')
            
        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """This method is called when you do len(instance) 
        for an instance of this class.
        """
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """This method is called when you do instance[key] 
        for an instance of this class.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = self.samples_frame.loc[idx, "id"]

        text = torch.Tensor(
            self.text_transform(
                self.samples_frame.loc[idx, "text"],
                max_length=20,
                padding='max_length',
                truncation=True, 
                return_tensors="pt"
            ).input_ids
        ).squeeze()

        if "label" in self.samples_frame.columns:
            label = torch.Tensor(
                [self.samples_frame.loc[idx, "label"]]
            ).long().squeeze()
            sample = {
                "id": img_id,
                "text": text, 
                "label": label
            }
        else:
            sample = {
                "id": img_id, 
                "image": image, 
                "text": text
            }

        return sample

In [None]:
class LanguageAndVisionConcat(torch.nn.Module):
    def __init__(
        self,
        num_classes,
        loss_fn,
        language_module,
        vision_module,
        language_feature_dim,
        vision_feature_dim,
        fusion_output_size,
        dropout_p,
        
    ):
        super(LanguageAndVisionConcat, self).__init__()
        self.language_module = language_module
        self.fusion = torch.nn.Linear(
            in_features=(language_feature_dim), 
            out_features=num_classes
        )
        #self.norm = torch.nn.BatchNorm1d(fusion_output_size)
        #self.fc = torch.nn.Linear(
        #    in_features=fusion_output_size, 
        #    out_features=num_classes
        #)
        self.loss_fn = loss_fn
        self.dropout = torch.nn.Dropout(dropout_p)
        
    def forward(self, text, image, label=None):

        text_features = torch.nn.functional.relu(
            self.language_module(text).last_hidden_state.mean(dim=1)
        )

        fused = self.dropout(
            torch.nn.functional.relu(
            self.fusion(text_features)
            )
        )
        #normalized = self.norm(fused)
        #logits = self.fc(text_features)
        pred = torch.nn.functional.softmax(fused)
        loss = (
            self.loss_fn(pred, label) 
            if label is not None else label
        )
        return (pred, loss)

In [None]:
params = {
    "lr": 1e-5, 
    "batch_size": 64,
    "num_epochs": 10,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "balance": True,
    "dev_limit": None,
    "random_state": 0,
    "dropout": 0.2,
    "fusion_output_size": 50}


balance = params["balance"]
dev_limit = params["dev_limit"]
random_state = params["random_state"]
batch_size = params["batch_size"]
num_epochs = params["num_epochs"]
lr = params["lr"]
device = params["device"]
dropout = params["dropout"]
fusion_output_size = params["fusion_output_size"]
language_feature_dim = 768


training_data = HatefulMemesDataset(data_path=train_path,
    img_dir=Path('.'),
    text_transform=text_transform,
    image_transform=None,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

validation_data = HatefulMemesDataset(data_path=dev_path,
    img_dir=Path('.'),
    text_transform=text_transform,
    image_transform=None,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

train_loader = DataLoader(training_data,
                          batch_size = batch_size,
                          shuffle= True,
                          num_workers = 2)

test_loader = DataLoader(validation_data,
                          batch_size = batch_size,
                          shuffle=True,
                          num_workers = 2)

text_model = RobertaModel.from_pretrained('roberta-base')
vision_model = models.resnet50(pretrained=True)

model = LanguageAndVisionConcat(
    num_classes = 2,
    loss_fn = torch.nn.CrossEntropyLoss(),
    language_module = text_model,
    vision_module = None,
    language_feature_dim = language_feature_dim,
    vision_feature_dim = 1000,
    fusion_output_size = fusion_output_size,
    dropout_p = dropout
).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2, verbose=True
)

loss_fn = torch.nn.CrossEntropyLoss()

best_val_acc = 0.0
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    for idx, batch in tqdm(enumerate(train_loader), total = len(train_loader)):
        texts = batch["text"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs, loss = model(texts, None, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * texts.size(0)
        train_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    train_loss /= len(train_loader.dataset)
    train_accuracy = train_correct.float() / len(train_loader.dataset)

    model.eval()

    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in test_loader:
            texts = batch["text"].to(device)
            labels = batch["label"].to(device)
            outputs, loss = model(texts, None, labels)
            val_loss += loss.item() * texts.size(0)
            val_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    val_loss /= len(test_loader.dataset)
    val_accuracy = val_correct.float() / len(test_loader.dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
  

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  pred = torch.nn.functional.softmax(fused)
100%|██████████| 96/96 [00:21<00:00,  4.41it/s]


Epoch 1/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.42it/s]


Epoch 2/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.43it/s]


Epoch 3/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.48it/s]


Epoch 4/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.44it/s]


Epoch 5/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.42it/s]


Epoch 6/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.46it/s]


Epoch 7/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.40it/s]


Epoch 8/10, Train Loss: 0.6931, Train Accuracy: 0.4998, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.46it/s]


Epoch 9/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


100%|██████████| 96/96 [00:21<00:00,  4.42it/s]


Epoch 10/10, Train Loss: 0.6931, Train Accuracy: 0.5000, Val Loss: 0.6931, Val Accuracy: 0.5000


## Experiment 4: 

Only RoBERTa with L2 Regularization

- Train Loss: 53.9597, Train Accuracy: 0.7385, Val Loss: 0.7145, Val Accuracy: 0.5740
- Train Accuracy: 0.5534, Val Loss: 0.6752, Val Accuracy: 0.5960

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
text_transform = tokenizer

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [7]:
class HatefulMemesDataset(torch.utils.data.Dataset):
    """Uses jsonl data to preprocess and serve 
    dictionary of multimodal tensors for model input.
    """

    def __init__(
        self,
        data_path,
        img_dir,
        image_transform,
        text_transform,
        balance=False,
        dev_limit=None,
        random_state=0,
    ):

        self.samples_frame = pd.read_json(
            data_path, lines=True
        )
        self.dev_limit = dev_limit
        if balance:
            neg = self.samples_frame[
                self.samples_frame.label.eq(0)
            ]
            pos = self.samples_frame[
                self.samples_frame.label.eq(1)
            ]
            self.samples_frame = pd.concat(
                [
                    neg.sample(
                        pos.shape[0], 
                        random_state=random_state
                    ), 
                    pos
                ]
            )
        if self.dev_limit:
            if self.samples_frame.shape[0] > self.dev_limit:
                self.samples_frame = self.samples_frame.sample(
                    dev_limit, random_state=random_state
                )
        self.samples_frame = self.samples_frame.reset_index(
            drop=True
        )
        self.samples_frame.img = self.samples_frame.apply(
            lambda row: (img_dir / row.img), axis=1
        )

        # https://github.com/drivendataorg/pandas-path
        for path in self.samples_frame.img:
            if not path.exists():
                raise FileNotFoundError(f'{path} doesnt seem to exist')
            if not path.is_file():
                raise TypeError(f'{path} doesnt seem to be a file')
            
        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """This method is called when you do len(instance) 
        for an instance of this class.
        """
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """This method is called when you do instance[key] 
        for an instance of this class.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = self.samples_frame.loc[idx, "id"]

        text = torch.Tensor(
            self.text_transform(
                self.samples_frame.loc[idx, "text"],
                max_length=20,
                padding='max_length',
                truncation=True, 
                return_tensors="pt"
            ).input_ids
        ).squeeze()

        if "label" in self.samples_frame.columns:
            label = torch.Tensor(
                [self.samples_frame.loc[idx, "label"]]
            ).long().squeeze()
            sample = {
                "id": img_id,
                "text": text, 
                "label": label
            }
        else:
            sample = {
                "id": img_id, 
                "image": image, 
                "text": text
            }

        return sample

In [8]:
class LanguageAndVisionConcat(torch.nn.Module):
    def __init__(
        self,
        num_classes,
        loss_fn,
        language_module,
        vision_module,
        language_feature_dim,
        vision_feature_dim,
        fusion_output_size,
        dropout_p,
        
    ):
        super(LanguageAndVisionConcat, self).__init__()
        self.language_module = language_module
        self.fusion = torch.nn.Linear(
            in_features=(language_feature_dim), 
            out_features=num_classes
        )
        #self.norm = torch.nn.BatchNorm1d(fusion_output_size)
        #self.fc = torch.nn.Linear(
        #    in_features=fusion_output_size, 
        #    out_features=num_classes
        #)
        self.loss_fn = loss_fn
        self.dropout = torch.nn.Dropout(dropout_p)
        
    def forward(self, text, image, label=None):

        text_features = torch.nn.functional.relu(
            self.language_module(text).last_hidden_state.mean(dim=1)
        )

        fused = self.dropout(
            torch.nn.functional.relu(
            self.fusion(text_features)
            )
        )
        #normalized = self.norm(fused)
        #logits = self.fc(text_features)
        pred = torch.nn.functional.softmax(fused)
        loss = (
            self.loss_fn(pred, label) 
            if label is not None else label
        )
        return (pred, loss)

In [18]:
params = {
    "lr": 1e-5, 
    "batch_size": 64,
    "num_epochs": 10,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "balance": True,
    "dev_limit": None,
    "random_state": 0,
    "dropout": 0.2,
    "fusion_output_size": 50}


balance = params["balance"]
dev_limit = params["dev_limit"]
random_state = params["random_state"]
batch_size = params["batch_size"]
num_epochs = params["num_epochs"]
lr = params["lr"]
device = params["device"]
dropout = params["dropout"]
fusion_output_size = params["fusion_output_size"]
language_feature_dim = 768
l2_lambda = 0.00001


training_data = HatefulMemesDataset(data_path=train_path,
    img_dir=Path('.'),
    text_transform=text_transform,
    image_transform=None,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

validation_data = HatefulMemesDataset(data_path=dev_path,
    img_dir=Path('.'),
    text_transform=text_transform,
    image_transform=None,
    balance= balance,
    dev_limit= dev_limit,
    random_state= random_state)

train_loader = DataLoader(training_data,
                          batch_size = batch_size,
                          shuffle= True,
                          num_workers = 2)

test_loader = DataLoader(validation_data,
                          batch_size = batch_size,
                          shuffle=True,
                          num_workers = 2)

text_model = RobertaModel.from_pretrained('roberta-base')
vision_model = models.resnet50(pretrained=True)

model = LanguageAndVisionConcat(
    num_classes = 2,
    loss_fn = torch.nn.CrossEntropyLoss(),
    language_module = text_model,
    vision_module = None,
    language_feature_dim = language_feature_dim,
    vision_feature_dim = 1000,
    fusion_output_size = fusion_output_size,
    dropout_p = dropout
).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2, verbose=True
)

loss_fn = torch.nn.CrossEntropyLoss()

best_val_acc = 0.0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    for idx, batch in tqdm(enumerate(train_loader), total = len(train_loader)):
        texts = batch["text"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs, loss = model(texts, None, labels)
        l2_reg = 0
        for param in model.parameters():
            l2_reg += torch.norm(param)
        loss += l2_lambda * l2_reg
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * texts.size(0)
        train_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    train_loss /= len(train_loader.dataset)
    train_accuracy = train_correct.float() / len(train_loader.dataset)

    model.eval()

    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for batch in test_loader:
            texts = batch["text"].to(device)
            labels = batch["label"].to(device)
            outputs, loss = model(texts, None, labels)
            val_loss += loss.item() * texts.size(0)
            val_correct += torch.sum(torch.argmax(outputs, dim=1) == labels)

    val_loss /= len(test_loader.dataset)
    val_accuracy = val_correct.float() / len(test_loader.dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
  

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  pred = torch.nn.functional.softmax(fused)
100%|██████████| 96/96 [00:26<00:00,  3.64it/s]


Epoch 1/10, Train Loss: 0.7318, Train Accuracy: 0.5652, Val Loss: 0.7093, Val Accuracy: 0.5100


100%|██████████| 96/96 [00:26<00:00,  3.61it/s]


Epoch 2/10, Train Loss: 0.6950, Train Accuracy: 0.6272, Val Loss: 0.6881, Val Accuracy: 0.5780


100%|██████████| 96/96 [00:25<00:00,  3.70it/s]


Epoch 3/10, Train Loss: 0.6751, Train Accuracy: 0.6533, Val Loss: 0.6888, Val Accuracy: 0.5740


100%|██████████| 96/96 [00:26<00:00,  3.67it/s]


Epoch 4/10, Train Loss: 0.6611, Train Accuracy: 0.6702, Val Loss: 0.7019, Val Accuracy: 0.5620


100%|██████████| 96/96 [00:26<00:00,  3.65it/s]


Epoch 5/10, Train Loss: 0.6480, Train Accuracy: 0.6844, Val Loss: 0.6935, Val Accuracy: 0.5700


100%|██████████| 96/96 [00:26<00:00,  3.68it/s]


Epoch 6/10, Train Loss: 0.6399, Train Accuracy: 0.6949, Val Loss: 0.6754, Val Accuracy: 0.5960


100%|██████████| 96/96 [00:25<00:00,  3.69it/s]


Epoch 7/10, Train Loss: 0.6349, Train Accuracy: 0.6987, Val Loss: 0.6926, Val Accuracy: 0.5720


100%|██████████| 96/96 [00:25<00:00,  3.70it/s]


Epoch 8/10, Train Loss: 0.6271, Train Accuracy: 0.7131, Val Loss: 0.7161, Val Accuracy: 0.5680


100%|██████████| 96/96 [00:26<00:00,  3.68it/s]


Epoch 9/10, Train Loss: 0.6173, Train Accuracy: 0.7125, Val Loss: 0.7009, Val Accuracy: 0.5700


100%|██████████| 96/96 [00:26<00:00,  3.65it/s]


Epoch 10/10, Train Loss: 0.6057, Train Accuracy: 0.7370, Val Loss: 0.7003, Val Accuracy: 0.5740
