# Multimodal Sentiment Analysis

In [1]:
import copy
import gc
import os
import random
from collections import defaultdict
from typing import Dict, Optional, Tuple

import albumentations as A
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from albumentations.pytorch import ToTensorV2
from colorama import Back, Fore, Style
from PIL import Image
from sklearn.model_selection import StratifiedKFold
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torchmetrics import AUROC, Accuracy, F1Score, Precision, Recall
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

import warnings
# Silences every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation/UserWarnings from torch, torchmetrics
# and transformers — consider narrowing the filter while debugging.
warnings.simplefilter('ignore')

# Terminal colour helpers used by run_training when printing "improved" messages.
c_ = Fore.GREEN
sr_ = Style.RESET_ALL

%load_ext watermark
%watermark -v -a "aneesh-aparajit"

Author: aneesh-aparajit

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.9.0



# Config

In [42]:
class Config:
    """Central, class-level hyper-parameter store read by every cell below.

    Attributes are accessed directly on the class (``Config.train_bs``); the
    class is never instantiated. The non-dunder attributes are also forwarded
    to ``wandb.init(config=...)`` by the cross-validation loop.
    """

    seed = 101  # passed to set_seed()
    debug = False  # debug=False means full training; not read by the code visible in this notebook
    exp_name = "vit/sbert"
    model_name = "vit-sbert-multimodal"
    # Only used to build the wandb run name/group.
    backbone = "google/vit-base-patch16-224+sentence-transformers/all-mpnet-base-v2"
    # Hugging Face id used for BOTH the tokenizer (MemotionDataset) and the text model (TextEncoder).
    tokenizer = "sentence-transformers/all-mpnet-base-v2"
    # Hugging Face id of the vision backbone loaded by ImageEncoder.
    image_encoder = "google/vit-base-patch16-224"
    train_bs = 24  # training DataLoader batch size
    valid_bs = 48  # validation DataLoader batch size
    img_size = [224, 224]  # [height, width] passed to A.Resize
    max_len = 128  # tokenizer max_length (pad/truncate target)
    epochs = 50  # epochs per fold in run_training
    competition = "memotions-7k"

    # Optimizers
    optimizer     = 'Adam'  # one of: Adadelta, Adagrad, Adam, RMSProp (see get_optimizer)
    learning_rate = 3e-4
    rho           = 0.9  # Adadelta only
    eps           = 1e-6  # Adadelta, Adam, RMSProp
    lr_decay      = 0  # Adagrad only
    betas         = (0.9, 0.999)  # Adam only
    momentum      = 0  # RMSProp only
    alpha         = 0.99  # RMSProp only

    # Scheduler
    scheduler     = 'CosineAnnealingLR'  # see get_scheduler for the accepted names; None disables scheduling
    min_lr        = 1e-6  # eta_min / min_lr floor for the schedulers
    # Counted in scheduler steps; train_one_epoch steps the scheduler once per batch.
    # NOTE(review): 30000 looks like an assumed sample count — confirm against the real train-set size.
    T_max         = int(30000/train_bs*epochs)+50
    T_0           = 25  # CosineAnnealingWarmRestarts only
    warmup_epochs = 0  # not read by the code visible in this notebook
    weight_decay  = 1e-6  # applied by the Adagrad and RMSProp branches of get_optimizer only

    # Config
    # NOTE(review): gradient accumulation is not implemented in train_one_epoch as shown; this value is unused there.
    n_accumulate  = max(1, 32//train_bs)
    num_folds     = 5  # number of folds iterated by the training loop
    num_classes   = 3  # passed to the torchmetrics metrics

    # Everything (model and batches) is moved to this device.
    device        = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reproducibility

In [3]:
def set_seed(seed: int = 42):
    """Seed Python's, NumPy's and PyTorch's global RNGs with the same value.

    Also exports ``PYTHONHASHSEED`` into the process environment and prints a
    short confirmation banner.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)
    print(">>> SEEDED <<<")

# Seed the global RNGs once, before any shuffling or model construction below.
set_seed(Config.seed)

>>> SEEDED <<<


# Utils

## Get Optimizer

In [4]:
def get_optimizer(model: nn.Module):
    """
    Build the optimizer named by ``Config.optimizer`` for ``model``'s parameters.

    Supported names are "Adadelta", "Adagrad", "Adam" and "RMSProp"; the
    hyper-parameters of each are read from ``Config``.

    Raises:
        NotImplementedError: if ``Config.optimizer`` is any other value.
    """
    # NOTE(review): only the Adagrad and RMSProp branches forward
    # Config.weight_decay; Adam and Adadelta run without weight decay.
    if Config.optimizer == "Adadelta":
        return optim.Adadelta(
            model.parameters(),
            lr=Config.learning_rate,
            rho=Config.rho,
            eps=Config.eps,
        )

    if Config.optimizer == "Adagrad":
        return optim.Adagrad(
            model.parameters(),
            lr=Config.learning_rate,
            lr_decay=Config.lr_decay,
            weight_decay=Config.weight_decay,
        )

    if Config.optimizer == "Adam":
        return optim.Adam(
            model.parameters(),
            lr=Config.learning_rate,
            betas=Config.betas,
            eps=Config.eps,
        )

    if Config.optimizer == "RMSProp":
        return optim.RMSprop(
            model.parameters(),
            lr=Config.learning_rate,
            alpha=Config.alpha,
            eps=Config.eps,
            weight_decay=Config.weight_decay,
            momentum=Config.momentum,
        )

    raise NotImplementedError(
        f"The optimizer {Config.optimizer} has not been implemented."
    )

## Get Scheduler

In [5]:
def get_scheduler(optimizer: optim.Optimizer):
    """
    Build the learning-rate scheduler named by ``Config.scheduler``.
        - Extracted from Awsaf's Kaggle.

    Args:
        optimizer: The optimizer whose learning rate will be scheduled.

    Returns:
        The scheduler instance, or ``None`` when ``Config.scheduler`` is ``None``.

    Raises:
        NotImplementedError: if ``Config.scheduler`` is an unknown name.
    """
    if Config.scheduler == "CosineAnnealingLR":
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer=optimizer, T_max=Config.T_max, eta_min=Config.min_lr
        )
    elif Config.scheduler == "CosineAnnealingWarmRestarts":
        # Config has no `eta_min` attribute; the LR floor is stored as `min_lr`.
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer=optimizer, T_0=Config.T_0, eta_min=Config.min_lr
        )
    elif Config.scheduler == "ReduceLROnPlateau":
        # NOTE(review): this scheduler's step() needs a metric argument, but the
        # training loop calls scheduler.step() without one.
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer,
            mode="min",
            factor=0.1,
            patience=10,
            threshold=0.0001,
            min_lr=Config.min_lr,
        )
    elif Config.scheduler == "ExponentialLR":
        scheduler = lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.85)
    elif Config.scheduler is None:
        scheduler = None
    else:
        raise NotImplementedError(
            "The Scheduler you have asked has not been implemented"
        )
    return scheduler

# Data

## Create Folds

In [6]:
def create_folds():
    """Shuffle the raw label file, assign stratified folds and write ``folds.csv``.

    Reads ``../memotion_dataset_7k/labels.csv``, derives a 3-class ``label``
    column from ``offensive`` (``hateful_offensive`` is merged into
    ``very_offensive``), assigns each row a ``kfold`` index with
    ``StratifiedKFold`` and saves the result next to the input file.

    The shuffle is seeded with ``Config.seed`` and the number of splits comes
    from ``Config.num_folds`` so the file always matches the training loop.

    Raises:
        ValueError: if ``offensive`` contains a value with no integer mapping.
    """
    label_to_id = {
        'not_offensive': 0,
        'slight': 1,
        'very_offensive': 2
    }

    df = pd.read_csv('../memotion_dataset_7k/labels.csv')
    df = df.drop('Unnamed: 0', axis=1)
    # Explicit random_state: the split no longer depends on how much global
    # NumPy randomness was consumed before this call.
    df = df.sample(frac=1, random_state=Config.seed).reset_index(drop=True)
    df['label'] = df['offensive']
    df['label'] = np.where(df['label'] == 'hateful_offensive', 'very_offensive', df['label'])

    # Fail loudly instead of letting .map() turn unknown classes into NaN labels.
    unknown = set(df['label'].unique()) - set(label_to_id)
    if unknown:
        raise ValueError(f"Unexpected values in 'offensive' column: {sorted(map(str, unknown))}")

    skf = StratifiedKFold(n_splits=Config.num_folds)

    df['kfold'] = -1
    for fold, (_, valid_idx) in enumerate(skf.split(X=df, y=df['label'])):
        df.loc[valid_idx, 'kfold'] = fold

    df['label'] = df['label'].map(label_to_id)

    df.to_csv('../memotion_dataset_7k/folds.csv', index=False)

## Sample Images
![memes](../resources/memes-preview.png)

## Dataset and DataLoaders

In [7]:
class MemotionDataset(Dataset):
    """Map-style dataset yielding one meme (image + caption + label) per row.

    Args:
        df: Frame with at least the columns ``image_name``, ``text_corrected``
            and ``label`` (as written by ``create_folds``).

    Each item is a dict with keys ``image``, ``input_ids``, ``attention_mask``
    and ``label``; the keys match the keyword arguments of
    ``MemotionModel.forward``.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        super().__init__()
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained(Config.tokenizer)
        self.transforms = A.Compose([
            A.Resize(height=Config.img_size[0], width=Config.img_size[1]),
            # Without this step the encoder receives raw uint8 pixels in 0-255.
            # mean/std of 0.5 rescale to [-1, 1] as float32.
            # NOTE(review): 0.5/0.5 is the published preprocessing for
            # google/vit-base-patch16-224 — confirm if Config.image_encoder changes.
            A.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
            ToTensorV2(),
        ])

    def __len__(self) -> int:
        return self.df.shape[0]

    def __getitem__(self, ix: int) -> Dict[str, torch.Tensor]:
        row = self.df.iloc[ix]

        # Image — open inside a context manager so the file handle is released.
        image_path = os.path.join('../memotion_dataset_7k/images', row['image_name'].lower())
        with Image.open(image_path) as handle:
            img = np.array(handle.convert('RGB'))
        img = self.transforms(image=img)['image']

        # Text — a missing caption is read by pandas as NaN (a float), which has
        # no .lower(); fall back to an empty string in that case.
        text = row['text_corrected']
        text = text.lower() if isinstance(text, str) else ''
        out = self.tokenizer(
            text=text,
            max_length=Config.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'image': img,
            # return_tensors="pt" adds a leading batch axis of size 1; drop only that axis.
            'input_ids': out['input_ids'].squeeze(0),
            'attention_mask': out['attention_mask'].squeeze(0),
            'label': torch.tensor(int(row['label']), dtype=torch.long)
        }

# Model

## Image Encoder

In [8]:
class ImageEncoder(nn.Module):
    """Wraps the pretrained vision backbone named by ``Config.image_encoder``."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.encoder = AutoModel.from_pretrained(Config.image_encoder)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the backbone's ``pooler_output`` for a batch of image tensors."""
        # Call the module rather than .forward() so registered hooks still run.
        return self.encoder(x)["pooler_output"]

## Text Encoder

In [9]:
class TextEncoder(nn.Module):
    """Wraps the pretrained text backbone.

    The checkpoint id is taken from ``Config.tokenizer`` so the model always
    matches the tokenizer used by ``MemotionDataset``.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.encoder = AutoModel.from_pretrained(Config.tokenizer)

    def forward(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Return the backbone's ``pooler_output`` for a tokenized batch."""
        # Call the module rather than .forward() so registered hooks still run.
        x = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        return x["pooler_output"]

## Memotion Model

In [22]:
class MemotionModel(nn.Module):
    """Late-fusion classifier over an image embedding and a text embedding.

    The two encoder outputs are combined as
    ``alpha_txt * text + alpha_img * image`` with learnable scalar weights and
    passed through a three-layer linear head producing class logits.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        # nn.Parameter (not a bare tensor) so the fusion weights follow
        # model.to(device), are returned by model.parameters() and therefore
        # optimised, and are stored in state_dict().
        self.alpha_img = nn.Parameter(torch.randn(size=(1,)))
        self.alpha_txt = nn.Parameter(torch.randn(size=(1,)))
        # 768 must equal the pooled embedding width of both encoders.
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, Config.num_classes)
        self.dropout = nn.Dropout(p=0.2)
        # NOTE(review): there is no non-linearity between fc1/fc2/fc3, so the
        # head is equivalent to a single linear map (plus dropout). Left as-is
        # to keep the architecture unchanged.

    def forward(
        self, image: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor, label: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return unnormalised class logits.

        Args:
            image: Batch of image tensors for the image encoder.
            input_ids: Token ids for the text encoder.
            attention_mask: Attention mask for the text encoder.
            label: Ignored; accepted so a whole dataloader batch can be
                splatted in with ``model(**batch)``.
        """
        img_out = self.image_encoder(image)
        txt_out = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask
        )
        wt_emb = self.alpha_txt * txt_out + self.alpha_img * img_out
        x = self.fc1(self.dropout(wt_emb))
        x = self.fc2(self.dropout(x))
        return self.fc3(x)

wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


# Training

## Training One Epoch

In [43]:
def train_one_epoch(
    model: nn.Module,
    optimizer: optim.Optimizer,
    dataloader: DataLoader,
    scheduler=None,
) -> float:
    """Run one optimisation pass over ``dataloader`` and log per-batch metrics.

    Args:
        model: Network called as ``model(**batch)``; must return class logits.
        optimizer: Optimizer stepped once per batch.
        dataloader: Yields dict batches containing a ``"label"`` entry.
        scheduler: Optional LR scheduler, stepped once per batch.

    Returns:
        The sample-weighted mean training loss over the epoch
        (``nan`` if the dataloader is empty).
    """
    model.train()
    device = Config.device
    dataset_size = 0
    running_loss = 0.0
    epoch_loss = float("nan")  # stays nan only if the dataloader yields nothing

    criterion = nn.CrossEntropyLoss()
    # Metrics keep internal state tensors; they must live on the same device
    # as the predictions they are updated with.
    metric_args = {"task": "multiclass", "num_classes": Config.num_classes}
    accuracy_metric = Accuracy(**metric_args).to(device)
    precision_metric = Precision(**metric_args).to(device)
    recall_metric = Recall(**metric_args).to(device)
    auroc_metric = AUROC(**metric_args).to(device)
    f1_metrics = F1Score(**metric_args).to(device)

    pbar = tqdm(dataloader, total=len(dataloader), desc="(train) ")
    for batch in pbar:
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["label"]

        optimizer.zero_grad()
        yHat = model(**batch)
        loss = criterion(yHat, labels)
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        batch_size = labels.shape[0]
        running_loss += loss.item() * batch_size
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        # Detach so the metrics never hold on to the autograd graph.
        logits = yHat.detach()
        out = torch.argmax(logits, dim=1)
        accuracy = accuracy_metric(out, labels).item()
        precision = precision_metric(out, labels).item()
        recall = recall_metric(out, labels).item()
        auroc = auroc_metric(F.softmax(logits, dim=1), labels).item()
        f1 = f1_metrics(out, labels).item()
        current_lr = optimizer.param_groups[0]["lr"]

        # No explicit `step=`: the batch index restarts at 0 every epoch, and
        # wandb ignores log calls whose step is lower than one already logged.
        wandb.log(
            {
                "train/loss": epoch_loss,
                "train/accuracy": accuracy,
                "train/precision": precision,
                "train/recall": recall,
                "train/auroc": auroc,
                "train/f1": f1,
                "train/current_lr": current_lr,
            }
        )

        pbar.set_postfix(epoch_loss=f"{epoch_loss:.5f}", current_lr=f"{current_lr:.5f}")

    return epoch_loss

## Validating One Epoch

In [44]:
@torch.no_grad()
def validate_one_epoch(
    model: nn.Module, dataloader: DataLoader
) -> Tuple[float, dict]:
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Network called as ``model(**batch)``; must return class logits.
        dataloader: Yields dict batches containing a ``"label"`` entry.

    Returns:
        ``(epoch_loss, val_scores)`` where ``epoch_loss`` is the
        sample-weighted mean loss (``nan`` if the dataloader is empty) and
        ``val_scores`` maps ``accuracy``/``precision``/``recall``/``auroc``/``f1``
        to a list of per-batch Python floats.
    """
    # Evaluation mode: disables dropout so validation is deterministic.
    model.eval()
    device = Config.device
    dataset_size = 0
    running_loss = 0.0
    epoch_loss = float("nan")  # stays nan only if the dataloader yields nothing

    criterion = nn.CrossEntropyLoss()
    # Metrics keep internal state tensors; they must live on the same device
    # as the predictions they are updated with.
    metric_args = {"task": "multiclass", "num_classes": Config.num_classes}
    accuracy_metric = Accuracy(**metric_args).to(device)
    precision_metric = Precision(**metric_args).to(device)
    recall_metric = Recall(**metric_args).to(device)
    auroc_metric = AUROC(**metric_args).to(device)
    f1_metrics = F1Score(**metric_args).to(device)

    val_scores = defaultdict(list)

    pbar = tqdm(dataloader, total=len(dataloader), desc="(valid) ")
    for batch in pbar:
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["label"]
        yHat = model(**batch)

        loss = criterion(yHat, labels)

        batch_size = labels.shape[0]
        running_loss += loss.item() * batch_size
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        out = torch.argmax(yHat, dim=1)
        # .item() yields plain floats, so callers can average the lists with
        # NumPy even when the metrics were computed on a CUDA device.
        accuracy = accuracy_metric(out, labels).item()
        precision = precision_metric(out, labels).item()
        recall = recall_metric(out, labels).item()
        auroc = auroc_metric(F.softmax(yHat, dim=1), labels).item()
        f1 = f1_metrics(out, labels).item()

        val_scores["accuracy"].append(accuracy)
        val_scores["precision"].append(precision)
        val_scores["recall"].append(recall)
        val_scores["auroc"].append(auroc)
        val_scores["f1"].append(f1)

        # No explicit `step=`: the batch index restarts at 0 on every call, and
        # wandb ignores log calls whose step is lower than one already logged.
        wandb.log(
            {
                "valid/loss": epoch_loss,
                "valid/accuracy": accuracy,
                "valid/precision": precision,
                "valid/recall": recall,
                "valid/auroc": auroc,
                "valid/f1": f1,
            }
        )

    return epoch_loss, val_scores

## Train One Fold

In [45]:
def _epoch_mean(values) -> float:
    """Average per-batch metric values given as floats or (possibly CUDA) tensors."""
    if len(values) == 0:
        return float("nan")
    stacked = torch.stack(
        [torch.as_tensor(v, dtype=torch.float32).detach().cpu() for v in values]
    )
    return float(stacked.mean())


def run_training(
    model: nn.Module,
    optimizer: optim,
    trainloader: DataLoader,
    validloader: DataLoader,
    run: wandb,
    fold: int,
    scheduler: lr_scheduler = None,
) -> Tuple[nn.Module, defaultdict]:
    """Train ``model`` for ``Config.epochs`` epochs on one fold.

    After every epoch the validation metrics are logged to wandb, the latest
    weights are written to ``../artifacts/models/last`` and, when the
    validation loss improves, the weights are also written to
    ``../artifacts/models/best`` and uploaded with ``wandb.save``.

    Args:
        model: Network to train (already on ``Config.device``).
        optimizer: Optimizer for ``model``.
        trainloader: Training batches.
        validloader: Validation batches.
        run: Active wandb run; its ``summary`` receives the best-epoch metrics.
        fold: Fold index, used only in output file names.
        scheduler: Optional LR scheduler forwarded to ``train_one_epoch``.

    Returns:
        ``(model, history)`` — the model reloaded with the best weights, and a
        dict mapping each metric name to its per-epoch validation mean.
    """
    metric_names = ("accuracy", "precision", "recall", "auroc", "f1")

    best_path = f"../artifacts/models/best/best_epoch-{fold:02d}.bin"
    last_path = f"../artifacts/models/last/last_epoch-{fold:02d}.bin"
    history_path = f"../artifacts/history/fold-{fold:02d}.pth"
    # torch.save does not create missing parent directories.
    for path in (best_path, last_path, history_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    wandb.watch(models=[model], log_freq=100)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf
    best_epoch = -1
    history = defaultdict(list)

    for epoch in range(Config.epochs):
        gc.collect()
        print(f"\t\t\t\t########## EPOCH [{epoch+1}/{Config.epochs}] ##########")
        train_loss = train_one_epoch(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            dataloader=trainloader,
        )
        valid_loss, valid_scores = validate_one_epoch(
            model=model, dataloader=validloader
        )

        # Compute each epoch mean once and reuse it for logging, history and summary.
        epoch_means = {name: _epoch_mean(valid_scores[name]) for name in metric_names}

        wandb.log(
            {
                "train/epoch/loss": train_loss,
                "valid/epoch/loss": valid_loss,
                "valid/epoch/accuracy": epoch_means["accuracy"],
                "valid/epoch/precision": epoch_means["precision"],
                "valid/epoch/recall": epoch_means["recall"],
                "valid/epoch/auroc": epoch_means["auroc"],
                "valid/epoch/f1": epoch_means["f1"],
                "current_lr": optimizer.param_groups[0]["lr"],
            }
        )

        for name in metric_names:
            history[name].append(epoch_means[name])

        print(
            f'Valid Accuracy: {epoch_means["accuracy"]:.5f} | Valid Loss: {valid_loss:.5f}'
        )

        if valid_loss < best_loss:
            print(
                f"{c_}Validation Score Improved from {best_loss:.5f} to {valid_loss:.5f}"
            )
            best_epoch = epoch + 1
            best_loss = valid_loss
            run.summary["Best Loss"] = best_loss
            run.summary["Best Epoch"] = best_epoch
            run.summary["Best Accuracy"] = epoch_means["accuracy"]
            run.summary["Best Precision"] = epoch_means["precision"]
            run.summary["Best Recall"] = epoch_means["recall"]
            run.summary["Best AUROC"] = epoch_means["auroc"]
            run.summary["Best F1 Score"] = epoch_means["f1"]

            # Deep copy: the live state_dict keeps changing as training continues.
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(obj=best_model_wts, f=best_path)
            wandb.save(best_path)
            print(f"MODEL SAVED!{sr_}")

        # Written straight to disk, so no in-memory copy is needed.
        torch.save(model.state_dict(), last_path)

    model.load_state_dict(best_model_wts, strict=True)
    torch.save(history, f=history_path)
    return model, history

## Run Training

In [15]:
# Preview the fold file written by create_folds(); `label` and `kfold` are the
# columns consumed by the dataset and by prepare_dataloaders.
df = pd.read_csv('../memotion_dataset_7k/folds.csv')
df.head()

Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,label,kfold
0,image_4112.jpg,YOU HAVE NEVER SEEN TITANIC?!? RU SRS? quickme...,YOU HAVE NEVER SEEN TITANIC?!? R U SRS? quickm...,very_funny,general,slight,not_motivational,positive,1,0
1,image_3710.jpg,SAYS THEY'VE BEEN REBUILDING FOR 5 YEARS #SCOT...,SAYS THEY'VE BEEN REBUILDING FOR 5 YEARS #SCOT...,very_funny,twisted_meaning,very_offensive,motivational,positive,2,0
2,image_3739.jpg,spiderman homecoming looks amazing,spiderman homecoming looks amazing,very_funny,general,not_offensive,not_motivational,positive,0,0
3,image_62.jpg,THANOS GETS ALL INFINITY STONES AT THE END OF ...,THANOS GETS ALL INFINITY STONES AT THE END OF ...,very_funny,general,not_offensive,not_motivational,positive,0,0
4,image_5891.jpg,I do not want pizza I want my oscar,I do not want pizza I want my oscar,funny,not_sarcastic,slight,not_motivational,neutral,1,0


In [17]:
def prepare_dataloaders(fold) -> Tuple[DataLoader, DataLoader]:
    """Build the train/validation dataloaders for one cross-validation fold.

    Args:
        fold: Value of the ``kfold`` column held out for validation; every
            other row goes to the training set.

    Returns:
        ``(train_dataloader, valid_dataloader)``. Training batches are
        shuffled, validation batches are not.

    Raises:
        ValueError: if no row in ``folds.csv`` belongs to ``fold``.
    """
    df = pd.read_csv('../memotion_dataset_7k/folds.csv')

    is_valid = df['kfold'] == fold
    # An unknown fold would otherwise give an empty validation loader and a
    # confusing failure much later in the training loop.
    if not is_valid.any():
        raise ValueError(f"No rows with kfold == {fold!r} in folds.csv")

    train_df = df[~is_valid].reset_index(drop=True)
    valid_df = df[is_valid].reset_index(drop=True)

    train_dataset = MemotionDataset(df=train_df)
    valid_dataset = MemotionDataset(df=valid_df)

    train_dataloader = DataLoader(dataset=train_dataset, batch_size=Config.train_bs, shuffle=True)
    valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=Config.valid_bs, shuffle=False)

    return train_dataloader, valid_dataloader

In [19]:
%%time
# Smoke check: time how long it takes to build the loaders for a single fold.
train, valid = prepare_dataloaders(2)

CPU times: user 128 ms, sys: 23.9 ms, total: 152 ms
Wall time: 2.51 s


In [28]:
# Train one model per fold, each inside its own wandb run.
for fold in range(Config.num_folds):
    print('#'*15)
    print(f'### Fold [{fold+1}/{Config.num_folds}]')
    print('#'*15)

    run = wandb.init(
        project='multimodal-sentiment-analysis',
        # Forward every non-dunder Config attribute as the run configuration.
        config={k:v for k, v in dict(vars(Config)).items() if '__' not in k},
        name=f'FOLD-{fold+1}|MODEL-{Config.backbone}',
        group=f'FOLD-{fold+1}|MODEL-{Config.backbone}'
    )

    # try/finally: close the wandb run even if this fold raises, so an
    # interrupted fold does not leave a dangling run open in the kernel.
    try:
        trainloader, validloader = prepare_dataloaders(fold=fold)

        # Fresh model, optimizer and scheduler for every fold.
        model = MemotionModel().to(Config.device)
        optimizer = get_optimizer(model=model)
        scheduler = get_scheduler(optimizer=optimizer)

        model, history = run_training(model=model, optimizer=optimizer,
                                      trainloader=trainloader, validloader=validloader,
                                      run=run, fold=fold, scheduler=scheduler)
    finally:
        run.finish()

###############
### Fold [1/5]
###############


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016752754866683973, max=1.0…

Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


				########## EPOCH [1/50] ##########


(train) :   0%|                                         | 0/234 [00:15<?, ?it/s]
