In [11]:
import pandas as pd

# Load the raw annotation table. The file is read as tab-separated and the
# path is relative to the notebook's working directory.
df = pd.read_csv("MVSA/labelResultAll.txt", sep="\t")

# Show first 5 rows to check how the header and the annotator columns were parsed
print(df.head())


     ID         text,image       text,image.1       text,image.2
0  2499  positive,positive    neutral,neutral  positive,positive
1  2500   neutral,positive  positive,positive    neutral,neutral
2  2501  negative,negative    neutral,neutral  positive,positive
3  2502  positive,positive  positive,positive  positive,positive
4  2503   positive,neutral  positive,negative  positive,positive


In [43]:
# Report the row count and the exact column labels, so any stray characters in
# the header are visible before the columns are renamed.
print("Total samples:", len(df))
print("\nColumn names:")
print(df.columns)


Total samples: 19600

Column names:
Index(['\ID', 'text,image', 'text,image.1', 'text,image.2'], dtype='object')


In [45]:
import numpy as np

# 1) Rename the first column to a clean 'ID'.
#    The column is addressed by position, so the rename works whatever the
#    raw header text happens to be.
id_old_name = df.columns[0]
df = df.rename(columns={id_old_name: "ID"})

# 2) The next three columns are the raw annotator columns
annotator_raw_cols = df.columns[1:4]   # assumes structure: ID, text,image, text,image.1, text,image.2
print("Annotator columns:", annotator_raw_cols.tolist())

# 3) Create a new clean table for labels
labels_df = pd.DataFrame()
labels_df["ID"] = df["ID"].astype(str).str.strip()

# 4) For each annotator column, split "text_label,image_label" into
#    ann{i}_text and ann{i}_image
for i, col in enumerate(annotator_raw_cols, start=1):
    raw = df[col]
    present = raw.notna()

    # Split on the first comma only.
    s = raw.astype(str).str.strip()
    split_cols = s.str.split(",", n=1, expand=True)

    # If no cell contained a comma there is only one part. Add the second
    # part as an all-NaN *object* column: assigning a bare np.nan would create
    # a float column, on which the .str accessor below raises AttributeError.
    if split_cols.shape[1] == 1:
        split_cols[1] = pd.Series(np.nan, index=split_cols.index, dtype=object)

    # astype(str) turned missing cells into the literal string "nan";
    # restore them to real NaN so downstream pd.isna() checks see them.
    labels_df[f"ann{i}_text"]  = split_cols[0].str.strip().where(present)
    labels_df[f"ann{i}_image"] = split_cols[1].str.strip()

# 5) Check the result: this is what you'll use from now on
print(labels_df.head())
print("\nColumns in labels_df:", labels_df.columns.tolist())


Annotator columns: ['text,image', 'text,image.1', 'text,image.2']
     ID ann1_text ann1_image ann2_text ann2_image ann3_text ann3_image
0  2499  positive   positive   neutral    neutral  positive   positive
1  2500   neutral   positive  positive   positive   neutral    neutral
2  2501  negative   negative   neutral    neutral  positive   positive
3  2502  positive   positive  positive   positive  positive   positive
4  2503  positive    neutral  positive   negative  positive   positive

Columns in labels_df: ['ID', 'ann1_text', 'ann1_image', 'ann2_text', 'ann2_image', 'ann3_text', 'ann3_image']


In [46]:
import numpy as np

from collections import Counter

# 1) List of all 6 label columns
label_cols = [
    "ann1_text", "ann1_image",
    "ann2_text", "ann2_image",
    "ann3_text", "ann3_image"
]

def majority_vote_row(row):
    """Collapse one row's annotator labels into a single sentiment label.

    Every column listed in ``label_cols`` casts one vote. Missing values and
    anything that is not 'positive' / 'neutral' / 'negative' (after trimming
    and lower-casing) are ignored.

    Parameters
    ----------
    row : mapping indexed by column name (e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str or float
        The most frequent label. On a tie 'neutral' wins if it is among the
        tied labels; otherwise the tied label that appears first in
        ``label_cols`` order wins. ``np.nan`` if the row has no valid label.
    """
    votes = Counter()
    for col in label_cols:
        val = row[col]
        if pd.isna(val):
            continue
        val = str(val).strip().lower()
        if val in ("positive", "neutral", "negative"):
            votes[val] += 1

    # If no valid labels for some reason
    if not votes:
        return np.nan

    max_count = max(votes.values())
    # Counter keeps insertion order, so this list is in first-seen order,
    # which makes the final fallback below fully deterministic.
    top_labels = [lbl for lbl, n in votes.items() if n == max_count]

    # If only one clear winner, use it
    if len(top_labels) == 1:
        return top_labels[0]

    # Tie-break rule: prefer neutral if present, else pick the first seen
    if "neutral" in top_labels:
        return "neutral"
    return top_labels[0]

# 2) Apply majority vote to each row to get final_label
#    (the vote uses every column in label_cols, i.e. text AND image labels)
labels_df["final_label"] = labels_df.apply(majority_vote_row, axis=1)

# 3) Inspect result. Only the three text columns are printed here, so
#    final_label can differ from what these three columns alone would suggest.
print(labels_df[["ID", "ann1_text", "ann2_text", "ann3_text", "final_label"]].head())

# Class balance of the aggregated labels
print("\nFinal label distribution:")
print(labels_df["final_label"].value_counts())


     ID ann1_text ann2_text ann3_text final_label
0  2499  positive   neutral  positive    positive
1  2500   neutral  positive   neutral     neutral
2  2501  negative   neutral  positive     neutral
3  2502  positive  positive  positive    positive
4  2503  positive  positive  positive    positive

Final label distribution:
final_label
positive    9367
neutral     9108
negative    1125
Name: count, dtype: int64


In [48]:
import os

# === 1) Make a clean labels table with only what you need for training ===
clean_labels = labels_df[["ID", "final_label"]].copy()

# Path to your MVSA folder. Set the MVSA_BASE_DIR environment variable to use
# a different location without editing the notebook; when it is unset the
# original default path is used.
BASE_DIR = os.environ.get("MVSA_BASE_DIR", "/Users/sahilanjum/Desktop/MVSA")
LABELS_PATH = os.path.join(BASE_DIR, "labels_clean.csv")

# Save the clean labels to a CSV (index=False: the row index is not written)
clean_labels.to_csv(LABELS_PATH, index=False)
print(f"Saved clean labels to: {LABELS_PATH}")
print(clean_labels.head())


# === 2) (Optional but very useful) check that text + image files exist ===

# Folder holding the per-sample files; change the last component if yours differs
DATA_DIR = os.path.join(BASE_DIR, "data")

def id_to_paths(id_str):
    """Return ``(txt_path, img_path)`` for a sample ID.

    Both paths live directly under ``DATA_DIR`` and are named ``<id>.txt``
    and ``<id>.jpg``. Nothing is checked on disk.
    """
    txt_path, img_path = (
        os.path.join(DATA_DIR, f"{id_str}{suffix}") for suffix in (".txt", ".jpg")
    )
    return txt_path, img_path

# Spot-check: for the first five labelled IDs, report whether the expected
# text and image files are present under DATA_DIR.
print("\nChecking first 5 IDs for matching .txt and .jpg files:\n")
for _, row in clean_labels.head(5).iterrows():
    ID = str(row["ID"])
    txt_path, img_path = id_to_paths(ID)
    print(
        f"ID: {ID} | "
        f"txt exists: {os.path.exists(txt_path)} | "
        f"jpg exists: {os.path.exists(img_path)}"
    )
    # Also print the resolved paths so a False above is easy to diagnose
    print("  txt:", txt_path)
    print("  img:", img_path)


Saved clean labels to: /Users/sahilanjum/Desktop/MVSA/labels_clean.csv
     ID final_label
0  2499    positive
1  2500     neutral
2  2501     neutral
3  2502    positive
4  2503    positive

Checking first 5 IDs for matching .txt and .jpg files:

ID: 2499 | txt exists: True | jpg exists: True
  txt: /Users/sahilanjum/Desktop/MVSA/data/2499.txt
  img: /Users/sahilanjum/Desktop/MVSA/data/2499.jpg
ID: 2500 | txt exists: True | jpg exists: True
  txt: /Users/sahilanjum/Desktop/MVSA/data/2500.txt
  img: /Users/sahilanjum/Desktop/MVSA/data/2500.jpg
ID: 2501 | txt exists: True | jpg exists: True
  txt: /Users/sahilanjum/Desktop/MVSA/data/2501.txt
  img: /Users/sahilanjum/Desktop/MVSA/data/2501.jpg
ID: 2502 | txt exists: True | jpg exists: True
  txt: /Users/sahilanjum/Desktop/MVSA/data/2502.txt
  img: /Users/sahilanjum/Desktop/MVSA/data/2502.jpg
ID: 2503 | txt exists: True | jpg exists: True
  txt: /Users/sahilanjum/Desktop/MVSA/data/2503.txt
  img: /Users/sahilanjum/Desktop/MVSA/data/2503.j

In [49]:
# Summary of clean_labels: row count, per-column non-null counts and dtypes
clean_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19600 entries, 0 to 19599
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           19600 non-null  object
 1   final_label  19600 non-null  object
dtypes: object(2)
memory usage: 306.4+ KB


In [50]:
# Preview the first rows of the (ID, final_label) table
clean_labels.head()

Unnamed: 0,ID,final_label
0,2499,positive
1,2500,neutral
2,2501,neutral
3,2502,positive
4,2503,positive


In [53]:
import os
from PIL import Image

import torch
from torch.utils.data import Dataset
from torchvision import transforms
from transformers import AutoTokenizer

import pandas as pd

# 1) Load the clean labels file
# NOTE: this rebinds `labels_df`. From here on it is the table read back from
# labels_clean.csv, not the per-annotator table built in the earlier cells.
labels_path = os.path.join(BASE_DIR, "labels_clean.csv")
labels_df = pd.read_csv(labels_path)

# 2) Map labels to integers for the model (and the inverse for reporting)
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

# 3) Text tokenizer (you can change the model name later)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 4) Image transforms (simple version, you can tweak later)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),          # fixed 224x224 input size
    transforms.ToTensor(),                  # [0,1]
    transforms.Normalize(                   # ImageNet-style normalization
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

class MVSADataset(Dataset):
    """Text-only dataset: one tokenized ``<ID>.txt`` file plus its label per item.

    ``image_transform`` is accepted and stored so the constructor matches the
    other dataset classes in this notebook, but this class never loads images.
    """

    def __init__(self, labels_df, data_dir, tokenizer, image_transform=None, max_length=64):
        # Re-number rows 0..n-1 so positional lookups in __getitem__ line up
        # with whatever subset (train/val/test) was passed in.
        self.labels_df = labels_df.reset_index(drop=True)
        self.data_dir, self.tokenizer = data_dir, tokenizer
        self.image_transform, self.max_length = image_transform, max_length

    def __len__(self):
        """Number of labelled samples."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``, ``label`` and ``id``."""
        record = self.labels_df.iloc[idx]
        sample_id = str(record["ID"])
        # Integer class index via the notebook-level label2id mapping
        label = label2id[record["final_label"]]

        # Read the sample's text; undecodable bytes are dropped, not raised
        text_file = os.path.join(self.data_dir, f"{sample_id}.txt")
        with open(text_file, "r", encoding="utf-8", errors="ignore") as handle:
            text = handle.read().strip()

        # Pad/truncate to exactly max_length tokens
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; squeeze(0) drops that dimension
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
            "id": sample_id,
        }

# 5) Create one dataset instance (you'll later split into train/val/test)
dataset = MVSADataset(
    labels_df=labels_df,
    data_dir=DATA_DIR,
    tokenizer=tokenizer,
    image_transform=image_transform,
    max_length=64,
)

# Quick sanity check: fetch the first item and confirm its keys, the token
# tensor shape, and that the integer label maps back to a label name
print("Dataset size:", len(dataset))
sample = dataset[0]
print("Keys in one sample:", sample.keys())
print("input_ids shape:", sample["input_ids"].shape)
print("label:", sample["label"], "→", id2label[sample["label"].item()])


Dataset size: 19600
Keys in one sample: dict_keys(['input_ids', 'attention_mask', 'label', 'id'])
input_ids shape: torch.Size([64])
label: tensor(2) → positive


In [55]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# 1) Train/Val/Test split (70/15/15), stratified by label
# First hold out 30% of the rows ...
train_df, temp_df = train_test_split(
    labels_df,
    test_size=0.30,
    stratify=labels_df["final_label"],
    random_state=42,
)

# ... then cut the held-out 30% in half to get validation and test.
# The fixed random_state keeps both splits reproducible.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["final_label"],
    random_state=42,
)

print("Train size:", len(train_df))
print("Val size:  ", len(val_df))
print("Test size: ", len(test_df))

# 2) One text dataset per split; all three share the tokenizer, transform and
#    maximum sequence length
train_dataset = MVSADataset(train_df, DATA_DIR, tokenizer, image_transform=image_transform, max_length=64)
val_dataset = MVSADataset(val_df, DATA_DIR, tokenizer, image_transform=image_transform, max_length=64)
test_dataset = MVSADataset(test_df, DATA_DIR, tokenizer, image_transform=image_transform, max_length=64)

# 3) DataLoaders: only the training loader shuffles
BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

print("Train batches:", len(train_loader))
print("Val batches:  ", len(val_loader))
print("Test batches: ", len(test_loader))


Train size: 13720
Val size:   2940
Test size:  2940
Train batches: 858
Val batches:   184
Test batches:  184


In [57]:
import torch
from torch import nn
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

# 1) Device: CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 2) Text-only baseline model (BERT)
num_labels = 3  # negative, neutral, positive
text_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)
text_model.to(device)

# 3) Optimizer, scheduler, loss
EPOCHS = 3
LR = 2e-5

optimizer = AdamW(text_model.parameters(), lr=LR)

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch * epochs; the first 10% of those steps are warm-up.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

# NOTE: train_one_epoch / eval_one_epoch below read the loss from the model
# output (labels are passed to the model), so `criterion` is not used by them.
criterion = nn.CrossEntropyLoss()

# 4) Simple training + validation loop (text-only baseline)
def train_one_epoch(model, data_loader):
    """Run one optimisation pass over ``data_loader``.

    Uses the notebook-level ``optimizer``, ``scheduler`` and ``device``; the
    loss comes from the model itself because labels are passed to it.
    Returns the mean of the per-batch losses.
    """
    model.train()
    running_loss = 0.0

    for batch in data_loader:
        optimizer.zero_grad()

        features = {
            key: batch[key].to(device) for key in ("input_ids", "attention_mask")
        }
        targets = batch["label"].to(device)

        step_loss = model(**features, labels=targets).loss
        step_loss.backward()

        # One optimizer step and one scheduler step per batch
        optimizer.step()
        scheduler.step()

        running_loss += step_loss.item()

    return running_loss / len(data_loader)


def eval_one_epoch(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Returns ``(avg_loss, accuracy)``: the mean of the per-batch losses and the
    fraction of samples whose arg-max prediction equals the label (0.0 when
    no samples were seen).
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            targets = batch["label"].to(device)
            result = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=targets,
            )

            loss_sum += result.loss.item()

            predicted = result.logits.argmax(dim=-1)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    mean_loss = loss_sum / len(data_loader)
    if n_seen > 0:
        return mean_loss, n_correct / n_seen
    return mean_loss, 0.0


# Train for EPOCHS epochs, validating after each one and logging both losses
# and the validation accuracy.
for epoch in range(EPOCHS):
    train_loss = train_one_epoch(text_model, train_loader)
    val_loss, val_acc = eval_one_epoch(text_model, val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f} | val_acc={val_acc:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu
Epoch 1/3 | train_loss=0.8189 | val_loss=0.7473 | val_acc=0.6303
Epoch 2/3 | train_loss=0.6994 | val_loss=0.7491 | val_acc=0.6255
Epoch 3/3 | train_loss=0.5774 | val_loss=0.8183 | val_acc=0.6143


In [59]:
import torch
from sklearn.metrics import classification_report, confusion_matrix

# Make sure we're in eval mode
text_model.eval()

# Predictions and gold labels for the whole test split, as plain Python ints
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # No labels passed here: only the logits are needed
        outputs = text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

# Overall accuracy
correct = sum(int(p == y) for p, y in zip(all_preds, all_labels))
accuracy = correct / len(all_labels)
print(f"Test accuracy: {accuracy:.4f}")

# Re-declare the label mapping so this cell does not depend on the earlier one
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

# Listed in class-index order (0, 1, 2) to match label2id
target_names = ["negative", "neutral", "positive"]

print("\nClassification report:")
print(classification_report(all_labels, all_preds, target_names=target_names, digits=4))

print("\nConfusion matrix (rows=true, cols=pred):")
print(confusion_matrix(all_labels, all_preds))


Test accuracy: 0.6105

Classification report:
              precision    recall  f1-score   support

    negative     0.3750    0.3018    0.3344       169
     neutral     0.6154    0.5681    0.5908      1366
    positive     0.6273    0.6890    0.6567      1405

    accuracy                         0.6105      2940
   macro avg     0.5392    0.5196    0.5273      2940
weighted avg     0.6073    0.6105    0.6076      2940


Confusion matrix (rows=true, cols=pred):
[[ 51  82  36]
 [ 51 776 539]
 [ 34 403 968]]


In [65]:
import os
from PIL import Image, UnidentifiedImageError, ImageFile

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Allow loading truncated images (PIL will be less strict)
# This is a process-wide PIL setting, not local to this cell.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Image transform (same values as the transform defined for the text cell)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),                  # [0, 1]
    transforms.Normalize(                   # standard ImageNet normalization
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Label mapping (same as before)
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}


class MVSAImageDataset(Dataset):
    """Image-only dataset: one ``<ID>.jpg`` file plus its label per item.

    Parameters
    ----------
    labels_df : DataFrame with ``ID`` and ``final_label`` columns.
    data_dir : directory containing the ``<ID>.jpg`` files.
    image_transform : optional callable applied to the RGB PIL image.
    """

    def __init__(self, labels_df, data_dir, image_transform=None):
        # Re-number rows 0..n-1 so positional lookups match the given subset
        self.labels_df = labels_df.reset_index(drop=True)
        self.data_dir = data_dir
        self.image_transform = image_transform

    def __len__(self):
        """Number of labelled samples."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Return a dict with ``image``, ``label`` (long tensor) and ``id``."""
        row = self.labels_df.iloc[idx]
        sample_id = str(row["ID"])
        label_str = row["final_label"]
        label = label2id[label_str]

        img_path = os.path.join(self.data_dir, f"{sample_id}.jpg")

        # Try to load the image; if corrupted/missing/truncated, use a dummy black image
        try:
            # The context manager releases the file handle even when decoding
            # fails part-way; convert() returns an independent in-memory copy.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
        except (FileNotFoundError, UnidentifiedImageError, OSError):
            # 224x224 black image as fallback
            image = Image.new("RGB", (224, 224), (0, 0, 0))

        if self.image_transform is not None:
            image = self.image_transform(image)

        return {
            "image": image,
            "label": torch.tensor(label, dtype=torch.long),
            "id": sample_id,
        }


# Create image-only datasets using the same splits as text
# (train_df / val_df / test_df come from the stratified split cell)
img_train_dataset = MVSAImageDataset(train_df, DATA_DIR, image_transform=image_transform)
img_val_dataset   = MVSAImageDataset(val_df,   DATA_DIR, image_transform=image_transform)
img_test_dataset  = MVSAImageDataset(test_df,  DATA_DIR, image_transform=image_transform)

# Image dataloaders (keep num_workers=0 on Mac to avoid multiprocessing issues)
IMG_BATCH_SIZE = 32

# Only the training loader shuffles
img_train_loader = DataLoader(img_train_dataset, batch_size=IMG_BATCH_SIZE, shuffle=True,  num_workers=0)
img_val_loader   = DataLoader(img_val_dataset,   batch_size=IMG_BATCH_SIZE, shuffle=False, num_workers=0)
img_test_loader  = DataLoader(img_test_dataset,  batch_size=IMG_BATCH_SIZE, shuffle=False, num_workers=0)

print("Image-only train batches:", len(img_train_loader))
print("Image-only val batches:  ", len(img_val_loader))
print("Image-only test batches: ", len(img_test_loader))



Image-only train batches: 429
Image-only val batches:   92
Image-only test batches:  92


In [67]:
import torch
from torch import nn
from torch.optim import AdamW
from torchvision import models
from transformers import get_linear_schedule_with_warmup

# 1) Device (reuse if already defined, otherwise this is safe)
# NOTE: the text-baseline cell assigns `device` (CUDA or CPU only). Once that
# cell has run, this guard is skipped and the MPS branch is never reached.
if "device" not in globals():
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
print("Using device for image model:", device)

# 2) Image-only baseline model (ResNet18)
num_labels = 3  # negative, neutral, positive

# Newer torchvision deprecates `pretrained=True` in favour of the `weights=`
# argument; IMAGENET1K_V1 is the checkpoint `pretrained=True` used to load.
# Fall back to the legacy keyword on versions without the weights enum.
if hasattr(models, "ResNet18_Weights"):
    img_model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    img_model = models.resnet18(pretrained=True)

# Replace the ImageNet classification head with a fresh 3-way linear layer
in_features = img_model.fc.in_features
img_model.fc = nn.Linear(in_features, num_labels)
img_model = img_model.to(device)

# 3) Optimizer, scheduler, loss
EPOCHS_IMG = 3
LR_IMG = 1e-4

img_optimizer = AdamW(img_model.parameters(), lr=LR_IMG)
img_criterion = nn.CrossEntropyLoss()

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch * epochs; the first 10% of those steps are warm-up.
total_steps_img = len(img_train_loader) * EPOCHS_IMG
img_scheduler = get_linear_schedule_with_warmup(
    img_optimizer,
    num_warmup_steps=int(0.1 * total_steps_img),
    num_training_steps=total_steps_img,
)

# 4) Training + validation loops for image-only model

def train_one_epoch_img(model, data_loader):
    """Run one optimisation pass of the image model over ``data_loader``.

    Uses the notebook-level ``img_optimizer``, ``img_scheduler``,
    ``img_criterion`` and ``device``. Logs the current batch loss every 100
    steps and returns the mean of the per-batch losses.
    """
    model.train()
    running_loss = 0.0
    n_batches = len(data_loader)

    for step_no, batch in enumerate(data_loader, start=1):
        img_optimizer.zero_grad()

        pixels = batch["image"].to(device)
        targets = batch["label"].to(device)

        step_loss = img_criterion(model(pixels), targets)
        step_loss.backward()

        # One optimizer step and one scheduler step per batch
        img_optimizer.step()
        img_scheduler.step()

        running_loss += step_loss.item()

        if step_no % 100 == 0:
            print(f"  [img train] step {step_no}/{n_batches}  loss={step_loss.item():.4f}")

    return running_loss / len(data_loader)


def eval_one_epoch_img(model, data_loader):
    """Evaluate the image model on ``data_loader`` without gradient tracking.

    Returns ``(avg_loss, accuracy)``: the mean of the per-batch
    ``img_criterion`` losses and the fraction of correct arg-max predictions
    (0.0 when no samples were seen).
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            pixels = batch["image"].to(device)
            targets = batch["label"].to(device)

            scores = model(pixels)
            loss_sum += img_criterion(scores, targets).item()

            predicted = scores.argmax(dim=-1)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    mean_loss = loss_sum / len(data_loader)
    if n_seen > 0:
        return mean_loss, n_correct / n_seen
    return mean_loss, 0.0


# Train the image model for EPOCHS_IMG epochs, validating after each one
for epoch in range(EPOCHS_IMG):
    print(f"\n[Image-only] Epoch {epoch+1}/{EPOCHS_IMG}")
    train_loss_img = train_one_epoch_img(img_model, img_train_loader)
    val_loss_img, val_acc_img = eval_one_epoch_img(img_model, img_val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS_IMG} | train_loss={train_loss_img:.4f} | val_loss={val_loss_img:.4f} | val_acc={val_acc_img:.4f}")


Using device for image model: cpu

[Image-only] Epoch 1/3




  [img train] step 100/429  loss=0.9671
  [img train] step 200/429  loss=0.8587
  [img train] step 300/429  loss=0.9306
  [img train] step 400/429  loss=0.9194
Epoch 1/3 | train_loss=0.9350 | val_loss=0.8453 | val_acc=0.5548

[Image-only] Epoch 2/3
  [img train] step 100/429  loss=0.6460
  [img train] step 200/429  loss=0.8919
  [img train] step 300/429  loss=0.5579
  [img train] step 400/429  loss=0.6164
Epoch 2/3 | train_loss=0.7063 | val_loss=0.8799 | val_acc=0.5466

[Image-only] Epoch 3/3
  [img train] step 100/429  loss=0.4342
  [img train] step 200/429  loss=0.3832
  [img train] step 300/429  loss=0.4459
  [img train] step 400/429  loss=0.3990
Epoch 3/3 | train_loss=0.4469 | val_loss=0.9303 | val_acc=0.5476


In [68]:
import torch
from sklearn.metrics import classification_report, confusion_matrix

# Ensure eval mode
img_model.eval()

# Predictions and gold labels for the whole test split, as plain Python ints
all_img_preds = []
all_img_labels = []

with torch.no_grad():
    for batch in img_test_loader:
        images = batch["image"].to(device)
        labels = batch["label"].to(device)

        logits = img_model(images)
        preds = torch.argmax(logits, dim=-1)

        all_img_preds.extend(preds.cpu().tolist())
        all_img_labels.extend(labels.cpu().tolist())

# Overall accuracy
correct_img = sum(int(p == y) for p, y in zip(all_img_preds, all_img_labels))
accuracy_img = correct_img / len(all_img_labels)
print(f"[Image-only] Test accuracy: {accuracy_img:.4f}")

# Label names, listed in class-index order (0, 1, 2) to match label2id
label2id = {"negative": 0, "neutral": 1, "positive": 2}
target_names = ["negative", "neutral", "positive"]

print("\n[Image-only] Classification report:")
print(classification_report(all_img_labels, all_img_preds, target_names=target_names, digits=4))

print("\n[Image-only] Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(all_img_labels, all_img_preds))


[Image-only] Test accuracy: 0.5527

[Image-only] Classification report:
              precision    recall  f1-score   support

    negative     0.3051    0.1065    0.1579       169
     neutral     0.5398    0.5703    0.5546      1366
    positive     0.5758    0.5893    0.5825      1405

    accuracy                         0.5527      2940
   macro avg     0.4736    0.4220    0.4317      2940
weighted avg     0.5435    0.5527    0.5451      2940


[Image-only] Confusion matrix (rows=true, cols=pred):
[[ 18  99  52]
 [ 29 779 558]
 [ 12 565 828]]


In [71]:
import os
from PIL import Image, UnidentifiedImageError, ImageFile

from torch.utils.data import Dataset, DataLoader

# Make sure truncated images don't crash us
# (process-wide PIL setting; repeated here so the cell also works on its own)
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Reuse label2id, tokenizer, image_transform, DATA_DIR, train_df/val_df/test_df

class MVSAMultimodalDataset(Dataset):
    """Paired text + image dataset: ``<ID>.txt`` and ``<ID>.jpg`` per item.

    Parameters
    ----------
    labels_df : DataFrame with ``ID`` and ``final_label`` columns.
    data_dir : directory containing the ``<ID>.txt`` / ``<ID>.jpg`` files.
    tokenizer : callable used to tokenize the text.
    image_transform : optional callable applied to the RGB PIL image.
    max_length : token length every text is padded/truncated to.
    """

    def __init__(self, labels_df, data_dir, tokenizer, image_transform=None, max_length=64):
        # Re-number rows 0..n-1 so positional lookups match the given subset
        self.labels_df = labels_df.reset_index(drop=True)
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_length = max_length

    def __len__(self):
        """Number of labelled samples."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask``, ``image``, ``label`` and ``id``."""
        row = self.labels_df.iloc[idx]
        sample_id = str(row["ID"])
        label_str = row["final_label"]
        label = label2id[label_str]

        # --- Load text ---
        txt_path = os.path.join(self.data_dir, f"{sample_id}.txt")
        try:
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read().strip()
        except FileNotFoundError:
            text = ""  # fallback: empty text if missing

        text_enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # --- Load image ---
        img_path = os.path.join(self.data_dir, f"{sample_id}.jpg")
        try:
            # The context manager releases the file handle even when decoding
            # fails part-way; convert() returns an independent in-memory copy.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
        except (FileNotFoundError, UnidentifiedImageError, OSError):
            image = Image.new("RGB", (224, 224), (0, 0, 0))  # dummy black

        if self.image_transform is not None:
            image = self.image_transform(image)

        # The tokenizer returns a batch of one; squeeze(0) drops that dimension
        return {
            "input_ids": text_enc["input_ids"].squeeze(0),
            "attention_mask": text_enc["attention_mask"].squeeze(0),
            "image": image,
            "label": torch.tensor(label, dtype=torch.long),
            "id": sample_id,
        }


# Create multimodal datasets (same train/val/test splits as the baselines)
mm_train_dataset = MVSAMultimodalDataset(
    train_df, DATA_DIR, tokenizer, image_transform=image_transform, max_length=64
)
mm_val_dataset = MVSAMultimodalDataset(
    val_df, DATA_DIR, tokenizer, image_transform=image_transform, max_length=64
)
mm_test_dataset = MVSAMultimodalDataset(
    test_df, DATA_DIR, tokenizer, image_transform=image_transform, max_length=64
)

# Multimodal loaders (batch smaller because text+image together are heavier)
MM_BATCH_SIZE = 16

# Only the training loader shuffles
mm_train_loader = DataLoader(mm_train_dataset, batch_size=MM_BATCH_SIZE, shuffle=True,  num_workers=0)
mm_val_loader   = DataLoader(mm_val_dataset,   batch_size=MM_BATCH_SIZE, shuffle=False, num_workers=0)
mm_test_loader  = DataLoader(mm_test_dataset,  batch_size=MM_BATCH_SIZE, shuffle=False, num_workers=0)

print("Multimodal train batches:", len(mm_train_loader))
print("Multimodal val batches:  ", len(mm_val_loader))
print("Multimodal test batches: ", len(mm_test_loader))


Multimodal train batches: 858
Multimodal val batches:   184
Multimodal test batches:  184


In [73]:
import torch
from torch import nn
from torch.optim import AdamW
from torchvision import models
from transformers import AutoModel, get_linear_schedule_with_warmup

# 1) Device (reuse if already defined)
# NOTE: the text-baseline cell assigns `device` (CUDA or CPU only). Once that
# cell has run, this guard is skipped and the MPS branch is never reached.
if "device" not in globals():
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
print("Using device for multimodal model:", device)

num_labels = 3  # negative, neutral, positive

# 2) Multimodal model: BERT (text) + ResNet18 (image) + fusion head
class MultimodalSentimentModel(nn.Module):
    """Late-fusion classifier over a text encoder and a ResNet18 image encoder.

    The text encoder's hidden state at position 0 and the ResNet18 feature
    vector (final FC replaced by identity) are concatenated and passed
    through a small MLP that outputs ``num_labels`` logits.

    Parameters
    ----------
    text_model_name : checkpoint name given to ``AutoModel.from_pretrained``.
    num_labels : number of output classes.
    """

    def __init__(self, text_model_name="bert-base-uncased", num_labels=3):
        super().__init__()
        # Text encoder (CLS embedding)
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        text_hidden = self.text_encoder.config.hidden_size

        # Image encoder (ResNet18 without final FC).
        # Newer torchvision deprecates `pretrained=True` in favour of
        # `weights=`; IMAGENET1K_V1 is the checkpoint `pretrained=True` used
        # to load. Fall back to the legacy keyword on older versions.
        if hasattr(models, "ResNet18_Weights"):
            self.img_encoder = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        else:
            self.img_encoder = models.resnet18(pretrained=True)
        img_hidden = self.img_encoder.fc.in_features
        self.img_encoder.fc = nn.Identity()

        # Fusion + classifier
        fusion_dim = text_hidden + img_hidden
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_labels),
        )

    def forward(self, input_ids, attention_mask, image):
        """Return class logits of shape ``(batch, num_labels)``."""
        # Text: CLS token embedding
        text_out = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # CLS token is position 0
        text_cls = text_out.last_hidden_state[:, 0, :]  # (batch, hidden)

        # Image features
        img_feat = self.img_encoder(image)  # (batch, img_hidden)

        # Fuse by concatenating along the feature dimension
        fused = torch.cat([text_cls, img_feat], dim=1)  # (batch, fusion_dim)
        logits = self.classifier(fused)
        return logits

mm_model = MultimodalSentimentModel(num_labels=num_labels).to(device)

# 3) Class weights from train_df (to handle imbalance)
class_counts = train_df["final_label"].value_counts()
print("Train label counts:\n", class_counts)

# Order must match label2id (negative=0, neutral=1, positive=2), because the
# loss indexes the weight tensor by class id.
labels_order = ["negative", "neutral", "positive"]
total = len(train_df)
weights_list = []
for lbl in labels_order:
    # simple inverse-frequency style: higher weight for rarer classes
    # w = N / (num_classes * count)
    count = class_counts[lbl]
    w = total / (num_labels * count)
    weights_list.append(w)

class_weights = torch.tensor(weights_list, dtype=torch.float32).to(device)
print("Class weights (neg, neu, pos):", class_weights.tolist())

# Weighted loss, used for both training and evaluation of the multimodal model
mm_criterion = nn.CrossEntropyLoss(weight=class_weights)

# 4) Optimizer & scheduler
EPOCHS_MM = 3
LR_MM = 2e-5  # you can tune this later

mm_optimizer = AdamW(mm_model.parameters(), lr=LR_MM)

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch * epochs; the first 10% of those steps are warm-up.
total_steps_mm = len(mm_train_loader) * EPOCHS_MM
mm_scheduler = get_linear_schedule_with_warmup(
    mm_optimizer,
    num_warmup_steps=int(0.1 * total_steps_mm),
    num_training_steps=total_steps_mm,
)

# 5) Training + validation loops

def train_one_epoch_mm(model, data_loader):
    """Run one optimisation pass of the multimodal model over ``data_loader``.

    Uses the notebook-level ``mm_optimizer``, ``mm_scheduler``,
    ``mm_criterion`` and ``device``. Logs the current batch loss every 100
    steps and returns the mean of the per-batch losses.
    """
    model.train()
    running_loss = 0.0
    n_batches = len(data_loader)

    for step_no, batch in enumerate(data_loader, start=1):
        mm_optimizer.zero_grad()

        token_ids = batch["input_ids"].to(device)
        token_mask = batch["attention_mask"].to(device)
        pixels = batch["image"].to(device)
        targets = batch["label"].to(device)

        scores = model(input_ids=token_ids, attention_mask=token_mask, image=pixels)
        step_loss = mm_criterion(scores, targets)
        step_loss.backward()

        # One optimizer step and one scheduler step per batch
        mm_optimizer.step()
        mm_scheduler.step()

        running_loss += step_loss.item()

        if step_no % 100 == 0:
            print(f"  [mm train] step {step_no}/{n_batches}  loss={step_loss.item():.4f}")

    return running_loss / len(data_loader)


def eval_one_epoch_mm(model, data_loader):
    """Evaluate the multimodal model on ``data_loader`` without gradients.

    Returns ``(avg_loss, accuracy)``: the mean of the per-batch
    ``mm_criterion`` losses and the fraction of correct arg-max predictions
    (0.0 when no samples were seen).
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch["input_ids"].to(device)
            token_mask = batch["attention_mask"].to(device)
            pixels = batch["image"].to(device)
            targets = batch["label"].to(device)

            scores = model(input_ids=token_ids, attention_mask=token_mask, image=pixels)
            loss_sum += mm_criterion(scores, targets).item()

            predicted = scores.argmax(dim=-1)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    mean_loss = loss_sum / len(data_loader)
    if n_seen > 0:
        return mean_loss, n_correct / n_seen
    return mean_loss, 0.0


# Train the multimodal model for EPOCHS_MM epochs, validating after each one
for epoch in range(EPOCHS_MM):
    print(f"\n[Multimodal] Epoch {epoch+1}/{EPOCHS_MM}")
    train_loss_mm = train_one_epoch_mm(mm_model, mm_train_loader)
    val_loss_mm, val_acc_mm = eval_one_epoch_mm(mm_model, mm_val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS_MM} | train_loss={train_loss_mm:.4f} | val_loss={val_loss_mm:.4f} | val_acc={val_acc_mm:.4f}")


Using device for multimodal model: cpu




Train label counts:
 final_label
positive    6557
neutral     6376
negative     787
Name: count, dtype: int64
Class weights (neg, neu, pos): [5.811097145080566, 0.7172731161117554, 0.6974734663963318]

[Multimodal] Epoch 1/3
  [mm train] step 100/858  loss=1.1394
  [mm train] step 200/858  loss=0.9574
  [mm train] step 300/858  loss=0.8651
  [mm train] step 400/858  loss=0.7194
  [mm train] step 500/858  loss=1.0653
  [mm train] step 600/858  loss=0.7991
  [mm train] step 700/858  loss=0.9953
  [mm train] step 800/858  loss=1.0416
Epoch 1/3 | train_loss=0.9382 | val_loss=0.8564 | val_acc=0.5401

[Multimodal] Epoch 2/3
  [mm train] step 100/858  loss=0.4485
  [mm train] step 200/858  loss=0.9615
  [mm train] step 300/858  loss=0.3626
  [mm train] step 400/858  loss=0.6684
  [mm train] step 500/858  loss=0.5358
  [mm train] step 600/858  loss=1.6227
  [mm train] step 700/858  loss=0.5253
  [mm train] step 800/858  loss=0.4777
Epoch 2/3 | train_loss=0.7322 | val_loss=0.8557 | val_acc=0.59

In [75]:
import torch
from sklearn.metrics import classification_report, confusion_matrix

# Make sure model is in eval mode
mm_model.eval()

# Predictions and gold labels for the whole test split, as plain Python ints
all_mm_preds = []
all_mm_labels = []

with torch.no_grad():
    for batch in mm_test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        images = batch["image"].to(device)
        labels = batch["label"].to(device)

        logits = mm_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image=images,
        )
        preds = torch.argmax(logits, dim=-1)

        all_mm_preds.extend(preds.cpu().tolist())
        all_mm_labels.extend(labels.cpu().tolist())

# Overall accuracy
correct_mm = sum(int(p == y) for p, y in zip(all_mm_preds, all_mm_labels))
accuracy_mm = correct_mm / len(all_mm_labels)
print(f"[Multimodal] Test accuracy: {accuracy_mm:.4f}")

# Label names, listed in class-index order (0, 1, 2) to match label2id
label2id = {"negative": 0, "neutral": 1, "positive": 2}
target_names = ["negative", "neutral", "positive"]

print("\n[Multimodal] Classification report:")
print(classification_report(all_mm_labels, all_mm_preds, target_names=target_names, digits=4))

print("\n[Multimodal] Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(all_mm_labels, all_mm_preds))


[Multimodal] Test accuracy: 0.6126

[Multimodal] Classification report:
              precision    recall  f1-score   support

    negative     0.3038    0.5266    0.3853       169
     neutral     0.6262    0.5813    0.6029      1366
    positive     0.6657    0.6534    0.6595      1405

    accuracy                         0.6126      2940
   macro avg     0.5319    0.5871    0.5492      2940
weighted avg     0.6265    0.6126    0.6174      2940


[Multimodal] Confusion matrix (rows=true, cols=pred):
[[ 89  49  31]
 [142 794 430]
 [ 62 425 918]]
