<a href="https://www.kaggle.com/code/arifulhaquenoman/better-result?scriptVersionId=185519847" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mvsasingle/MVSA_Single/labelResultAll.txt
/kaggle/input/mvsasingle/MVSA_Single/data/1893.txt
/kaggle/input/mvsasingle/MVSA_Single/data/1711.txt
/kaggle/input/mvsasingle/MVSA_Single/data/4682.txt
/kaggle/input/mvsasingle/MVSA_Single/data/5064.txt
/kaggle/input/mvsasingle/MVSA_Single/data/3504.txt
/kaggle/input/mvsasingle/MVSA_Single/data/1269.jpg
/kaggle/input/mvsasingle/MVSA_Single/data/3863.jpg
/kaggle/input/mvsasingle/MVSA_Single/data/1773.txt
/kaggle/input/mvsasingle/MVSA_Single/data/623.jpg
/kaggle/input/mvsasingle/MVSA_Single/data/559.txt
/kaggle/input/mvsasingle/MVSA_Single/data/3750.jpg
/kaggle/input/mvsasingle/MVSA_Single/data/2008.jpg
/kaggle/input/mvsasingle/MVSA_Single/data/1812.txt
/kaggle/input/mvsasingle/MVSA_Single/data/1093.txt
/kaggle/input/mvsasingle/MVSA_Single/data/2081.jpg
/kaggle/input/mvsasingle/MVSA_Single/data/4417.txt
/kaggle/input/mvsasingle/MVSA_Single/data/3919.jpg
/kaggle/input/mvsasingle/MVSA_Single/data/4503.txt
/kaggle/input/mvsasingle/MVS

In [3]:
import os
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision.models as models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Data loading functions
def load_text_data(data_folder):
    texts = []
    filenames = sorted(os.listdir(data_folder), key=lambda x: int(x[:-4]) if x[:-4].isdigit() else x)
    for filename in filenames:
        if filename.endswith(".txt"):
            with open(os.path.join(data_folder, filename), 'r', encoding='latin-1') as file:
                text = file.read().strip()
                texts.append(text)
    return texts, filenames

In [6]:
def load_labels(result_file):
    labels = {}
    with open(result_file, 'r') as file:
        next(file)  # Skip header
        for line in file:
            parts = line.strip().split('\t')
            text_id = int(parts[0])
            text_label, image_label = parts[1].split(',')
            labels[text_id] = text_label.strip()
    return labels

In [7]:
def filter_existing_files(texts, filenames, labels, data_folder):
    existing_texts = []
    existing_images = []
    existing_labels = []
    for i, text in enumerate(texts):
        image_file = os.path.join(data_folder, f"{i+1}.jpg")
        if os.path.exists(image_file) and (i+1) in labels:
            existing_texts.append(text)
            existing_images.append(image_file)
            existing_labels.append(labels[i+1])
    return existing_texts, existing_images, existing_labels

In [8]:
# Paths
data_folder = "/kaggle/input/mvsasingle/MVSA_Single/data/"
result_file = '/kaggle/input/mvsasingle/MVSA_Single/labelResultAll.txt'

# Load data
texts, filenames = load_text_data(data_folder)
labels = load_labels(result_file)

# Filter existing files
texts, image_paths, labels = filter_existing_files(texts, filenames, labels, data_folder)

In [9]:
class MultimodalDataset(Dataset):
    def __init__(self, texts, images, labels, tokenizer, transform):
        self.texts = texts
        self.images = images
        self.labels = labels
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        image_path = self.images[idx]
        try:
            image = Image.open(image_path).convert('RGB')
        except FileNotFoundError:
            print(f"File not found: {image_path}")
            image = Image.new('RGB', (224, 224))  # Create a blank image

        label = self.labels[idx]

        encoded_text = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        image = self.transform(image)

        return {
            'text': encoded_text['input_ids'].squeeze(),
            'attention_mask': encoded_text['attention_mask'].squeeze(),
            'image': image,
            'label': torch.tensor(self.sentiment_to_label[label], dtype=torch.long)
        }

    sentiment_to_label = {'negative': 0, 'neutral': 1, 'positive': 2}

In [10]:
# Prepare tokenizer and transforms
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [12]:
# Split the data
train_texts, val_texts, train_images, val_images, train_labels, val_labels = train_test_split(
    texts, image_paths, labels, test_size=0.2, random_state=42, stratify=labels
)

In [13]:
# Create datasets and dataloaders
train_dataset = MultimodalDataset(train_texts, train_images, train_labels, tokenizer, train_transform)
val_dataset = MultimodalDataset(val_texts, val_images, val_labels, tokenizer, val_transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [14]:
class MultimodalSentimentModel(nn.Module):
    def __init__(self, bert_model, resnet_model, num_classes):
        super(MultimodalSentimentModel, self).__init__()
        self.text_model = bert_model
        self.image_model = resnet_model
        
        self.text_output_size = 768  # BERT output size
        self.image_output_size = 2048  # ResNet50 output size
        
        self.attention = nn.MultiheadAttention(self.text_output_size + self.image_output_size, 8)
        self.fc1 = nn.Linear(self.text_output_size + self.image_output_size, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask, image):
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)[1]
        image_output = self.image_model(image)
        
        combined = torch.cat((text_output, image_output), dim=1)
        combined = combined.unsqueeze(0)
        attn_output, _ = self.attention(combined, combined, combined)
        attn_output = attn_output.squeeze(0)
        
        x = self.fc1(attn_output)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [15]:
# Initialize models
bert_model = BertModel.from_pretrained('bert-base-uncased')
resnet_model = models.resnet50(pretrained=True)
resnet_model.fc = nn.Identity()  # Remove the final fully connected layer
model = MultimodalSentimentModel(bert_model, resnet_model, num_classes=3).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 147MB/s] 


In [16]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.1)

In [17]:
# Training loop
num_epochs = 50
best_val_loss = float('inf')
patience = 5
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0

    for batch in train_loader:
        input_ids = batch['text'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        train_total += labels.size(0)
        train_correct += predicted.eq(labels).sum().item()

    train_loss /= len(train_loader)
    train_acc = train_correct / train_total

    # Validation
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['text'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    val_loss /= len(val_loader)
    val_acc = val_correct / val_total

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss

Epoch 1/50:
Train Loss: 1.0715, Train Acc: 0.4098
Val Loss: 1.1138, Val Acc: 0.4132
Epoch 2/50:
Train Loss: 0.9717, Train Acc: 0.5197
Val Loss: 0.9387, Val Acc: 0.5575
Epoch 3/50:
Train Loss: 0.7936, Train Acc: 0.6444
Val Loss: 1.1436, Val Acc: 0.5108
Epoch 4/50:
Train Loss: 0.6193, Train Acc: 0.7486
Val Loss: 1.1461, Val Acc: 0.5347
Epoch 5/50:
Train Loss: 0.3993, Train Acc: 0.8421
Val Loss: 1.4310, Val Acc: 0.5542
Epoch 6/50:
Train Loss: 0.2815, Train Acc: 0.8991
Val Loss: 1.4902, Val Acc: 0.5477
Epoch 7/50:
Train Loss: 0.1298, Train Acc: 0.9623
Val Loss: 1.6259, Val Acc: 0.5575
Epoch 8/50:
Train Loss: 0.1195, Train Acc: 0.9623
Val Loss: 1.7532, Val Acc: 0.5607
Epoch 9/50:
Train Loss: 0.0894, Train Acc: 0.9704
Val Loss: 1.8764, Val Acc: 0.5640
Epoch 10/50:
Train Loss: 0.0822, Train Acc: 0.9740
Val Loss: 1.9534, Val Acc: 0.5640
Epoch 11/50:
Train Loss: 0.0744, Train Acc: 0.9772
Val Loss: 1.9595, Val Acc: 0.5607
Epoch 12/50:
Train Loss: 0.0744, Train Acc: 0.9750
Val Loss: 1.9567, Val A

In [19]:
def predict_sentiment(model, image_path, caption):
    model.eval()
    
    # Preprocess image
    image = Image.open(image_path).convert('RGB')
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    image = transform(image).unsqueeze(0).to(device)

    # Preprocess text
    encoded_text = tokenizer(caption, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask, image)
        probabilities = torch.softmax(outputs, dim=1)
        sentiment_scores = probabilities.tolist()[0]
        predicted = torch.argmax(outputs, dim=1)

    sentiment_labels = ['negative', 'neutral', 'positive']
    sentiment = sentiment_labels[predicted.item()]
    return sentiment, sentiment_scores

# Example usage
image_path = '/kaggle/input/mvsasingle/MVSA_Single/data/80.jpg'
caption = 'Grab your brain mittens twitter I\'m super ecstatic to be at work.. ??'
sentiment, scores = predict_sentiment(model, image_path, caption)
print(f"Predicted sentiment: {sentiment}")
print(f"Sentiment scores: Negative: {scores[0]:.4f}, Neutral: {scores[1]:.4f}, Positive: {scores[2]:.4f}")

Predicted sentiment: negative
Sentiment scores: Negative: 0.9877, Neutral: 0.0000, Positive: 0.0122


In [21]:
for i in range(1, 26):  # Iterate from 1 to 25
    image_path = f'/kaggle/input/mvsasingle/MVSA_Single/data/{i}.jpg'
    caption = texts[i - 1]  # texts list is 0-indexed, i starts from 1
    actual_label = labels[i]  # Assuming labels dictionary is 1-indexed

    sentiment, scores = predict_sentiment(model, image_path, caption)

    # Convert scores to percentages
    total_score = sum(scores)
    scores_percentage = [score / total_score * 100 for score in scores]

    # Print results
    print(f"Index: {i}")
    print(f"Actual Label (Image): {actual_label}")
    print(f"Actual Caption: {caption}")
    print(f"Predicted Sentiment: {sentiment}")
    print(f"Sentiment Scores: Negative: {scores_percentage[0]:.2f}%, Neutral: {scores_percentage[1]:.2f}%, Positive: {scores_percentage[2]:.2f}%")
    print("-" * 50)

Index: 1
Actual Label (Image): 0
Actual Caption: How I feel today #legday #jelly #aching #gym
Predicted Sentiment: neutral
Sentiment Scores: Negative: 0.01%, Neutral: 99.98%, Positive: 0.00%
--------------------------------------------------
Index: 2
Actual Label (Image): 1
Actual Caption: grattis min griskulting!!!???? va bara tvungen oki s? sch ? @ingenkommeratttrodig #pig #happybday #wow #lovely #cut¡­
Predicted Sentiment: neutral
Sentiment Scores: Negative: 0.05%, Neutral: 99.93%, Positive: 0.02%
--------------------------------------------------
Index: 3
Actual Label (Image): 2
Actual Caption: RT @polynminion: The moment I found my favourite tV character. #PROFOUNDLOVE
Predicted Sentiment: neutral
Sentiment Scores: Negative: 0.72%, Neutral: 99.27%, Positive: 0.01%
--------------------------------------------------
Index: 4
Actual Label (Image): 0
Actual Caption: #escort We have a young and energetic team and we pride ourselves on offering the highes #hoer
Predicted Sentiment: posi