# Multi-Modal Data Preparation

In [160]:
# Import necessary libraries
import os
import re
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
from torch.utils.data import DataLoader
import ast


In [161]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [162]:
clean_data = pd.read_csv("dataset/dataset_transformed.csv")
clean_data.head()

Unnamed: 0,tweet_text,image_path,labels,input_ids,attention_mask,transformed_image_path
0,nigga,dataset/images/0.jpg,"[4, 1, 3]","tensor([ 101, 9152, 23033, 102, 0, ...","tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",dataset/transformed_images\0.pt
1,my horses are retarded,dataset/images/1.jpg,"[5, 5, 5]","tensor([ 101, 2026, 5194, 2024, 2128, 7559, 57...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,...",dataset/transformed_images\1.pt
2,nigga on ma momma youngboy be spitting real sh...,dataset/images/2.jpg,"[0, 0, 0]","tensor([ 101, 9152, 23033, 2006, 5003, 236...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",dataset/transformed_images\2.pt
3,rt xxsugvngxx i ran into this holy nigga today,dataset/images/3.jpg,"[1, 0, 0]","tensor([ 101, 19387, 22038, 6342, 2290, 160...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",dataset/transformed_images\3.pt
4,everybody calling you nigger now,dataset/images/4.jpg,"[1, 0, 1]","tensor([ 101, 7955, 4214, 2017, 9152, 133...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,...",dataset/transformed_images\4.pt


In [163]:
# Create directories to store tensors
os.makedirs("dataset/input_ids", exist_ok=True)
os.makedirs("dataset/attention_masks", exist_ok=True)

In [164]:
# Tokenize and save tensors
for idx in range(len(clean_data)):
    row = clean_data.iloc[idx]
    
    # Tokenize text
    inputs = tokenizer(
        row["tweet_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Remove batch dimension to get [128]
    input_ids = inputs["input_ids"].squeeze(0)
    attention_mask = inputs["attention_mask"].squeeze(0)
    
    # Save tensors to files
    torch.save(inputs["input_ids"], f"dataset/input_ids/{idx}.pt")
    torch.save(inputs["attention_mask"], f"dataset/attention_masks/{idx}.pt")
    
    # Update CSV with file paths (not tensor strings)
    clean_data.at[idx, 'input_ids'] = f"input_ids/{idx}.pt"
    clean_data.at[idx, 'attention_mask'] = f"attention_masks/{idx}.pt"

In [165]:
# Convert labels from string representation to list


def parse_labels(label_value):
    """Convert worker labels into multi-hot encoding of length 6"""
    # Convert from string to list of ints if needed
    if isinstance(label_value, str):
        cleaned = re.sub(r"[^0-9]", " ", label_value)
        parts = [int(x) for x in cleaned.split() if x.strip()]
    elif isinstance(label_value, list):
        parts = label_value
    else:
        parts = []

    # Convert to multi-hot vector
    multi_hot = [0] * 6
    for label in parts:
        if 0 <= label <= 5:
            multi_hot[label] = 1
    return multi_hot



"""
def parse_labels(label_value):
    if isinstance(label_value, list):
        return label_value  # Already correct format
    # Clean strings like "[4, 1, 3]" or "4 1 3"
    cleaned = re.sub(r"[^0-9]", " ", str(label_value))
    parts = [int(x) for x in cleaned.split() if x.strip()]
    return (parts + [0, 0, 0])[:3]  # Ensure 3 elements
""" 

clean_data["labels"] = clean_data["labels"].apply(parse_labels)
print(clean_data["labels"].head())

# Should output:
# 0    [4.0, 1.0, 3.0]
# 1    [5.0, 5.0, 5.0]
# Name: labels, dtype: object



0    [0, 1, 0, 1, 1, 0]
1    [0, 0, 0, 0, 0, 1]
2    [1, 0, 0, 0, 0, 0]
3    [1, 1, 0, 0, 0, 0]
4    [1, 1, 0, 0, 0, 0]
Name: labels, dtype: object


In [166]:
"""
# Convert labels to string format
for idx in range(len(clean_data)):
    row = clean_data.iloc[idx]
    original_labels = parse_labels(row["labels"])
    clean_data.at[idx, 'labels'] = original_labels  # Store as list [4,1,3]


# Save updated CSV
clean_data["labels"] = clean_data["labels"].apply(str)
clean_data.to_csv("dataset/dataset_transformed.csv", index=False)

"""

# Cell 6: Final CSV formatting
clean_data.to_csv("dataset/dataset_transformed.csv", index=False)

In [167]:
# Define the custom dataset class
clean_data["transformed_image_path"] = clean_data["transformed_image_path"].str.replace("\\", "/")# Add this cell after loading clean_data


In [168]:
# Creating the MultiModalDataset class
class MultiModalDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        
        # Load pre-saved tensors from files
        input_ids = torch.load(os.path.join("dataset", row["input_ids"])).squeeze(0)
        attention_mask = torch.load(os.path.join("dataset", row["attention_mask"])).squeeze(0)
        
        # Load image
        image = torch.load(row["transformed_image_path"])
        
        # Labels
        labels = torch.tensor(row["labels"], dtype=torch.float)
        
        return input_ids, attention_mask, image, labels


In [169]:
# Create the dataset
dataset = MultiModalDataset(clean_data, tokenizer)

In [170]:
# Create the DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Example: Iterate through the DataLoader
for batch in dataloader:
    input_ids, attention_mask, images, labels = batch
    print("Input IDs:", input_ids.shape)
    print("Attention Mask:", attention_mask.shape)
    print("Images:", images.shape)
    print("Labels:", labels.shape)
    break

Input IDs: torch.Size([32, 128])
Attention Mask: torch.Size([32, 128])
Images: torch.Size([32, 3, 224, 224])
Labels: torch.Size([32, 6])


In [171]:
print(pd.read_csv("dataset/dataset_transformed.csv")["labels"].head())
# Should show "[4, 1, 3]", not raw lists

0    [0, 1, 0, 1, 1, 0]
1    [0, 0, 0, 0, 0, 1]
2    [1, 0, 0, 0, 0, 0]
3    [1, 1, 0, 0, 0, 0]
4    [1, 1, 0, 0, 0, 0]
Name: labels, dtype: object


In [172]:
print(pd.read_csv("dataset/dataset_transformed.csv")[["input_ids", "attention_mask"]].head())
# Should show "input_ids/0.pt", not tensor strings

        input_ids        attention_mask
0  input_ids/0.pt  attention_masks/0.pt
1  input_ids/1.pt  attention_masks/1.pt
2  input_ids/2.pt  attention_masks/2.pt
3  input_ids/3.pt  attention_masks/3.pt
4  input_ids/4.pt  attention_masks/4.pt


In [173]:
sample = torch.load("dataset/input_ids/0.pt")
print(sample.shape)  # Should be torch.Size([128])

torch.Size([1, 128])
