In [1]:
!pip install torch torchvision transformers scikit-learn matplotlib pandas



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score

In [4]:
# Define dataset paths (update these paths to match your Drive structure)
ck_data_path = "/content/drive/My Drive/COSE474_DeepLearning/FinalProject_Experiment/emotion_dataset/image_dataset/"  # Path to CK+ dataset (image modality)
emotion_data_path = "/content/drive/My Drive/COSE474_DeepLearning/FinalProject_Experiment/emotion_dataset/text_emotion/"  # Path to Emotion Dataset CSVs

In [5]:
# Load Emotion Dataset CSV files
train_text_data = pd.read_csv(os.path.join(emotion_data_path, "training.csv"))
val_text_data = pd.read_csv(os.path.join(emotion_data_path, "validation.csv"))
test_text_data = pd.read_csv(os.path.join(emotion_data_path, "test.csv"))

In [6]:
# Image preprocessing for CK+ dataset
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize for CLIP compatibility
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


In [7]:
from torchvision.datasets import ImageFolder

# Load CK+ dataset
ck_dataset = ImageFolder(ck_data_path, transform=image_transform)

# Check CK+ dataset classes
print("CK+ Classes:", ck_dataset.classes)


CK+ Classes: ['anger', 'contempt', 'disgust', 'fear', 'happy', 'sadness', 'surprise']


In [8]:
class MultimodalEmotionDataset(Dataset):
    def __init__(self, text_data, image_data, labels, text_processor, image_transform):
        self.text_data = text_data
        self.image_data = image_data
        self.labels = labels
        self.text_processor = text_processor
        self.image_transform = image_transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.text_data[idx]
        image_path, label = self.image_data[idx]
        image = Image.open(image_path).convert("RGB")  # Convert image to RGB

        # Apply transformations
        image = self.image_transform(image)
        text = self.text_processor(text=[text], return_tensors="pt", padding="max_length", truncation=True, max_length=50)

        return text, image, torch.tensor(label)


In [9]:
def custom_collate_fn(batch):
    texts = {key: torch.cat([item[0][key] for item in batch], dim=0) for key in batch[0][0]}
    images = torch.stack([item[1] for item in batch])
    labels = torch.stack([item[2] for item in batch])
    return texts, images, labels


In [10]:
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

train_multimodal_data = MultimodalEmotionDataset(
    text_data=train_text_data["text"].tolist(),
    image_data=ck_dataset.samples,
    labels=[sample[1] for sample in ck_dataset.samples],
    text_processor=clip_processor,
    image_transform=image_transform
)

val_multimodal_data = MultimodalEmotionDataset(
    text_data=val_text_data["text"].tolist(),
    image_data=ck_dataset.samples,
    labels=[sample[1] for sample in ck_dataset.samples],
    text_processor=clip_processor,
    image_transform=image_transform
)

# DataLoader with custom collate function
train_loader = DataLoader(train_multimodal_data, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_multimodal_data, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

In [11]:
class MultimodalEmotionModel(nn.Module):
    def __init__(self, clip_model, num_classes):
        super(MultimodalEmotionModel, self).__init__()
        self.clip_model = clip_model
        self.fc = nn.Sequential(
            nn.Linear(1024, 512),  # Concatenated features (text + image) from CLIP model
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )

    def forward(self, text_inputs, image_inputs):
        # Get the embeddings (features) from the CLIP model
        text_features = self.clip_model.get_text_features(**text_inputs)  # Text features
        image_features = self.clip_model.get_image_features(image_inputs)  # Image features

        # Concatenate the text and image features along the feature dimension
        combined_features = torch.cat((text_features, image_features), dim=1)  # Concatenate both

        # Pass the combined features through the classifier
        return self.fc(combined_features)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
num_classes = len(ck_dataset.classes)  # Based on CK+ classes
model = MultimodalEmotionModel(clip_model, num_classes).to(device)


pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [14]:
def train_model_with_logs(model, train_loader, val_loader, epochs=5):
    results = []
    for epoch in range(epochs):
        model.train()
        train_loss, train_preds, train_labels = 0, [], []

        for batch in train_loader:
            # Unpack the batch correctly
            texts, images, labels = batch

            optimizer.zero_grad()

            # Move data to device
            images = images.to(device)
            labels = labels.to(device)
            texts = {k: v.to(device) for k, v in texts.items()}

            # Forward pass
            outputs = model(texts, images)  # outputs is now logits

            # Calculate loss
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

        # Training metrics
        train_acc = accuracy_score(train_labels, train_preds)
        train_f1 = f1_score(train_labels, train_preds, average="weighted")

        # Validation phase
        model.eval()
        val_preds, val_labels = [], []
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                texts, images, labels = batch
                images = images.to(device)
                labels = labels.to(device)
                texts = {k: v.to(device) for k, v in texts.items()}

                outputs = model(texts, images)
                val_loss += criterion(outputs, labels).item()
                val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        # Validation metrics
        val_acc = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds, average="weighted")

        # Log results
        results.append({
            "Epoch": epoch + 1,
            "Train Loss": train_loss,
            "Train Accuracy": train_acc,
            "Train F1": train_f1,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_acc,
            "Validation F1": val_f1
        })

        # Print metrics
        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, Train F1={train_f1:.4f}, "
              f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}, Val F1={val_f1:.4f}")

    # Convert results to DataFrame
    return pd.DataFrame(results)

In [15]:
# Train the model
results_df = train_model_with_logs(model, train_loader, val_loader, epochs=5)

Epoch 1: Train Loss=57.6546, Train Acc=0.2345, Train F1=0.1713, Val Loss=57.3002, Val Acc=0.1804, Val F1=0.0552
Epoch 2: Train Loss=57.1192, Train Acc=0.2202, Train F1=0.1639, Val Loss=56.7707, Val Acc=0.1804, Val F1=0.0552
Epoch 3: Train Loss=56.9639, Train Acc=0.2324, Train F1=0.1616, Val Loss=56.7863, Val Acc=0.2538, Val F1=0.1028
Epoch 4: Train Loss=55.3775, Train Acc=0.2762, Train F1=0.1900, Val Loss=58.7184, Val Acc=0.2314, Val F1=0.1711


KeyboardInterrupt: 