<a href="https://colab.research.google.com/github/abdulwasaeee/Vision-Language-model-for-emotions/blob/main/VLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import kagglehub

In [4]:
# Fetch the FER-2013 image dataset through kagglehub; the call returns the
# local directory that holds the files.
imgpath = kagglehub.dataset_download("msambare/fer2013")

Using Colab cache for faster access to the 'fer2013' dataset.


In [5]:
imgpath

'/kaggle/input/fer2013'

In [6]:
import os
os.listdir(imgpath)

['test', 'train']

In [7]:
import shutil
import os

# Work on a private copy of the dataset so class folders can be deleted later
# without touching the downloaded original.
# Use the directory kagglehub reported instead of repeating it as a literal, so
# the copy still works when the download lands somewhere else.
src = imgpath
dst = "/kaggle/working/fer_clean"

# remove old copy if it exists, so every run starts from a complete tree
if os.path.exists(dst):
    shutil.rmtree(dst)

shutil.copytree(src, dst)

print("Copied to:", dst)
print("Train folders:", os.listdir(os.path.join(dst, "train")))


Copied to: /kaggle/working/fer_clean
Train folders: ['sad', 'fear', 'surprise', 'happy', 'neutral', 'angry', 'disgust']


In [8]:
# Emotion classes that are dropped from both the train and the test split.
remove = ["disgust", "surprise"]

trainroot = os.path.join(dst, "train")
testroot  = os.path.join(dst, "test")

for cls in remove:
    for splitroot in (trainroot, testroot):
        clsdir = os.path.join(splitroot, cls)
        # Only delete folders that still exist, so re-running this cell after
        # a previous run does not raise FileNotFoundError.
        if os.path.isdir(clsdir):
            shutil.rmtree(clsdir)


In [9]:
os.listdir(trainroot)

['sad', 'fear', 'happy', 'neutral', 'angry']

In [10]:
from torchvision import transforms

# Preprocessing applied to every image: convert to one grayscale channel,
# force a 48x48 size, then turn the image into a tensor.
transform = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize(size=(48, 48)),
        transforms.ToTensor(),
    ]
)


In [11]:
from torchvision.datasets import ImageFolder

# One ImageFolder per split, both reading from the cleaned copy and sharing
# the same preprocessing pipeline.
traindataset = ImageFolder(os.path.join(dst, "train"), transform=transform)

testdataset = ImageFolder(os.path.join(dst, "test"), transform=transform)


In [12]:
# Pull the first (image, label) pair so its class index and tensor shape can
# be inspected in the following cells.
img, label= traindataset[0]

In [13]:
traindataset.classes

['angry', 'fear', 'happy', 'neutral', 'sad']

In [14]:
label

0

In [15]:
img.shape

torch.Size([1, 48, 48])

In [16]:
# Fetch the text emotion dataset through kagglehub; the call returns the local
# directory that holds the extracted files.
textpath = kagglehub.dataset_download("abdallahwagih/emotion-dataset")

Downloading from https://www.kaggle.com/api/v1/datasets/download/abdallahwagih/emotion-dataset?dataset_version_number=1...


100%|██████████| 213k/213k [00:00<00:00, 401kB/s]

Extracting files...





In [17]:
# File names found in the downloaded text-dataset directory.
# NOTE(review): `tp` is not read again in the visible cells — confirm whether
# it is still needed.
tp =os.listdir(textpath)

In [18]:
import pandas as pd
import numpy as np

In [19]:
# Load the labelled comments into a DataFrame.
# NOTE(review): the preview below shows an "Unnamed: 0" column, i.e. the CSV
# carries a saved index; passing index_col=0 would avoid the extra column.
textdata= pd.read_csv(os.path.join(textpath, 'Emotion_classify_Data.csv'))

In [20]:
textdata[:5]

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [21]:
textdata.Emotion.unique()

array(['fear', 'anger', 'joy'], dtype=object)

In [22]:
# Integer ids for the three emotions present in the text dataset.
# The ids 0/1/2 coincide with 'angry'/'fear'/'happy' in the image class list
# printed earlier, which the later fusion step relies on.
emotionmap= {'anger':0, "fear": 1, "joy": 2}

In [23]:
# Add an integer `label` column by mapping each emotion string through
# `emotionmap`, then preview the first five rows.
textdata["label"]=textdata.Emotion.map(emotionmap)
textdata[:5]

Unnamed: 0,Comment,Emotion,label
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,2
4,i feel suspicious if there is no one outside l...,fear,1


In [24]:
# Normalise column names: 'Comment' -> 'text', 'Emotion' -> 'emotion'.
# Later cells read `textdata.text` and `textdata.label`.
textdata= textdata.rename(columns={'Comment':"text", "Emotion": "emotion"})

In [25]:
textdata

Unnamed: 0,text,emotion,label
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,2
4,i feel suspicious if there is no one outside l...,fear,1
...,...,...,...
5932,i begun to feel distressed for you,fear,1
5933,i left feeling annoyed and angry thinking that...,anger,0
5934,i were to ever get married i d have everything...,joy,2
5935,i feel reluctant in applying there because i w...,fear,1


In [26]:
import torch
import torch.nn as nn

class VisionModel(nn.Module):
    """Small convolutional classifier for single-channel images.

    Two conv -> ReLU -> max-pool stages feed a two-layer fully connected head.
    The first linear layer expects 64 * 12 * 12 features, which is what the
    two 2x poolings produce from a 48x48 input.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes=5):
        super().__init__()

        # The position of each layer inside the Sequential determines its
        # state-dict key, so the layer order matters for saved checkpoints.
        conv_stack = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        flat_features = 64 * 12 * 12
        head = [
            nn.Flatten(start_dim=1),
            nn.Linear(flat_features, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch `x`."""
        return self.classifier(self.features(x))


In [27]:
from torch.utils.data import DataLoader

# Pinned host memory only speeds up copies to a CUDA device. Requesting it on
# a CPU-only runtime just makes DataLoader emit a warning, so enable it only
# when CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()

# Training batches are reshuffled every epoch.
trainloader = DataLoader(
    traindataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,
    pin_memory=use_pinned_memory
)

# Evaluation batches keep the dataset order.
testloader = DataLoader(
    testdataset,
    batch_size=64,
    shuffle=False,
    num_workers=2,
    pin_memory=use_pinned_memory
)


In [28]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Vision classifier, moved to the selected device.
vm = VisionModel()
vm = vm.to(device)

# Cross-entropy on the raw logits, optimised with Adam.
lossfn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=vm.parameters(), lr=1e-3)


In [29]:
device

device(type='cuda')

In [30]:
# Train the vision model for 10 epochs. After each epoch, print the mean batch
# loss and the accuracy accumulated over the training batches of that epoch.
for epoch in range(10):
    vm.train()
    totalloss = 0
    correct = 0
    total = 0

    for images, labels in trainloader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = vm(images)
        loss = lossfn(outputs, labels)

        loss.backward()
        optimizer.step()

        totalloss += loss.item()

        # Predicted class = index of the largest logit.
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    # Computed once per epoch (not once per batch). The guards keep the
    # summary line well-defined even if the loader yields no batches.
    train_acc = correct / total if total else 0.0
    mean_loss = totalloss / max(len(trainloader), 1)
    print(f"Epoch {epoch+1}: Loss={mean_loss:.4f}, Train Acc={train_acc:.4f}")


Epoch 1: Loss=1.5151, Train Acc=0.3403
Epoch 2: Loss=1.3709, Train Acc=0.4340
Epoch 3: Loss=1.3057, Train Acc=0.4645
Epoch 4: Loss=1.2482, Train Acc=0.4890
Epoch 5: Loss=1.2032, Train Acc=0.5111
Epoch 6: Loss=1.1569, Train Acc=0.5302
Epoch 7: Loss=1.1110, Train Acc=0.5578
Epoch 8: Loss=1.0583, Train Acc=0.5780
Epoch 9: Loss=1.0080, Train Acc=0.6018
Epoch 10: Loss=0.9513, Train Acc=0.6314


In [31]:
# Top-1 accuracy of the vision model on the test split, with gradient
# tracking switched off.
vm.eval()
correct, total = 0, 0

with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)

        outputs = vm(images)
        preds = outputs.argmax(dim=1)

        total += labels.size(0)
        correct += (preds == labels).sum().item()

test_acc = correct / total
print(f"Test Accuracy: {test_acc:.4f}")


Test Accuracy: 0.5115


In [32]:
# Persist only the vision model's parameters (state dict); the reload cell
# rebuilds the architecture before loading them.
torch.save(vm.state_dict(), "/kaggle/working/visionmodel.pt")


In [33]:
# Plain Python lists of the sentences and their integer class ids.
texts = textdata["text"].tolist()
labels = textdata["label"].tolist()

In [35]:
from transformers import AutoTokenizer

# Tokenizer for the same "distilbert-base-uncased" checkpoint the text model
# uses as its encoder.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the whole corpus in one call: sequences are cut at 64 tokens,
# padded to a common length and returned as PyTorch tensors.
encodings = tokenizer(
    texts,
    max_length=64,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Class ids as an int64 tensor, as expected by CrossEntropyLoss.
labels = torch.tensor(labels, dtype=torch.long)

In [36]:
from torch.utils.data import TensorDataset, DataLoader

# Bundle token ids, attention masks and labels so each sample is one triple.
text_tensors = (encodings["input_ids"], encodings["attention_mask"], labels)
dataset = TensorDataset(*text_tensors)

# Shuffled mini-batches of 32 sentences over the full text dataset.
loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)


In [37]:
from transformers import AutoModel

class TextModel(nn.Module):
    """Pretrained "distilbert-base-uncased" encoder with a small classifier head.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes=3):
        super().__init__()

        self.encoder = AutoModel.from_pretrained("distilbert-base-uncased")

        # The head width follows the encoder's configured hidden size.
        hidden_dim = self.encoder.config.hidden_size
        head_layers = [
            nn.Linear(hidden_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the first-token representation."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Hidden state at sequence position 0 (the CLS token) summarises the input.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token)


In [38]:
import torch.optim as optim
import torch.nn as nn
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tm = TextModel(num_classes=3).to(device)

criterion = nn.CrossEntropyLoss()
# NOTE(review): this rebinds the notebook-level `optimizer` that the vision
# training cell also uses; re-run the vision setup cell before retraining the CNN.
optimizer = optim.Adam(tm.parameters(), lr=2e-5)


# Fine-tune the text model for 4 epochs. The reported accuracy is measured on
# the same batches that are trained on, because `loader` wraps the full text
# dataset.
for epoch in range(4):
    tm.train()
    total_loss = 0
    correct = 0
    total = 0

    # The per-batch labels get their own name so the notebook-level `labels`
    # tensor (used to build `dataset`) is not overwritten by the last batch.
    for input_ids, attention_mask, batch_labels in loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()

        outputs = tm(input_ids, attention_mask)
        loss = criterion(outputs, batch_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Predicted class = index of the largest logit.
        preds = outputs.argmax(dim=1)
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)

    acc = correct / total
    print(f"Epoch {epoch+1}: Loss={total_loss/len(loader):.4f}, Acc={acc:.4f}")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch 1: Loss=0.4581, Acc=0.8108
Epoch 2: Loss=0.0881, Acc=0.9695
Epoch 3: Loss=0.0492, Acc=0.9820
Epoch 4: Loss=0.0327, Acc=0.9891


In [39]:
# Persist only the text model's parameters (state dict); the reload cell
# rebuilds the architecture before loading them.
torch.save(tm.state_dict(), "/kaggle/working/textmodel.pt")

In [40]:
# Fresh vision model restored from the saved checkpoint, in inference mode.
vm2 = VisionModel(num_classes=5).to(device)
vm2.load_state_dict(
    state_dict=torch.load("/kaggle/working/visionmodel.pt", map_location=device)
)
vm2.eval()

# Fresh text model restored from the saved checkpoint, in inference mode.
# The final expression is what the notebook displays as the cell output.
tm2 = TextModel(num_classes=3).to(device)
tm2.load_state_dict(
    state_dict=torch.load("/kaggle/working/textmodel.pt", map_location=device)
)
tm2.eval()

TextModel(
  (encoder): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [47]:
import torch.nn.functional as F

# Class names in the order of the vision model's output logits.
EMOTIONS = ["angry", "fear", "happy", "neutral", "sad"]


# Text-model class index -> index of the matching entry in EMOTIONS.
TEXT_TO_GLOBAL = {
    0: 0,  # anger → angry
    1: 1,  # fear  → fear
    2: 2   # joy   → happy
}

def fuse_vision_text(vision_logits, text_logits, alpha=0.6, text_to_global=None):
    """Late-fuse vision and text predictions into one class index per sample.

    Both logit tensors are turned into probabilities with a softmax. The text
    probabilities are scattered into the vision label space (classes the text
    model cannot predict stay at zero), and the two distributions are blended
    as ``alpha * vision + (1 - alpha) * text``.

    Args:
        vision_logits: (B, C) logits over the global classes (C = 5 for EMOTIONS).
        text_logits:   (B, T) logits over the text classes (T = 3 here).
        alpha: weight given to the vision probabilities; the text side
            receives ``1 - alpha``.
        text_to_global: optional mapping from text class index to global class
            index. Defaults to the module-level ``TEXT_TO_GLOBAL``.

    Returns:
        Tensor of shape (B,) holding the index of the highest fused score.
    """
    if text_to_global is None:
        text_to_global = TEXT_TO_GLOBAL

    vision_prob = F.softmax(vision_logits, dim=-1)
    text_prob   = F.softmax(text_logits, dim=-1)

    # Size the buffer from the vision output instead of a fixed 5, so a
    # classifier with a different number of classes can be fused as well.
    # It keeps the text dtype so the probabilities are stored without a cast.
    text_global = torch.zeros(
        vision_prob.shape, dtype=text_prob.dtype, device=vision_prob.device
    )

    for t_idx, g_idx in text_to_global.items():
        text_global[:, g_idx] = text_prob[:, t_idx]

    fused = alpha * vision_prob + (1 - alpha) * text_global
    emotion_id = torch.argmax(fused, dim=-1)

    return emotion_id


In [50]:
# Example inputs
text = "I feel very sad today"
image = torch.randn(1, 1, 48, 48).to(device)  # replace with real image later

# Tokenize text
enc = tokenizer(
    text,
    max_length=64,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

input_ids, attention_mask = enc["input_ids"].to(device), enc["attention_mask"].to(device)

# Run models (inference only, so no gradients are tracked)
with torch.no_grad():
    vision_logits = vm2(image)
    text_logits   = tm2(input_ids, attention_mask)

# Fuse: the text branch can only add weight to the classes listed in
# TEXT_TO_GLOBAL; the remaining classes are scored by the vision branch alone.
emotion_id = fuse_vision_text(vision_logits, text_logits)

# The batch holds a single sample, so the prediction is one scalar index.
emotion = EMOTIONS[emotion_id.item()]
print("Predicted emotion:", emotion)


Predicted emotion: sad
