In [2]:
!pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [1]:
import torch
print(torch.__version__)
print(torch.version.cuda)


2.8.0+cu126
12.6


In [3]:
import torch
import torch_geometric
print(torch.__version__)
print(torch_geometric.__version__)

2.8.0+cu126
2.6.1


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import GATConv
from torch_geometric.nn import GCNConv
import numpy as np
import pandas as pd
import os

In [6]:
# ---------- Graph Attention for Landmarks ----------
class LandmarkGraphEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, heads=4):
        super().__init__()
        self.gcn = GCNConv(in_dim, hidden_dim)
        self.gat = GATConv(hidden_dim, out_dim, heads=heads, concat=False)

    def forward(self, x, edge_index):
        x = self.gcn(x, edge_index)       # GCN feature propagation
        x = F.relu(x)
        x = self.gat(x, edge_index)       # GAT learns attention across landmarks
        return x


# ---------- Audio Encoder ----------
class AudioEncoder(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.fc = nn.Linear(in_dim, out_dim)
    def forward(self, x):
        return F.relu(self.fc(x))  # [batch, out_dim]

# ---------- Cross Attention ----------
class CrossAttention(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
    def forward(self, audio_emb, landmark_embs):
        # audio_emb: [batch, d_model]
        # landmark_embs: [batch, num_nodes, d_model]
        Q = self.query(audio_emb).unsqueeze(1)  # [batch, 1, d_model]
        K = self.key(landmark_embs)             # [batch, num_nodes, d_model]
        V = self.value(landmark_embs)           # [batch, num_nodes, d_model]

        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (K.size(-1) ** 0.5)
        attn_weights = F.softmax(attn_scores, dim=-1)   # [batch, 1, num_nodes]
        out = torch.matmul(attn_weights, V)             # [batch, 1, d_model]
        return out.squeeze(1)  # [batch, d_model]

In [7]:
# ---------- Final Model ----------
class AudioVisualEmotionModel(nn.Module):
    def __init__(self, node_feat_dim, audio_feat_dim, hidden_dim, num_classes):
        super().__init__()
        self.landmark_gat = LandmarkGraphEncoder(node_feat_dim, hidden_dim, hidden_dim)
        self.audio_encoder = AudioEncoder(audio_feat_dim, hidden_dim)
        self.cross_attention = CrossAttention(hidden_dim)
        self.bilstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, landmark_feats, adj, audio_feats):
        """
        landmark_feats: [batch, num_frames, num_nodes, node_feat_dim]
        adj: [num_nodes, num_nodes]
        audio_feats: [batch, num_frames, audio_feat_dim]
        """
        batch, num_frames, num_nodes, _ = landmark_feats.size()

        frame_embeddings = []
        for t in range(num_frames):
            lf = landmark_feats[:, t]          # [batch, num_nodes, node_feat_dim]
            af = audio_feats[:, t]             # [batch, audio_feat_dim]

            lf = self.landmark_gat(lf, adj)    # [batch, num_nodes, hidden_dim]
            af = self.audio_encoder(af)        # [batch, hidden_dim]

            fused = self.cross_attention(af, lf)  # [batch, hidden_dim]
            frame_embeddings.append(fused)

        frame_embeddings = torch.stack(frame_embeddings, dim=1)  # [batch, num_frames, hidden_dim]

        lstm_out, _ = self.bilstm(frame_embeddings)  # [batch, num_frames, hidden_dim*2]
        pooled = torch.mean(lstm_out, dim=1)        # [batch, hidden_dim*2]

        out = self.fc(pooled)  # [batch, num_classes]
        return out

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# from torch_geometric.data import Data, Batch  # PyG for graphs
# import pickle
# import os

# # ---------------------------
# # 1. Dataset
# # ---------------------------
# class RAVDESSDataset(Dataset):
#     def __init__(self, data_dir, labels):
#         """
#         mfcc_dir: folder containing per-video MFCC tensors
#         landmark_dir: folder containing pickled landmark frames per video
#         labels: list of integers (emotion classes)
#         """
#         self.data_dir = data_dir
#         self.transform = transform
#         self.labels = pd.read_csv(labels_csv)

#         # --- Map emotions to numeric labels ---
#         self.emotion_to_id = {
#             'neutral': 0,
#             'calm': 1,
#             'happy': 2,
#             'sad': 3,
#             'angry': 4,
#             'fearful': 5,
#             'disgust': 6,
#             'surprised': 7
#         }

#         # Apply numeric mapping
#         self.labels["emotion_label"] = self.labels["emotion_label"].map(self.emotion_to_id)

#     def __len__(self):
#         return len(self.mfcc_files)


#     def get_anatomical_edge_list():
#       """
#       Creates edge connections for 68 facial landmarks following anatomical structure.
#       Returns list of tuples (source, destination) representing edges.
#       """
#       edge_index = []

#       # right iris circumference (8 points: 0-7) - sequential connections
#       for i in range(7):
#           edge_index.append((i, i + 1))
#       edge_index.append((7, 0))

#       # right eye boundary (12 points: 8-19) - sequential connections
#       for i in range(8, 19):
#           edge_index.append((i, i + 1))
#       edge_index.append((19, 8))

#       # right pupil circumference (8 points: 20-27) - sequential connections
#       for i in range(20, 27):
#           edge_index.append((i, i + 1))
#       edge_index.append((27, 20))

#       # left iris circumference (8 points: 28-35) - sequential connections
#       for i in range(28, 35):
#           edge_index.append((i, i + 1))
#       edge_index.append((35, 28))

#       # left eye boundary (12 points: 36-47) - form a closed loop
#       for i in range(36, 47):
#           edge_index.append((i, i + 1))
#       edge_index.append((41, 36))  # Close the loop

#       # Left pupil circumference (8 points: 48-55) - form a closed loop
#       for i in range(48, 55):
#           edge_index.append((i, i + 1))
#       edge_index.append((55, 48))  # Close the loop

#       # jawline (17 points: 56-72) - form a closed loop
#       for i in range(56, 72):
#           edge_index.append((i, i + 1))
#       edge_index.append((72, 56))  # Close the loop

#       # right eyebrow (5 points: 73-77) - form a closed loop
#       for i in range(73, 77):
#           edge_index.append((i, i + 1))
#       edge_index.append((77, 73))  # Close the loop

#       # left eyebrow (points: 78-82)
#       for i in range(78, 82):
#           edge_index.append((i, i + 1))
#       edge_index.append((82, 78))  # Close the loop

#       # nose bridge(83-86) + lower nose (87-91)
#       for i in range(83, 91):
#           edge_index.append((i, i + 1))
#       edge_index.append((91, 83))  # Close the loop

#       # right eye boundary 68 dlib marks(92-97)
#       for i in range(92, 97):
#           edge_index.append((i, i + 1))
#       edge_index.append((97, 92))  # Close the loop

#       # left eye boundary 68 dlib marks(98-103)
#       for i in range(98, 103):
#           edge_index.append((i, i + 1))
#       edge_index.append((103, 98))  # Close the loop

#       # inner mouth (104-115)
#       for i in range(104, 115):
#           edge_index.append((i, i + 1))
#       edge_index.append((115, 104))  # Close the loop

#       # outer mouth (116-123)
#       for i in range(116, 123):
#           edge_index.append((i, i + 1))
#       edge_index.append((123, 116))

#       #----------------YET TO DECIDE----------------
#       # Inter-regional connections (connect major facial regions)
#       # edge_index.append((30, 48))  # Nose tip to mouth center
#       # edge_index.append((27, 39))  # Nose bridge to right eye
#       # edge_index.append((27, 42))  # Nose bridge to left eye
#       # edge_index.append((21, 39))  # Right eyebrow to right eye
#       # edge_index.append((22, 42))  # Left eyebrow to left eye
#       # ---------------- INTER-REGIONAL CONNECTIONS ----------------

#       # Nose bridge to eyes
#       edge_index += [
#           (83, 92),  # upper nose bridge to right eye inner corner
#           (83, 98),  # upper nose bridge to left eye inner corner
#       ]

#       # Nose tip to mouth region
#       edge_index += [
#           (91, 110),  # nose tip to upper lip center
#           (91, 120),  # nose tip to lower lip center
#       ]

#       # Eyebrows to eyes
#       edge_index += [
#           (73, 92),  # right eyebrow to right eye outer corner
#           (77, 97),  # right eyebrow inner end to right eye inner corner
#           (78, 98),  # left eyebrow inner end to left eye inner corner
#           (82, 103), # left eyebrow outer end to left eye outer corner
#       ]

#       # Jawline to mouth corners
#       edge_index += [
#           (56, 116),  # right jaw to right mouth corner
#           (72, 123),  # left jaw to left mouth corner
#       ]

#       # Cross connections for symmetry (eyes ↔ opposite eyebrows)
#       edge_index += [
#           (92, 78),   # right eye to left eyebrow
#           (98, 73),   # left eye to right eyebrow
#       ]

#       # Optional central connections for stability
#       edge_index += [
#           (83, 91),   # along the nose (bridge to tip)
#           (110, 120), # upper to lower lip
#       ]


#       return edge_index

# # # Test the function
# # anatomical_edges = get_anatomical_edge_list()
# # print(f"Total number of edges: {len(anatomical_edges)}")
# # print("First 10 edges:", anatomical_edges[:10])
# # print("Last 5 edges:", anatomical_edges[-5:])

# # # Verify all indices are within valid range (0-67)
# # max_index = max(max(edge) for edge in anatomical_edges)
# # min_index = min(min(edge) for edge in anatomical_edges)
# # print(f"Index range: {min_index} to {max_index} (should be 0 to 67)")


#     def __getitem__(self, idx):
#         file_path = os.path.join(self.data_dir, self.files[idx])               # [num_frames, mfcc_dim]

#         # Load landmark frames (list of [num_nodes, 2])
#         with open(file_path, "rb") as f:
#             video_data = pickle.load(f)                 # list of frames

#         # Convert to PyG Data objects
#         frame_graphs,mfcc_frames = [],[]
#         edge_index = get_anatomical_edge_list()  # your edge connection logic here
#         edge_index = torch.tensor(edge_index, dtype=torch.long)
#         for frame in video_data:
#             landmarks = np.array(frame['landmarks'], dtype=np.float32)
#             mfcc = np.array(frame['mfcc'], dtype=np.float32)

#             x = torch.tensor(landmarks, dtype=torch.float)
#             # edge_index = get_anatomical_edge_list()  # your edge connection logic here
#             # edge_index = torch.tensor(edge_index, dtype=torch.long)

#             data = Data(x=x, edge_index=edge_index)
#             frame_graphs.append(data)
#             mfcc_frames.append(mfcc)

#         mfcc_tensor = torch.tensor(np.stack(mfcc_frames), dtype=torch.float)  # [num_frames, 40]
#         label = torch.tensor(self.emotion_to_idx[self.emotions[idx]], dtype=torch.long)
#         return mfcc_tensor, frame_graphs, label

# # ---------------------------
# # 2. Training / Validation loops
# # ---------------------------
# def train_epoch(model, loader, criterion, optimizer, device):
#     model.train()
#     total_loss = 0.0
#     for mfcc_batch, landmark_batch_list, labels in loader:
#         labels = labels.to(device)
#         # mfcc_batch: list of tensors with shape [num_frames, mfcc_dim]
#         # landmark_batch_list: list of lists of PyG Data objects per video

#         batch_size = len(mfcc_batch)
#         num_frames = mfcc_batch[0].size(0)

#         # Stack MFCC into [batch, num_frames, mfcc_dim]
#         mfcc_batch = torch.stack([torch.tensor(f, dtype=torch.float) for f in mfcc_batch]).to(device)

#         # Forward pass
#         outputs = []
#         for i in range(batch_size):
#             video_landmarks = landmark_batch_list[i]       # list of Data objects
#             adj = None                                     # using edge_index inside PyG Data
#             video_output = model(mfcc_batch[i].unsqueeze(0), adj, video_landmarks)
#             outputs.append(video_output)

#         outputs = torch.cat(outputs, dim=0)
#         loss = criterion(outputs, labels)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item() * batch_size

#     return total_loss / len(loader.dataset)

# def validate_epoch(model, loader, criterion, device):
#     model.eval()
#     total_loss = 0.0
#     correct = 0
#     with torch.no_grad():
#         for mfcc_batch, landmark_batch_list, labels in loader:
#             labels = labels.to(device)
#             batch_size = len(mfcc_batch)
#             num_frames = mfcc_batch[0].size(0)

#             mfcc_batch = torch.stack([torch.tensor(f, dtype=torch.float) for f in mfcc_batch]).to(device)

#             outputs = []
#             for i in range(batch_size):
#                 video_landmarks = landmark_batch_list[i]
#                 adj = None
#                 video_output = model(mfcc_batch[i].unsqueeze(0), adj, video_landmarks)
#                 outputs.append(video_output)

#             outputs = torch.cat(outputs, dim=0)
#             total_loss += criterion(outputs, labels).item() * batch_size
#             _, predicted = torch.max(outputs, 1)
#             correct += (predicted == labels).sum().item()

#     accuracy = correct / len(loader.dataset)
#     return total_loss / len(loader.dataset), accuracy

# # ---------------------------
# # 3. Main training script
# # ---------------------------
# def main():
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     # Paths to your preprocessed MFCC and landmark data
#     data_dir = "/content/drive/MyDrive/RAVDESS/Landmarks_WithAudio"
#     labels_df = pd.read_csv("/content/drive/MyDrive/RAVDESS/emotion_labels.csv")

#     # Labels: list of integers per video
#     labels = [...]  # fill in your labels

#     # Dataset / Dataloader
#     dataset = RAVDESSDataset(mfcc_dir, landmark_dir, labels)
#     train_size = int(0.8 * len(dataset))
#     val_size = len(dataset) - train_size
#     train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

#     train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

#     # Model
#     from your_model_file import AudioVisualEmotionModel, LandmarkGraphEncoder, AudioEncoder, CrossAttention
#     model = AudioVisualEmotionModel(node_feat_dim=2, audio_feat_dim=40, hidden_dim=128, num_classes=8)
#     model.to(device)

#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=1e-3)
#     num_epochs = 50
#     best_val_acc = 0.0

#     for epoch in range(num_epochs):
#         train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
#         val_loss, val_acc = validate_epoch(model, val_loader, criterion, device)
#         print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f} "
#               f"| Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

#         if val_acc > best_val_acc:
#             best_val_acc = val_acc
#             torch.save(model.state_dict(), "best_model.pth")

# if __name__ == "__main__":
#     main()


In [None]:
# import os
# import pickle
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt


# import torch
# from torch.utils.data import Dataset, DataLoader
# from torch_geometric.data import Data

# # ---------------------------
# # 1. Dataset
# # ---------------------------
# class RAVDESSDataset(Dataset):
#     def __init__(self, data_dir, labels_csv):
#         """
#         data_dir: folder containing per-video pickled data with 'landmarks' and 'mfcc'
#         labels_csv: CSV file mapping filename -> emotion_label
#         """
#         self.data_dir = data_dir
#         self.labels_df = pd.read_csv(labels_csv)
#         self.files = sorted(os.listdir(data_dir))

#         # Map emotion to numeric labels
#         self.emotion_to_id = {
#             'neutral': 0,
#             'calm': 1,
#             'happy': 2,
#             'sad': 3,
#             'angry': 4,
#             'fearful': 5,
#             'disgust': 6,
#             'surprised': 7
#         }

#         self.labels_df["emotion_label"] = self.labels_df["emotion_label"].map(self.emotion_to_id)
#         self.filename_to_label = dict(zip(self.labels_df["filename"], self.labels_df["emotion_label"]))

#         # Precompute fixed anatomical edge index
#         self.edge_index = torch.tensor(self.get_anatomical_edge_list(), dtype=torch.long)

#     def __len__(self):
#         return len(self.files)

#     def get_anatomical_edge_list(self):
#         edge_index = []

#         # Right iris
#         for i in range(7): edge_index.append((i, i + 1))
#         edge_index.append((7, 0))

#         # Right eye boundary
#         for i in range(8, 19): edge_index.append((i, i + 1))
#         edge_index.append((19, 8))

#         # Right pupil
#         for i in range(20, 27): edge_index.append((i, i + 1))
#         edge_index.append((27, 20))

#         # Left iris
#         for i in range(28, 35): edge_index.append((i, i + 1))
#         edge_index.append((35, 28))

#         # Left eye boundary
#         for i in range(36, 47): edge_index.append((i, i + 1))
#         edge_index.append((41, 36))

#         # Left pupil
#         for i in range(48, 55): edge_index.append((i, i + 1))
#         edge_index.append((55, 48))

#         # Jawline
#         for i in range(56, 72): edge_index.append((i, i + 1))
#         edge_index.append((72, 56))

#         # Right eyebrow
#         for i in range(73, 77): edge_index.append((i, i + 1))
#         edge_index.append((77, 73))

#         # Left eyebrow
#         for i in range(78, 82): edge_index.append((i, i + 1))
#         edge_index.append((82, 78))

#         # Nose bridge + lower nose
#         for i in range(83, 91): edge_index.append((i, i + 1))
#         edge_index.append((91, 83))

#         # Inner mouth
#         for i in range(104, 115): edge_index.append((i, i + 1))
#         edge_index.append((115, 104))

#         # Outer mouth
#         for i in range(116, 123): edge_index.append((i, i + 1))
#         edge_index.append((123, 116))

#         # Additional anatomical connections
#         edge_index += [(83, 92),(83, 98),(91, 110),(91, 120),(73, 92),(77, 97),(78, 98),(82, 103),
#                        (56, 116),(72, 123),(92, 78),(98, 73),(83, 91),(110, 120)]
#         return edge_index

#     def __getitem__(self, idx):
#         filename = self.files[idx]
#         file_path = os.path.join(self.data_dir, filename)

#         with open(file_path, "rb") as f:
#             video_data = pickle.load(f)

#         frame_graphs = []
#         mfcc_frames = []

#         for frame in video_data:
#             landmarks = np.array(frame['landmarks'], dtype=np.float32)
#             # --- Center landmarks per frame ---
#             landmarks -= landmarks.mean(axis=0)

#             mfcc = np.array(frame['mfcc'], dtype=np.float32)
#             mfcc_frames.append(mfcc)

#             x = torch.tensor(landmarks, dtype=torch.float)
#             data = Data(x=x, edge_index=self.edge_index)
#             frame_graphs.append(data)

#         # --- Normalize MFCC per video ---
#         mfcc_tensor = torch.tensor(np.stack(mfcc_frames), dtype=torch.float)
#         mfcc_tensor = (mfcc_tensor - mfcc_tensor.mean(axis=0)) / (mfcc_tensor.std(axis=0) + 1e-6)

#         label = torch.tensor(self.filename_to_label[filename], dtype=torch.long)
#         return mfcc_tensor, frame_graphs, label

# # ---------------------------
# # 2. Stratified split helper
# # ---------------------------
# def stratified_split(labels_csv, test_size=0.2, random_state=42):
#     df = pd.read_csv(labels_csv)
#     train_files, val_files = train_test_split(
#         df['filename'],
#         test_size=test_size,
#         stratify=df['emotion_label'],
#         random_state=random_state
#     )
#     return train_files.tolist(), val_files.tolist()

# # ---------------------------
# # 3. Training / Validation loops (unchanged)
# # ---------------------------
# def train_epoch(model, loader, criterion, optimizer, device):
#     model.train()
#     total_loss = 0.0
#     for mfcc_batch, landmark_batch_list, labels in loader:
#         labels = labels.to(device)
#         batch_size = len(mfcc_batch)
#         mfcc_batch = torch.stack([torch.tensor(f, dtype=torch.float) for f in mfcc_batch]).to(device)

#         outputs = []
#         for i in range(batch_size):
#             video_landmarks = landmark_batch_list[i]
#             adj = None
#             video_output = model(mfcc_batch[i].unsqueeze(0), adj, video_landmarks)
#             outputs.append(video_output)

#         outputs = torch.cat(outputs, dim=0)
#         loss = criterion(outputs, labels)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item() * batch_size

#     return total_loss / len(loader.dataset)

# def validate_epoch(model, loader, criterion, device):
#     model.eval()
#     total_loss = 0.0
#     all_labels = []
#     all_preds = []

#     with torch.no_grad():
#         for mfcc_batch, landmark_batch_list, labels in loader:
#             labels = labels.to(device)
#             batch_size = len(mfcc_batch)

#             mfcc_batch = torch.stack([torch.tensor(f, dtype=torch.float) for f in mfcc_batch]).to(device)

#             outputs = []
#             for i in range(batch_size):
#                 video_landmarks = landmark_batch_list[i]
#                 adj = None
#                 video_output = model(mfcc_batch[i].unsqueeze(0), adj, video_landmarks)
#                 outputs.append(video_output)

#             outputs = torch.cat(outputs, dim=0)
#             total_loss += criterion(outputs, labels).item() * batch_size
#             _, predicted = torch.max(outputs, 1)

#             all_labels.extend(labels.cpu().numpy())
#             all_preds.extend(predicted.cpu().numpy())

#     accuracy = sum([p==t for p,t in zip(all_preds, all_labels)]) / len(all_labels)
#     return total_loss / len(loader.dataset), accuracy, all_labels, all_preds


# # ---------------------------
# # 4. Main training script
# # ---------------------------
# def main():
#     import torch.optim as optim
#     from your_model_file import AudioVisualEmotionModel  # replace with your actual import

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     data_dir = "/content/drive/MyDrive/RAVDESS/Landmarks_WithAudio"
#     labels_csv = "/content/drive/MyDrive/RAVDESS/emotion_labels.csv"

#     # Stratified split
#     train_files, val_files = stratified_split(labels_csv)

#     # Custom dataset filtering by split
#     full_dataset = RAVDESSDataset(data_dir, labels_csv)
#     train_dataset = torch.utils.data.Subset(full_dataset, [full_dataset.files.index(f) for f in train_files])
#     val_dataset = torch.utils.data.Subset(full_dataset, [full_dataset.files.index(f) for f in val_files])

#     train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

#     model = AudioVisualEmotionModel(node_feat_dim=2, audio_feat_dim=40, hidden_dim=128, num_classes=8)
#     model.to(device)

#     criterion = torch.nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=1e-3)
#     num_epochs = 50
#     best_val_acc = 0.0

#     for epoch in range(num_epochs):
#         train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
#         val_loss, val_acc = validate_epoch(model, val_loader, criterion, device)
#         print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
#         # During your main loop after validation:
#         val_loss, val_acc, true_labels, pred_labels = validate_epoch(model, val_loader, criterion, device)
#         print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.4f}")

#         # Confusion matrix
#         cm = confusion_matrix(true_labels, pred_labels)
#         disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(dataset.emotion_to_id.keys()))
#         disp.plot(cmap=plt.cm.Blues)
#         plt.show()


#         if val_acc > best_val_acc:
#             best_val_acc = val_acc
#             torch.save(model.state_dict(), "best_model.pth")

# if __name__ == "__main__":
#     main()


In [8]:
# # ---------------------------
# # Imports
# # ---------------------------
# import os
# import pickle
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader, Subset
# from torch_geometric.data import Data
# from torch_geometric.nn import GCNConv, GATConv
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

# # ---------------------------
# # 1. Dataset
# # ---------------------------
# class RAVDESSDataset(Dataset):
#     def __init__(self, data_dir, labels_csv):
#         self.data_dir = data_dir
#         self.labels_df = pd.read_csv(labels_csv)
#         self.files = sorted(os.listdir(data_dir))

#         # Map emotions to numeric labels
#         self.emotion_to_id = {
#             'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
#             'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
#         }
#         self.labels_df["emotion_label"] = self.labels_df["emotion_label"].map(self.emotion_to_id)
#         self.filename_to_label = dict(zip(self.labels_df["filename"], self.labels_df["emotion_label"]))

#         # Precompute anatomical edge index
#         self.edge_index = torch.tensor(self.get_anatomical_edge_list(), dtype=torch.long)

#     def __len__(self):
#         return len(self.files)

#     def get_anatomical_edge_list(self):
#         edge_index = []
#         # Right iris
#         for i in range(7): edge_index.append((i, i + 1))
#         edge_index.append((7, 0))
#         # Right eye boundary
#         for i in range(8, 19): edge_index.append((i, i + 1))
#         edge_index.append((19, 8))
#         # Right pupil
#         for i in range(20, 27): edge_index.append((i, i + 1))
#         edge_index.append((27, 20))
#         # Left iris
#         for i in range(28, 35): edge_index.append((i, i + 1))
#         edge_index.append((35, 28))
#         # Left eye boundary
#         for i in range(36, 47): edge_index.append((i, i + 1))
#         edge_index.append((41, 36))
#         # Left pupil
#         for i in range(48, 55): edge_index.append((i, i + 1))
#         edge_index.append((55, 48))
#         # Jawline
#         for i in range(56, 72): edge_index.append((i, i + 1))
#         edge_index.append((72, 56))
#         # Right eyebrow
#         for i in range(73, 77): edge_index.append((i, i + 1))
#         edge_index.append((77, 73))
#         # Left eyebrow
#         for i in range(78, 82): edge_index.append((i, i + 1))
#         edge_index.append((82, 78))
#         # Nose bridge + lower nose
#         for i in range(83, 91): edge_index.append((i, i + 1))
#         edge_index.append((91, 83))
#         # Inner mouth
#         for i in range(104, 115): edge_index.append((i, i + 1))
#         edge_index.append((115, 104))
#         # Outer mouth
#         for i in range(116, 123): edge_index.append((i, i + 1))
#         edge_index.append((123, 116))
#         # Additional anatomical connections
#         edge_index += [(83, 92),(83, 98),(91, 110),(91, 120),(73, 92),(77, 97),
#                        (78, 98),(82, 103),(56, 116),(72, 123),(92, 78),(98, 73),(83, 91),(110, 120)]
#         return edge_index

#     def __getitem__(self, idx):
#         filename = self.files[idx]
#         file_path = os.path.join(self.data_dir, filename)

#         with open(file_path, "rb") as f:
#             video_data = pickle.load(f)

#         frame_graphs = []
#         mfcc_frames = []

#         for frame in video_data:
#             landmarks = np.array(frame['landmarks'], dtype=np.float32)
#             landmarks -= landmarks.mean(axis=0)  # center landmarks
#             mfcc = np.array(frame['mfcc'], dtype=np.float32)
#             mfcc_frames.append(mfcc)

#             x = torch.tensor(landmarks, dtype=torch.float)
#             frame_graphs.append(Data(x=x, edge_index=self.edge_index))

#         # Convert MFCC to tensor
#         mfcc_tensor = torch.tensor(np.stack(mfcc_frames), dtype=torch.float)
#         # Normalize per video
#         mfcc_tensor = (mfcc_tensor - mfcc_tensor.mean(axis=0)) / (mfcc_tensor.std(axis=0) + 1e-6)

#         label = torch.tensor(self.filename_to_label[filename], dtype=torch.long)
#         seq_len = mfcc_tensor.size(0)
#         return mfcc_tensor, frame_graphs, label, seq_len

# # ---------------------------
# # Collate function to handle variable-length sequences
# # ---------------------------
# def collate_fn(batch):
#     mfccs, graphs, labels, lengths = zip(*batch)
#     lengths = torch.tensor(lengths, dtype=torch.long)
#     labels = torch.tensor(labels, dtype=torch.long)
#     return mfccs, graphs, labels, lengths

# # ---------------------------
# # 2. Model
# # ---------------------------
# class LandmarkGraphEncoder(nn.Module):
#     def __init__(self, in_dim, hidden_dim, out_dim, heads=4):
#         super().__init__()
#         self.gcn = GCNConv(in_dim, hidden_dim)
#         self.gat = GATConv(hidden_dim, out_dim, heads=heads, concat=False)
#     def forward(self, x, edge_index):
#         x = self.gcn(x, edge_index)
#         x = F.relu(x)
#         x = self.gat(x, edge_index)
#         return x

# class AudioEncoder(nn.Module):
#     def __init__(self, in_dim, out_dim):
#         super().__init__()
#         self.fc = nn.Linear(in_dim, out_dim)
#     def forward(self, x):
#         return F.relu(self.fc(x))

# class CrossAttention(nn.Module):
#     def __init__(self, d_model):
#         super().__init__()
#         self.query = nn.Linear(d_model, d_model)
#         self.key = nn.Linear(d_model, d_model)
#         self.value = nn.Linear(d_model, d_model)
#     def forward(self, audio_emb, landmark_embs):
#         Q = self.query(audio_emb).unsqueeze(1)
#         K = self.key(landmark_embs)
#         V = self.value(landmark_embs)
#         attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (K.size(-1) ** 0.5)
#         attn_weights = F.softmax(attn_scores, dim=-1)
#         out = torch.matmul(attn_weights, V)
#         return out.squeeze(1)

# class AudioVisualEmotionModel(nn.Module):
#     def __init__(self, node_feat_dim, audio_feat_dim, hidden_dim, num_classes):
#         super().__init__()
#         self.landmark_gat = LandmarkGraphEncoder(node_feat_dim, hidden_dim, hidden_dim)
#         self.audio_encoder = AudioEncoder(audio_feat_dim, hidden_dim)
#         self.cross_attention = CrossAttention(hidden_dim)
#         self.bilstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_dim*2, num_classes)

#     def forward(self, mfcc_batch, adj, frame_graphs_list, lengths):
#         batch_size = len(mfcc_batch)
#         device = mfcc_batch[0].device
#         hidden_dim = self.audio_encoder.fc.out_features

#         frame_embeddings = []
#         max_seq_len = max(lengths).item()

#         for i in range(batch_size):
#             frames = frame_graphs_list[i]
#             seq_len = len(frames)
#             video_emb = []
#             for t in range(seq_len):
#                 lf = frames[t].x.to(device)
#                 af = mfcc_batch[i][t].unsqueeze(0)
#                 lf_emb = self.landmark_gat(lf, adj)
#                 af_emb = self.audio_encoder(af)
#                 fused = self.cross_attention(af_emb, lf_emb)
#                 video_emb.append(fused)
#             video_emb = torch.cat(video_emb, dim=0)  # [seq_len, hidden_dim]
#             # pad to max_seq_len
#             if seq_len < max_seq_len:
#                 pad = torch.zeros(max_seq_len - seq_len, hidden_dim, device=device)
#                 video_emb = torch.cat([video_emb, pad], dim=0)
#             frame_embeddings.append(video_emb)
#         frame_embeddings = torch.stack(frame_embeddings, dim=0)  # [batch, max_seq_len, hidden_dim]

#         # Pack sequence for LSTM
#         packed = pack_padded_sequence(frame_embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False)
#         lstm_out, _ = self.bilstm(packed)
#         lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)

#         # Mean pooling over valid frames
#         outputs = []
#         for i, seq_len in enumerate(lengths):
#             pooled = lstm_out[i, :seq_len, :].mean(dim=0)
#             outputs.append(pooled)
#         outputs = torch.stack(outputs, dim=0)
#         out = self.fc(outputs)
#         return out

# # ---------------------------
# # 3. Stratified split helper
# # ---------------------------
# def stratified_split(labels_csv, test_size=0.2, random_state=42):
#     df = pd.read_csv(labels_csv)
#     train_files, val_files = train_test_split(
#         df['filename'], test_size=test_size, stratify=df['emotion_label'], random_state=random_state
#     )
#     return train_files.tolist(), val_files.tolist()

# # ---------------------------
# # 4. Training / Validation
# # ---------------------------
# def train_epoch(model, loader, criterion, optimizer, device):
#     model.train()
#     total_loss = 0.0
#     for mfccs, frame_graphs_list, labels, lengths in loader:
#         labels = labels.to(device)
#         mfccs = [f.to(device) for f in mfccs]
#         adj = frame_graphs_list[0][0].edge_index.to(device)

#         optimizer.zero_grad()
#         outputs = model(mfccs, adj, frame_graphs_list, lengths)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item() * len(labels)
#     return total_loss / len(loader.dataset)

# def validate_epoch(model, loader, criterion, device):
#     model.eval()
#     total_loss = 0.0
#     all_labels, all_preds = [], []
#     with torch.no_grad():
#         for mfccs, frame_graphs_list, labels, lengths in loader:
#             labels = labels.to(device)
#             mfccs = [f.to(device) for f in mfccs]
#             adj = frame_graphs_list[0][0].edge_index.to(device)
#             outputs = model(mfccs, adj, frame_graphs_list, lengths)
#             loss = criterion(outputs, labels)
#             total_loss += loss.item() * len(labels)
#             preds = torch.argmax(outputs, dim=1)
#             all_labels.extend(labels.cpu().numpy())
#             all_preds.extend(preds.cpu().numpy())
#     acc = sum([p==t for p,t in zip(all_preds, all_labels)]) / len(all_labels)
#     return total_loss / len(loader.dataset), acc, all_labels, all_preds

# # ---------------------------
# # 5. Main training script
# # ---------------------------
# def main():
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     data_dir = "/content/drive/MyDrive/RAVDESS/Landmarks_WithAudio"
#     labels_csv = "/content/drive/MyDrive/RAVDESS/emotion_labels.csv"

#     # Stratified split
#     train_files, val_files = stratified_split(labels_csv)

#     # Dataset
#     full_dataset = RAVDESSDataset(data_dir, labels_csv)
#     train_idx = [full_dataset.files.index(f) for f in train_files]
#     val_idx = [full_dataset.files.index(f) for f in val_files]

#     train_dataset = Subset(full_dataset, train_idx)
#     val_dataset = Subset(full_dataset, val_idx)

#     train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
#     val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

#     # Model
#     model = AudioVisualEmotionModel(node_feat_dim=2, audio_feat_dim=40, hidden_dim=128, num_classes=8)
#     model.to(device)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

#     best_val_acc = 0.0
#     num_epochs = 50

#     for epoch in range(num_epochs):
#         train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
#         val_loss, val_acc, _, _ = validate_epoch(model, val_loader, criterion, device)
#         print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

#         if val_acc > best_val_acc:
#             best_val_acc = val_acc
#             torch.save(model.state_dict(), "best_model.pth")

#     # Final confusion matrix after training
#     _, _, true_labels, pred_labels = validate_epoch(model, val_loader, criterion, device)
#     cm = confusion_matrix(true_labels, pred_labels)
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(full_dataset.emotion_to_id.keys()))
#     disp.plot(cmap=plt.cm.Blues)
#     plt.show()

# if __name__ == "__main__":
#     main()


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 126 but got size 2 for tensor number 1 in the list.

In [None]:
# # ---------------------------
# # Imports
# # ---------------------------
# import os
# import pickle
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader, Subset
# from torch_geometric.data import Data
# from torch_geometric.nn import GCNConv, GATConv
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# # ---------------------------
# # 1. Dataset
# # ---------------------------
# class RAVDESSDataset(Dataset):
#     def __init__(self, data_dir, labels_csv):
#         self.data_dir = data_dir
#         self.labels_df = pd.read_csv(labels_csv)
#         self.files = sorted(os.listdir(data_dir))

#         # Map emotions to numeric labels
#         self.emotion_to_id = {
#             'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
#             'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
#         }
#         self.labels_df["emotion_label"] = self.labels_df["emotion_label"].map(self.emotion_to_id)
#         self.filename_to_label = dict(zip(self.labels_df["filename"], self.labels_df["emotion_label"]))

#         # Precompute anatomical edge index as [2, num_edges] tensor
#         self.edge_index = torch.tensor(self.get_anatomical_edge_list(), dtype=torch.long).t().contiguous()

#     def __len__(self):
#         return len(self.files)

#     def get_anatomical_edge_list(self):
#         edges = []
#         # Right iris
#         for i in range(7): edges.append((i, i+1))
#         edges.append((7, 0))
#         # Right eye boundary
#         for i in range(8, 19): edges.append((i, i+1))
#         edges.append((19, 8))
#         # Right pupil
#         for i in range(20, 27): edges.append((i, i+1))
#         edges.append((27, 20))
#         # Left iris
#         for i in range(28, 35): edges.append((i, i+1))
#         edges.append((35, 28))
#         # Left eye boundary
#         for i in range(36, 47): edges.append((i, i+1))
#         edges.append((41, 36))
#         # Left pupil
#         for i in range(48, 55): edges.append((i, i+1))
#         edges.append((55, 48))
#         # Jawline
#         for i in range(56, 72): edges.append((i, i+1))
#         edges.append((72, 56))
#         # Right eyebrow
#         for i in range(73, 77): edges.append((i, i+1))
#         edges.append((77, 73))
#         # Left eyebrow
#         for i in range(78, 82): edges.append((i, i+1))
#         edges.append((82, 78))
#         # Nose bridge + lower nose
#         for i in range(83, 91): edges.append((i, i+1))
#         edges.append((91, 83))
#         # Inner mouth
#         for i in range(104, 115): edges.append((i, i+1))
#         edges.append((115, 104))
#         # Outer mouth
#         for i in range(116, 123): edges.append((i, i+1))
#         edges.append((123, 116))
#         # Additional anatomical connections
#         edges += [(83,92),(83,98),(91,110),(91,120),(73,92),(77,97),
#                   (78,98),(82,103),(56,116),(72,123),(92,78),(98,73),(83,91),(110,120)]
#         return edges

#     def __getitem__(self, idx):
#         filename = self.files[idx]
#         file_path = os.path.join(self.data_dir, filename)

#         with open(file_path, "rb") as f:
#             video_data = pickle.load(f)

#         frame_graphs = []
#         mfcc_frames = []

#         for frame in video_data:
#             landmarks = np.array(frame['landmarks'], dtype=np.float32)
#             landmarks -= landmarks.mean(axis=0)  # center landmarks
#             mfcc = np.array(frame['mfcc'], dtype=np.float32)
#             mfcc_frames.append(mfcc)

#             x = torch.tensor(landmarks, dtype=torch.float)
#             frame_graphs.append(Data(x=x, edge_index=self.edge_index))

#         mfcc_tensor = torch.tensor(np.stack(mfcc_frames), dtype=torch.float)
#         # Normalize per video
#         mfcc_tensor = (mfcc_tensor - mfcc_tensor.mean(axis=0)) / (mfcc_tensor.std(axis=0) + 1e-6)

#         label = torch.tensor(self.filename_to_label[filename], dtype=torch.long)
#         seq_len = mfcc_tensor.size(0)
#         return mfcc_tensor, frame_graphs, label, seq_len

# # Collate function to handle variable-length sequences
# def collate_fn(batch):
#     mfccs, graphs, labels, lengths = zip(*batch)
#     lengths = torch.tensor(lengths, dtype=torch.long)
#     labels = torch.tensor(labels, dtype=torch.long)
#     return mfccs, graphs, labels, lengths

# # ---------------------------
# # 2. Model
# # ---------------------------
# class LandmarkGraphEncoder(nn.Module):
#     def __init__(self, in_dim, hidden_dim, out_dim, heads=4):
#         super().__init__()
#         self.gcn = GCNConv(in_dim, hidden_dim)
#         self.gat = GATConv(hidden_dim, out_dim, heads=heads, concat=False)
#     def forward(self, x, edge_index):
#         x = self.gcn(x, edge_index)
#         x = F.relu(x)
#         x = self.gat(x, edge_index)
#         return x

# class AudioEncoder(nn.Module):
#     def __init__(self, in_dim, out_dim):
#         super().__init__()
#         self.fc = nn.Linear(in_dim, out_dim)
#     def forward(self, x):
#         return F.relu(self.fc(x))

# class CrossAttention(nn.Module):
#     def __init__(self, d_model):
#         super().__init__()
#         self.query = nn.Linear(d_model, d_model)
#         self.key = nn.Linear(d_model, d_model)
#         self.value = nn.Linear(d_model, d_model)
#     def forward(self, audio_emb, landmark_embs):
#         Q = self.query(audio_emb).unsqueeze(1)          # [batch,1,d_model]
#         K = self.key(landmark_embs)                     # [num_nodes,d_model]
#         V = self.value(landmark_embs)
#         attn_scores = torch.matmul(Q, K.transpose(-2,-1)) / (K.size(-1)**0.5)
#         attn_weights = F.softmax(attn_scores, dim=-1)
#         out = torch.matmul(attn_weights, V)
#         return out.squeeze(1)

# class AudioVisualEmotionModel(nn.Module):
#     def __init__(self, node_feat_dim, audio_feat_dim, hidden_dim, num_classes):
#         super().__init__()
#         self.landmark_gat = LandmarkGraphEncoder(node_feat_dim, hidden_dim, hidden_dim)
#         self.audio_encoder = AudioEncoder(audio_feat_dim, hidden_dim)
#         self.cross_attention = CrossAttention(hidden_dim)
#         self.bilstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_dim*2, num_classes)

#     def forward(self, mfcc_batch, adj, frame_graphs_list, lengths):
#         batch_size = len(mfcc_batch)
#         device = mfcc_batch[0].device
#         hidden_dim = self.audio_encoder.fc.out_features
#         frame_embeddings = []
#         max_seq_len = max(lengths).item()

#         for i in range(batch_size):
#             frames = frame_graphs_list[i]
#             seq_len = len(frames)
#             video_emb = []
#             for t in range(seq_len):
#                 lf = frames[t].x.to(device)
#                 af = mfcc_batch[i][t].unsqueeze(0).to(device)
#                 lf_emb = self.landmark_gat(lf, adj)
#                 af_emb = self.audio_encoder(af)
#                 fused = self.cross_attention(af_emb, lf_emb)
#                 video_emb.append(fused)
#             video_emb = torch.cat(video_emb, dim=0)  # [seq_len, hidden_dim]
#             if seq_len < max_seq_len:                 # pad
#                 pad = torch.zeros(max_seq_len - seq_len, hidden_dim, device=device)
#                 video_emb = torch.cat([video_emb, pad], dim=0)
#             frame_embeddings.append(video_emb)

#         frame_embeddings = torch.stack(frame_embeddings, dim=0)  # [batch,max_seq_len,hidden_dim]
#         packed = pack_padded_sequence(frame_embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False)
#         lstm_out, _ = self.bilstm(packed)
#         lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)

#         outputs = []
#         for i, seq_len in enumerate(lengths):
#             pooled = lstm_out[i, :seq_len, :].mean(dim=0)
#             outputs.append(pooled)
#         outputs = torch.stack(outputs, dim=0)
#         out = self.fc(outputs)
#         return out

# # ---------------------------
# # 3. Stratified split
# # ---------------------------
# def stratified_split(labels_csv, test_size=0.2, random_state=42):
#     df = pd.read_csv(labels_csv)
#     train_files, val_files = train_test_split(
#         df['filename'], test_size=test_size, stratify=df['emotion_label'], random_state=random_state
#     )
#     return train_files.tolist(), val_files.tolist()

# # ---------------------------
# # 4. Train / Validate
# # ---------------------------
# def train_epoch(model, loader, criterion, optimizer, device):
#     model.train()
#     total_loss = 0.0
#     for mfccs, frame_graphs_list, labels, lengths in loader:
#         labels = labels.to(device)
#         mfccs = [f.to(device) for f in mfccs]
#         adj = frame_graphs_list[0][0].edge_index.to(device)

#         optimizer.zero_grad()
#         outputs = model(mfccs, adj, frame_graphs_list, lengths)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item() * len(labels)
#     return total_loss / len(loader.dataset)

# def validate_epoch(model, loader, criterion, device):
#     model.eval()
#     total_loss = 0.0
#     all_labels, all_preds = [], []
#     with torch.no_grad():
#         for mfccs, frame_graphs_list, labels, lengths in loader:
#             labels = labels.to(device)
#             mfccs = [f.to(device) for f in mfccs]
#             adj = frame_graphs_list[0][0].edge_index.to(device)
#             outputs = model(mfccs, adj, frame_graphs_list, lengths)
#             loss = criterion(outputs, labels)
#             total_loss += loss.item() * len(labels)
#             preds = torch.argmax(outputs, dim=1)
#             all_labels.extend(labels.cpu().numpy())
#             all_preds.extend(preds.cpu().numpy())
#     acc = sum([p==t for p,t in zip(all_preds, all_labels)]) / len(all_labels)
#     return total_loss / len(loader.dataset), acc, all_labels, all_preds

# # ---------------------------
# # 5. Main
# # ---------------------------
# def main():
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     data_dir = "/content/drive/MyDrive/RAVDESS/Landmarks_WithAudio"
#     labels_csv = "/content/drive/MyDrive/RAVDESS/emotion_labels.csv"

#     train_files, val_files = stratified_split(labels_csv)
#     full_dataset = RAVDESSDataset(data_dir, labels_csv)

#     train_idx = [full_dataset.files.index(f) for f in train_files]
#     val_idx = [full_dataset.files.index(f) for f in val_files]

#     train_dataset = Subset(full_dataset, train_idx)
#     val_dataset = Subset(full_dataset, val_idx)

#     train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
#     val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

#     model = AudioVisualEmotionModel(node_feat_dim=2, audio_feat_dim=40, hidden_dim=128, num_classes=8)
#     model.to(device)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

#     best_val_acc = 0.0
#     num_epochs = 50

#     for epoch in range(num_epochs):
#         train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
#         val_loss, val_acc, _, _ = validate_epoch(model, val_loader, criterion, device)
#         print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
#         if val_acc > best_val_acc:
#             best_val_acc = val_acc
#             torch.save(model.state_dict(), "best_model.pth")

#     # Final confusion matrix
#     _, _, true_labels, pred_labels = validate_epoch(model, val_loader, criterion, device)
#     cm = confusion_matrix(true_labels, pred_labels)
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(full_dataset.emotion_to_id.keys()))
#     disp.plot(cmap=plt.cm.Blues)
#     plt.show()

# if __name__ == "__main__":
#     main()


Epoch [1/50] Train Loss: 2.0609 | Val Loss: 2.0403 | Val Acc: 0.1527
Epoch [2/50] Train Loss: 2.0505 | Val Loss: 2.0528 | Val Acc: 0.1527
Epoch [3/50] Train Loss: 2.0940 | Val Loss: 2.0514 | Val Acc: 0.1527
Epoch [4/50] Train Loss: 2.0755 | Val Loss: 2.0533 | Val Acc: 0.1527


In [None]:
# ==========================================
# FULL AUDIO-VISUAL EMOTION RECOGNITION SCRIPT
# ==========================================

# ---------------------------
# Imports
# ---------------------------
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# ---------------------------
# 1. Dataset
# ---------------------------
class RAVDESSDataset(Dataset):
    def __init__(self, data_dir, labels_csv):
        self.data_dir = data_dir
        self.labels_df = pd.read_csv(labels_csv)
        self.files = sorted(os.listdir(data_dir))

        # Map emotions to numeric labels
        self.emotion_to_id = {
            'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3,
            'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7
        }
        self.labels_df["emotion_label"] = self.labels_df["emotion_label"].map(self.emotion_to_id)
        self.filename_to_label = dict(zip(self.labels_df["filename"], self.labels_df["emotion_label"]))

        # Precompute anatomical edge index
        self.edge_index = torch.tensor(self.get_anatomical_edge_list(), dtype=torch.long).t().contiguous()

    def __len__(self):
        return len(self.files)

    def get_anatomical_edge_list(self):
        edge_index = []
        # Right iris
        for i in range(7): edge_index.append((i, i + 1))
        edge_index.append((7, 0))
        # Right eye boundary
        for i in range(8, 19): edge_index.append((i, i + 1))
        edge_index.append((19, 8))
        # Right pupil
        for i in range(20, 27): edge_index.append((i, i + 1))
        edge_index.append((27, 20))
        # Left iris
        for i in range(28, 35): edge_index.append((i, i + 1))
        edge_index.append((35, 28))
        # Left eye boundary
        for i in range(36, 47): edge_index.append((i, i + 1))
        edge_index.append((41, 36))
        # Left pupil
        for i in range(48, 55): edge_index.append((i, i + 1))
        edge_index.append((55, 48))
        # Jawline
        for i in range(56, 72): edge_index.append((i, i + 1))
        edge_index.append((72, 56))
        # Right eyebrow
        for i in range(73, 77): edge_index.append((i, i + 1))
        edge_index.append((77, 73))
        # Left eyebrow
        for i in range(78, 82): edge_index.append((i, i + 1))
        edge_index.append((82, 78))
        # Nose bridge + lower nose
        for i in range(83, 91): edge_index.append((i, i + 1))
        edge_index.append((91, 83))
        # Inner mouth
        for i in range(104, 115): edge_index.append((i, i + 1))
        edge_index.append((115, 104))
        # Outer mouth
        for i in range(116, 123): edge_index.append((i, i + 1))
        edge_index.append((123, 116))
        # Additional anatomical connections
        edge_index += [(83, 92),(83, 98),(91, 110),(91, 120),(73, 92),(77, 97),
                       (78, 98),(82, 103),(56, 116),(72, 123),(92, 78),(98, 73),(83, 91),(110, 120)]
        return edge_index

    def __getitem__(self, idx):
        filename = self.files[idx]
        file_path = os.path.join(self.data_dir, filename)

        with open(file_path, "rb") as f:
            video_data = pickle.load(f)

        frame_graphs = []
        mfcc_frames = []

        for frame in video_data:
            landmarks = np.array(frame['landmarks'], dtype=np.float32)
            landmarks -= landmarks.mean(axis=0)  # center landmarks
            mfcc = np.array(frame['mfcc'], dtype=np.float32)
            mfcc_frames.append(mfcc)

            x = torch.tensor(landmarks, dtype=torch.float)
            frame_graphs.append(Data(x=x, edge_index=self.edge_index))

        # Convert MFCC to tensor
        mfcc_tensor = torch.tensor(np.stack(mfcc_frames), dtype=torch.float)
        # Normalize per video
        mfcc_tensor = (mfcc_tensor - mfcc_tensor.mean(axis=0)) / (mfcc_tensor.std(axis=0) + 1e-6)

        label = torch.tensor(self.filename_to_label[filename], dtype=torch.long)
        seq_len = mfcc_tensor.size(0)
        return mfcc_tensor, frame_graphs, label, seq_len

# ---------------------------
# Collate function
# ---------------------------
def collate_fn(batch):
    mfccs, graphs, labels, lengths = zip(*batch)
    lengths = torch.tensor(lengths, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    return mfccs, graphs, labels, lengths

# ---------------------------
# 2. Model
# ---------------------------
class LandmarkGraphEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, heads=4):
        super().__init__()
        self.gcn = GCNConv(in_dim, hidden_dim)
        self.gat = GATConv(hidden_dim, out_dim, heads=heads, concat=False)
    def forward(self, x, edge_index):
        x = self.gcn(x, edge_index)
        x = F.relu(x)
        x = self.gat(x, edge_index)
        return x

class AudioEncoder(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.fc = nn.Linear(in_dim, out_dim)
    def forward(self, x):
        return F.relu(self.fc(x))

class CrossAttention(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
    def forward(self, audio_emb, landmark_embs):
        Q = self.query(audio_emb).unsqueeze(1)
        K = self.key(landmark_embs)
        V = self.value(landmark_embs)
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (K.size(-1) ** 0.5)
        attn_weights = F.softmax(attn_scores, dim=-1)
        out = torch.matmul(attn_weights, V)
        return out.squeeze(1)

class TemporalAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn_fc = nn.Linear(hidden_dim*2, 1)
    def forward(self, lstm_out, lengths):
        attn_scores = self.attn_fc(lstm_out).squeeze(-1)
        mask = torch.arange(lstm_out.size(1))[None, :].to(lengths.device) < lengths[:, None]
        attn_scores[~mask] = float('-inf')
        attn_weights = torch.softmax(attn_scores, dim=1).unsqueeze(-1)
        weighted_sum = (lstm_out * attn_weights).sum(dim=1)
        return weighted_sum

class AudioVisualEmotionModel(nn.Module):
    def __init__(self, node_feat_dim, audio_feat_dim, hidden_dim, num_classes):
        super().__init__()
        self.landmark_gat = LandmarkGraphEncoder(node_feat_dim, hidden_dim, hidden_dim)
        self.audio_encoder = AudioEncoder(audio_feat_dim, hidden_dim)
        self.cross_attention = CrossAttention(hidden_dim)
        self.bilstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.temporal_attn = TemporalAttention(hidden_dim)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, mfcc_batch, adj, frame_graphs_list, lengths):
        batch_size = len(mfcc_batch)
        device = mfcc_batch[0].device
        hidden_dim = self.audio_encoder.fc.out_features

        frame_embeddings = []
        max_seq_len = max(lengths).item()

        for i in range(batch_size):
            frames = frame_graphs_list[i]
            seq_len = len(frames)
            video_emb = []
            for t in range(seq_len):
                lf = frames[t].x.to(device)
                af = mfcc_batch[i][t].unsqueeze(0)
                lf_emb = self.landmark_gat(lf, adj)
                af_emb = self.audio_encoder(af)
                fused = self.cross_attention(af_emb, lf_emb)
                video_emb.append(fused)
            video_emb = torch.cat(video_emb, dim=0)
            if seq_len < max_seq_len:
                pad = torch.zeros(max_seq_len - seq_len, hidden_dim, device=device)
                video_emb = torch.cat([video_emb, pad], dim=0)
            frame_embeddings.append(video_emb)

        frame_embeddings = torch.stack(frame_embeddings, dim=0)
        packed = pack_padded_sequence(frame_embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.bilstm(packed)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        video_embeds = self.temporal_attn(lstm_out, lengths)
        out = self.fc(video_embeds)
        return out

# ---------------------------
# 3. Train / Validation Functions
# ---------------------------
def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for mfccs, frame_graphs_list, labels, lengths in loader:
        labels = labels.to(device)
        mfccs = [f.to(device) for f in mfccs]
        adj = frame_graphs_list[0][0].edge_index.to(device)
        optimizer.zero_grad()
        outputs = model(mfccs, adj, frame_graphs_list, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(labels)
    return total_loss / len(loader.dataset)

def validate_epoch(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    all_labels, all_preds = [], []
    with torch.no_grad():
        for mfccs, frame_graphs_list, labels, lengths in loader:
            labels = labels.to(device)
            mfccs = [f.to(device) for f in mfccs]
            adj = frame_graphs_list[0][0].edge_index.to(device)
            outputs = model(mfccs, adj, frame_graphs_list, lengths)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * len(labels)
            preds = torch.argmax(outputs, dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    acc = sum([p==t for p,t in zip(all_preds, all_labels)]) / len(all_labels)
    return total_loss / len(loader.dataset), acc, all_labels, all_preds

# ---------------------------
# 4. Main Script
# ---------------------------
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_dir = "/content/drive/MyDrive/RAVDESS/Landmarks_WithAudio"
    labels_csv = "/content/drive/MyDrive/RAVDESS/emotion_labels.csv"

    full_dataset = RAVDESSDataset(data_dir, labels_csv)

    train_files, val_files = train_test_split(
        full_dataset.files, test_size=0.2, stratify=full_dataset.labels_df["emotion_label"], random_state=42
    )
    train_idx = [full_dataset.files.index(f) for f in train_files]
    val_idx = [full_dataset.files.index(f) for f in val_files]

    train_dataset = Subset(full_dataset, train_idx)
    val_dataset = Subset(full_dataset, val_idx)

    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

    model = AudioVisualEmotionModel(node_feat_dim=2, audio_feat_dim=40, hidden_dim=64, num_classes=8)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    best_val_acc = 0.0
    num_epochs = 50

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc, _, _ = validate_epoch(model, val_loader, criterion, device)
        print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")

    # Confusion matrix
    _, _, true_labels, pred_labels = validate_epoch(model, val_loader, criterion, device)
    cm = confusion_matrix(true_labels, pred_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(full_dataset.emotion_to_id.keys()))
    disp.plot(cmap=plt.cm.Blues)
    plt.show()

if __name__ == "__main__":
    main()


Epoch [1/50] Train Loss: 2.0257 | Val Loss: 1.9474 | Val Acc: 0.1894
Epoch [2/50] Train Loss: 1.8880 | Val Loss: 1.8066 | Val Acc: 0.2668
Epoch [3/50] Train Loss: 1.7406 | Val Loss: 1.7360 | Val Acc: 0.3340
Epoch [4/50] Train Loss: 1.6448 | Val Loss: 1.7291 | Val Acc: 0.3116
Epoch [5/50] Train Loss: 1.5798 | Val Loss: 1.6017 | Val Acc: 0.3910
Epoch [6/50] Train Loss: 1.5097 | Val Loss: 1.5991 | Val Acc: 0.3666
Epoch [7/50] Train Loss: 1.4622 | Val Loss: 1.5353 | Val Acc: 0.4134
Epoch [8/50] Train Loss: 1.4189 | Val Loss: 1.5047 | Val Acc: 0.4257
Epoch [9/50] Train Loss: 1.3642 | Val Loss: 1.5249 | Val Acc: 0.4033
Epoch [10/50] Train Loss: 1.3355 | Val Loss: 1.5001 | Val Acc: 0.4358
Epoch [11/50] Train Loss: 1.3185 | Val Loss: 1.4544 | Val Acc: 0.4134
Epoch [12/50] Train Loss: 1.2650 | Val Loss: 1.4728 | Val Acc: 0.4297
Epoch [13/50] Train Loss: 1.2455 | Val Loss: 1.4758 | Val Acc: 0.4358
