In [1]:
import os
gpu_ids = [4]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
import random
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import VideoMAEFeatureExtractor, VideoMAEModel
from sklearn.metrics import f1_score, recall_score, accuracy_score
from tqdm import tqdm

# ---- SETTINGS ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


clip_dir = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI/Clip/Clips_16frames"
mapping_csv = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI/Clip/clip_sentiment_mapping.csv"

negative_samples = 1500
neutral_samples = 2000
positive_samples = 1500
batch_size = 16
clip_len = 16
num_epochs = 20

# ---- DATASET ----
class VideoClipDataset(Dataset):
    def __init__(self, clip_dir, csv_path, feature_extractor):
        self.clip_dir = clip_dir
        self.df = pd.read_csv(csv_path)
        self.feature_extractor = feature_extractor

        # Group by sentiment label
        grouped = self.df.groupby("sentiment_label")

        self.samples = []
        for label, n_samples in zip(["Negative", "Neutral", "Positive"], [negative_samples, neutral_samples, positive_samples]):
            group = grouped.get_group(label)
            if label == "Negative":
                sorted_group = group.sort_values("sentiment_score")
            elif label == "Neutral":
                sorted_group = group.reindex((group["sentiment_score"] - 0).abs().sort_values().index)
            else:  # Positive
                sorted_group = group.sort_values("sentiment_score", ascending=False)

            selected = sorted_group.head(n_samples)
            self.samples.extend(selected.itertuples(index=False))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        row = self.samples[idx]
        clip_path = os.path.join(self.clip_dir, row.clip_filename)

        cap = cv2.VideoCapture(clip_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame[:, :, ::-1])  # BGR to RGB
        cap.release()

        if len(frames) < clip_len:
            frames += [frames[-1]] * (clip_len - len(frames))
        frames = frames[:clip_len]

        inputs = self.feature_extractor(images=frames, return_tensors="pt")["pixel_values"].squeeze(0)
        return inputs, torch.tensor(row.sentiment_score, dtype=torch.float32)

# ---- LOSS ----
class CenteredWeightedMSELoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, preds, targets):
        ideal = torch.zeros_like(targets)
        ideal[targets < -0.3] = -3.0
        ideal[targets > 0.3] = 3.0
        ideal[(-0.3 <= targets) & (targets <= 0.3)] = 0.0

        weights = torch.ones_like(targets)
        weights[targets < -0.3] = 2.0
        weights[targets > 0.3] = 2.0
        weights[(-0.3 <= targets) & (targets <= 0.3)] = 1.0

        mse = (preds - ideal) ** 2
        return (weights * mse).mean()

# ---- MODEL ----
class SentimentRegressor(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        self.regressor = nn.Sequential(
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.regressor(x).squeeze(1)

# ---- TRAINING LOOP ----
def run_epoch(model, loader, optimizer, is_train=True):
    model.train() if is_train else model.eval()
    total_preds, total_labels = [], []
    total_loss = 0

    for clips, targets in tqdm(loader, leave=False):
        clips, targets = clips.to(device), targets.to(device)

        with torch.set_grad_enabled(is_train):
            features = video_mae(clips).last_hidden_state.mean(dim=1)
            preds = model(features)
            loss = loss_fn(preds, targets)

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        total_loss += loss.item()
        total_preds.extend(preds.detach().cpu().numpy())
        total_labels.extend(targets.detach().cpu().numpy())

    return total_loss / len(loader), np.array(total_preds), np.array(total_labels)

def evaluate(preds, labels):
    def to_label(x):
        return "Negative" if x < -0.3 else "Positive" if x > 0.3 else "Neutral"
    preds_label = [to_label(p) for p in preds]
    labels_label = [to_label(l) for l in labels]

    macro_f1 = f1_score(labels_label, preds_label, average="macro")
    micro_f1 = f1_score(labels_label, preds_label, average="micro")
    recall = recall_score(labels_label, preds_label, average=None, labels=["Negative", "Neutral", "Positive"])
    acc = accuracy_score(labels_label, preds_label)
    return macro_f1, micro_f1, recall, acc

# ---- MAIN ----
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
video_mae = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(device)
video_mae.eval()
for param in video_mae.parameters():
    param.requires_grad = False

full_dataset = VideoClipDataset(clip_dir, mapping_csv, feature_extractor)
train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(full_dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

regressor = SentimentRegressor(feature_dim=768).to(device)
loss_fn = CenteredWeightedMSELoss()
optimizer = optim.Adam(regressor.parameters(), lr=2e-4)

# ---- TRAIN ----
best_macro_f1 = -np.inf
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch}")
    train_loss, _, _ = run_epoch(regressor, train_loader, optimizer, is_train=True)
    val_loss, val_preds, val_labels = run_epoch(regressor, val_loader, optimizer, is_train=False)

    macro_f1, micro_f1, recall, acc = evaluate(val_preds, val_labels)
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Macro-F1: {macro_f1:.4f} | Micro-F1: {micro_f1:.4f} | Acc: {acc:.4f} | Recall: {recall}")

    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(regressor.state_dict(), "best_regressor_neutral.pth")
        print(f"✅ Best model saved at epoch {epoch} with Macro-F1={macro_f1:.4f}")

# ---- EVALUATE TEST ----
test_loss, test_preds, test_labels = run_epoch(regressor, test_loader, optimizer, is_train=False)
macro_f1, micro_f1, recall, acc = evaluate(test_preds, test_labels)
print("\n----- TEST RESULTS -----")
print(f"Macro-F1: {macro_f1:.4f} | Micro-F1: {micro_f1:.4f} | Acc: {acc:.4f} | Recall: {recall}")




Epoch 0


                                                 

Train Loss: 10.5385 | Val Loss: 10.5506
Macro-F1: 0.3732 | Micro-F1: 0.4400 | Acc: 0.4400 | Recall: [0.11564626 0.80208333 0.30434783]
✅ Best model saved at epoch 0 with Macro-F1=0.3732

Epoch 1


                                                 

Train Loss: 10.1457 | Val Loss: 10.1286
Macro-F1: 0.4506 | Micro-F1: 0.4560 | Acc: 0.4560 | Recall: [0.46938776 0.546875   0.33540373]
✅ Best model saved at epoch 1 with Macro-F1=0.4506

Epoch 2


                                                 

Train Loss: 9.8161 | Val Loss: 10.0076
Macro-F1: 0.3969 | Micro-F1: 0.4240 | Acc: 0.4240 | Recall: [0.24489796 0.25       0.79503106]

Epoch 3


                                                 

Train Loss: 9.5110 | Val Loss: 9.5394
Macro-F1: 0.4376 | Micro-F1: 0.4360 | Acc: 0.4360 | Recall: [0.51020408 0.30729167 0.52173913]

Epoch 4


                                                 

Train Loss: 9.3182 | Val Loss: 9.3753
Macro-F1: 0.4405 | Micro-F1: 0.4420 | Acc: 0.4420 | Recall: [0.49659864 0.28645833 0.57763975]

Epoch 5


                                                 

Train Loss: 9.1759 | Val Loss: 9.3849
Macro-F1: 0.4638 | Micro-F1: 0.4640 | Acc: 0.4640 | Recall: [0.58503401 0.33854167 0.50310559]
✅ Best model saved at epoch 5 with Macro-F1=0.4638

Epoch 6


                                                 

Train Loss: 9.0053 | Val Loss: 9.2183
Macro-F1: 0.4055 | Micro-F1: 0.4220 | Acc: 0.4220 | Recall: [0.42176871 0.17708333 0.71428571]

Epoch 7


                                                 

Train Loss: 8.8487 | Val Loss: 9.0413
Macro-F1: 0.4413 | Micro-F1: 0.4480 | Acc: 0.4480 | Recall: [0.51020408 0.25       0.62732919]

Epoch 8


                                                 

Train Loss: 8.7282 | Val Loss: 9.0182
Macro-F1: 0.4838 | Micro-F1: 0.4860 | Acc: 0.4860 | Recall: [0.57142857 0.328125   0.59627329]
✅ Best model saved at epoch 8 with Macro-F1=0.4838

Epoch 9


                                                 

Train Loss: 8.5595 | Val Loss: 9.7730
Macro-F1: 0.4448 | Micro-F1: 0.4500 | Acc: 0.4500 | Recall: [0.7414966  0.28125    0.38509317]

Epoch 10


                                                 

Train Loss: 8.5015 | Val Loss: 8.8003
Macro-F1: 0.4459 | Micro-F1: 0.4560 | Acc: 0.4560 | Recall: [0.53741497 0.22916667 0.65217391]

Epoch 11


                                                 

Train Loss: 8.3798 | Val Loss: 8.8044
Macro-F1: 0.4696 | Micro-F1: 0.4760 | Acc: 0.4760 | Recall: [0.56462585 0.27604167 0.63354037]

Epoch 12


                                                 

Train Loss: 8.2130 | Val Loss: 8.9078
Macro-F1: 0.4610 | Micro-F1: 0.4640 | Acc: 0.4640 | Recall: [0.63945578 0.27083333 0.53416149]

Epoch 13


                                                 

Train Loss: 8.1520 | Val Loss: 8.5906
Macro-F1: 0.4632 | Micro-F1: 0.4740 | Acc: 0.4740 | Recall: [0.55782313 0.24479167 0.67080745]

Epoch 14


                                                 

Train Loss: 8.0617 | Val Loss: 8.6513
Macro-F1: 0.4392 | Micro-F1: 0.4600 | Acc: 0.4600 | Recall: [0.44897959 0.19791667 0.7826087 ]

Epoch 15


                                                 

Train Loss: 7.9157 | Val Loss: 8.5190
Macro-F1: 0.4504 | Micro-F1: 0.4640 | Acc: 0.4640 | Recall: [0.49659864 0.22395833 0.72049689]

Epoch 16


                                                 

Train Loss: 7.8848 | Val Loss: 8.6215
Macro-F1: 0.4384 | Micro-F1: 0.4600 | Acc: 0.4600 | Recall: [0.42857143 0.19791667 0.80124224]

Epoch 17


                                                 

Train Loss: 7.7665 | Val Loss: 8.3983
Macro-F1: 0.4728 | Micro-F1: 0.4860 | Acc: 0.4860 | Recall: [0.5170068  0.25       0.73913043]

Epoch 18


                                                 

Train Loss: 7.6442 | Val Loss: 8.3860
Macro-F1: 0.4675 | Micro-F1: 0.4780 | Acc: 0.4780 | Recall: [0.58503401 0.23958333 0.66459627]

Epoch 19


                                                 

Train Loss: 7.5461 | Val Loss: 8.3209
Macro-F1: 0.4649 | Micro-F1: 0.4760 | Acc: 0.4760 | Recall: [0.59183673 0.234375   0.65838509]


                                               


----- TEST RESULTS -----
Macro-F1: 0.4629 | Micro-F1: 0.4740 | Acc: 0.4740 | Recall: [0.58       0.22751323 0.66459627]


