In [10]:
import os
gpu_ids = [4]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
import random
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.io import read_video
from tqdm import tqdm
from sklearn.metrics import f1_score, recall_score, accuracy_score
from transformers import VideoMAEFeatureExtractor, VideoMAEModel

In [11]:
# --- Configuration ---
clip_dir = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI/Clip/Clips_16frames"
mapping_csv = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI/Clip/clip_sentiment_mapping.csv"
save_model_path = "./best_model_2.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")

✅ Using device: cuda


In [12]:

# --- Sampling Customization ---
num_positive = 1700
num_neutral = 1700
num_negative = 1700

train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

In [13]:
# --- Load and Sample Dataset ---
df = pd.read_csv(mapping_csv)

def classify_sentiment(score):
    if score < -0.3:
        return "Negative"
    elif score > 0.3:
        return "Positive"
    else:
        return "Neutral"

df["sentiment_label"] = df["sentiment_score"].apply(classify_sentiment)

samples = []
for label, n_sample in zip(["Positive", "Neutral", "Negative"], [num_positive, num_neutral, num_negative]):
    subset = df[df["sentiment_label"] == label]
    n_sample = min(n_sample, len(subset))
    samples.append(subset.sample(n=n_sample, random_state=42))

df_sampled = pd.concat(samples).reset_index(drop=True)

total_len = len(df_sampled)
train_len = int(total_len * train_ratio)
val_len = int(total_len * val_ratio)
test_len = total_len - train_len - val_len

df_shuffled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
df_train = df_shuffled.iloc[:train_len]
df_val = df_shuffled.iloc[train_len:train_len+val_len]
df_test = df_shuffled.iloc[train_len+val_len:]

In [14]:
# --- Dataset ---
class VideoClipDataset(Dataset):
    def __init__(self, dataframe, clip_dir, transform=None):
        self.dataframe = dataframe
        self.clip_dir = clip_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        clip_path = os.path.join(self.clip_dir, row["clip_filename"])
        video, _, _ = read_video(clip_path, pts_unit="sec")
        video = video.permute(0, 3, 1, 2)  # (T, C, H, W)
        video = video.float() / 255.0

        if self.transform:
            video = self.transform(video)

        sentiment_score = torch.tensor(row["sentiment_score"], dtype=torch.float32)

        return video, sentiment_score

In [15]:
# --- Transform ---
transform = transforms.Compose([
    transforms.Resize((224, 224)),
])

train_dataset = VideoClipDataset(df_train, clip_dir, transform=transform)
val_dataset = VideoClipDataset(df_val, clip_dir, transform=transform)
test_dataset = VideoClipDataset(df_test, clip_dir, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=4, pin_memory=True)

In [16]:
# --- Model ---
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
videomae = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(device)
videomae.eval()

class SentimentRegressor(nn.Module):
    def __init__(self, feature_dim=768):
        super().__init__()
        self.fc = nn.Linear(feature_dim, 1)

    def forward(self, x):
        return self.fc(x)

regressor = SentimentRegressor().to(device)



In [17]:
# --- Optimizer and Loss ---
optimizer = optim.Adam(regressor.parameters(), lr=1e-4)
criterion = nn.MSELoss()

# --- Evaluation Metrics ---
def evaluate(preds, labels):
    preds_label = []
    labels_label = []

    for p in preds:
        if p < -0.3:
            preds_label.append("Negative")
        elif p > 0.3:
            preds_label.append("Positive")
        else:
            preds_label.append("Neutral")

    for l in labels:
        if l < -0.3:
            labels_label.append("Negative")
        elif l > 0.3:
            labels_label.append("Positive")
        else:
            labels_label.append("Neutral")

    macro_f1 = f1_score(labels_label, preds_label, average="macro")
    micro_f1 = f1_score(labels_label, preds_label, average="micro")
    recall = recall_score(labels_label, preds_label, average=None, labels=["Negative", "Neutral", "Positive"])
    acc = accuracy_score(labels_label, preds_label)

    return macro_f1, micro_f1, recall, acc


In [18]:
# --- Training Loop ---
best_macro_f1 = 0

for epoch in range(20):
    regressor.train()
    total_loss = 0

    for videos, scores in tqdm(train_loader, desc=f"Training Epoch {epoch}"):
        videos = videos.to(device, non_blocking=True)
        scores = scores.to(device, non_blocking=True)

        with torch.no_grad():
            features = videomae(videos).last_hidden_state[:, 0]  # CLS token

        preds = regressor(features).squeeze()

        loss = criterion(preds, scores)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    # Validation
    regressor.eval()
    preds_list = []
    labels_list = []

    with torch.no_grad():
        for videos, scores in tqdm(val_loader, desc=f"Validating Epoch {epoch}"):
            videos = videos.to(device, non_blocking=True)
            scores = scores.to(device, non_blocking=True)

            features = videomae(videos).last_hidden_state[:, 0]
            preds = regressor(features).squeeze()

            preds_list.extend(preds.cpu().numpy())
            labels_list.extend(scores.cpu().numpy())

    macro_f1, micro_f1, recall, acc = evaluate(preds_list, labels_list)

    print(f"Epoch {epoch}: Val Macro-F1={macro_f1:.4f}, Micro-F1={micro_f1:.4f}, Acc={acc:.4f}, Recall={recall}")

    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(regressor.state_dict(), save_model_path)
        print(f"✅ Best model saved at epoch {epoch} with Macro-F1={macro_f1:.4f}")

print("✅ Training complete.")


Training Epoch 0: 100%|██████████| 447/447 [01:45<00:00,  4.25it/s]
Validating Epoch 0: 100%|██████████| 96/96 [00:22<00:00,  4.30it/s]


Epoch 0: Val Macro-F1=0.1974, Micro-F1=0.3333, Acc=0.3333, Recall=[0.02692308 0.94509804 0.028     ]
✅ Best model saved at epoch 0 with Macro-F1=0.1974


Training Epoch 1: 100%|██████████| 447/447 [01:33<00:00,  4.80it/s]
Validating Epoch 1: 100%|██████████| 96/96 [00:21<00:00,  4.37it/s]


Epoch 1: Val Macro-F1=0.2093, Micro-F1=0.3373, Acc=0.3373, Recall=[0.00769231 0.93333333 0.072     ]
✅ Best model saved at epoch 1 with Macro-F1=0.2093


Training Epoch 2: 100%|██████████| 447/447 [01:44<00:00,  4.29it/s]
Validating Epoch 2: 100%|██████████| 96/96 [00:22<00:00,  4.18it/s]


Epoch 2: Val Macro-F1=0.2599, Micro-F1=0.3608, Acc=0.3608, Recall=[0.04615385 0.91372549 0.124     ]
✅ Best model saved at epoch 2 with Macro-F1=0.2599


Training Epoch 3: 100%|██████████| 447/447 [01:55<00:00,  3.88it/s]
Validating Epoch 3: 100%|██████████| 96/96 [00:23<00:00,  4.13it/s]


Epoch 3: Val Macro-F1=0.2742, Micro-F1=0.3608, Acc=0.3608, Recall=[0.05384615 0.87058824 0.16      ]
✅ Best model saved at epoch 3 with Macro-F1=0.2742


Training Epoch 4: 100%|██████████| 447/447 [02:09<00:00,  3.45it/s]
Validating Epoch 4: 100%|██████████| 96/96 [00:29<00:00,  3.27it/s]


Epoch 4: Val Macro-F1=0.2865, Micro-F1=0.3621, Acc=0.3621, Recall=[0.05384615 0.82745098 0.208     ]
✅ Best model saved at epoch 4 with Macro-F1=0.2865


Training Epoch 5: 100%|██████████| 447/447 [02:09<00:00,  3.44it/s]
Validating Epoch 5: 100%|██████████| 96/96 [00:28<00:00,  3.38it/s]


Epoch 5: Val Macro-F1=0.2879, Micro-F1=0.3634, Acc=0.3634, Recall=[0.06538462 0.83921569 0.188     ]
✅ Best model saved at epoch 5 with Macro-F1=0.2879


Training Epoch 6: 100%|██████████| 447/447 [02:11<00:00,  3.41it/s]
Validating Epoch 6: 100%|██████████| 96/96 [00:29<00:00,  3.23it/s]


Epoch 6: Val Macro-F1=0.3017, Micro-F1=0.3791, Acc=0.3791, Recall=[0.13461538 0.89019608 0.112     ]
✅ Best model saved at epoch 6 with Macro-F1=0.3017


Training Epoch 7: 100%|██████████| 447/447 [02:00<00:00,  3.71it/s]
Validating Epoch 7: 100%|██████████| 96/96 [00:21<00:00,  4.37it/s]


Epoch 7: Val Macro-F1=0.3014, Micro-F1=0.3725, Acc=0.3725, Recall=[0.08461538 0.84705882 0.188     ]


Training Epoch 8: 100%|██████████| 447/447 [01:29<00:00,  5.01it/s]
Validating Epoch 8: 100%|██████████| 96/96 [00:19<00:00,  4.88it/s]


Epoch 8: Val Macro-F1=0.3107, Micro-F1=0.3712, Acc=0.3712, Recall=[0.06923077 0.77254902 0.276     ]
✅ Best model saved at epoch 8 with Macro-F1=0.3107


Training Epoch 9: 100%|██████████| 447/447 [02:05<00:00,  3.57it/s]
Validating Epoch 9: 100%|██████████| 96/96 [00:30<00:00,  3.10it/s]


Epoch 9: Val Macro-F1=0.3188, Micro-F1=0.3804, Acc=0.3804, Recall=[0.10769231 0.83137255 0.204     ]
✅ Best model saved at epoch 9 with Macro-F1=0.3188


Training Epoch 10: 100%|██████████| 447/447 [02:54<00:00,  2.57it/s]
Validating Epoch 10: 100%|██████████| 96/96 [00:38<00:00,  2.50it/s]


Epoch 10: Val Macro-F1=0.3322, Micro-F1=0.3882, Acc=0.3882, Recall=[0.11923077 0.82352941 0.224     ]
✅ Best model saved at epoch 10 with Macro-F1=0.3322


Training Epoch 11: 100%|██████████| 447/447 [02:53<00:00,  2.58it/s]
Validating Epoch 11: 100%|██████████| 96/96 [00:38<00:00,  2.50it/s]


Epoch 11: Val Macro-F1=0.3328, Micro-F1=0.3922, Acc=0.3922, Recall=[0.16923077 0.85490196 0.152     ]
✅ Best model saved at epoch 11 with Macro-F1=0.3328


Training Epoch 12: 100%|██████████| 447/447 [02:52<00:00,  2.59it/s]
Validating Epoch 12: 100%|██████████| 96/96 [00:38<00:00,  2.49it/s]


Epoch 12: Val Macro-F1=0.3432, Micro-F1=0.3935, Acc=0.3935, Recall=[0.13076923 0.80784314 0.244     ]
✅ Best model saved at epoch 12 with Macro-F1=0.3432


Training Epoch 13: 100%|██████████| 447/447 [02:52<00:00,  2.59it/s]
Validating Epoch 13: 100%|██████████| 96/96 [00:38<00:00,  2.50it/s]


Epoch 13: Val Macro-F1=0.3410, Micro-F1=0.3987, Acc=0.3987, Recall=[0.18846154 0.85882353 0.148     ]


Training Epoch 14: 100%|██████████| 447/447 [02:52<00:00,  2.59it/s]
Validating Epoch 14: 100%|██████████| 96/96 [00:38<00:00,  2.50it/s]


Epoch 14: Val Macro-F1=0.3527, Micro-F1=0.4039, Acc=0.4039, Recall=[0.17307692 0.84313725 0.196     ]
✅ Best model saved at epoch 14 with Macro-F1=0.3527


Training Epoch 15: 100%|██████████| 447/447 [02:51<00:00,  2.60it/s]
Validating Epoch 15: 100%|██████████| 96/96 [00:38<00:00,  2.51it/s]


Epoch 15: Val Macro-F1=0.3504, Micro-F1=0.3987, Acc=0.3987, Recall=[0.16153846 0.81960784 0.216     ]


Training Epoch 16: 100%|██████████| 447/447 [02:10<00:00,  3.42it/s]
Validating Epoch 16: 100%|██████████| 96/96 [00:29<00:00,  3.21it/s]


Epoch 16: Val Macro-F1=0.3605, Micro-F1=0.4065, Acc=0.4065, Recall=[0.19230769 0.82745098 0.2       ]
✅ Best model saved at epoch 16 with Macro-F1=0.3605


Training Epoch 17: 100%|██████████| 447/447 [02:23<00:00,  3.12it/s]
Validating Epoch 17: 100%|██████████| 96/96 [00:38<00:00,  2.51it/s]


Epoch 17: Val Macro-F1=0.3549, Micro-F1=0.4013, Acc=0.4013, Recall=[0.18846154 0.81960784 0.196     ]


Training Epoch 18: 100%|██████████| 447/447 [02:51<00:00,  2.61it/s]
Validating Epoch 18: 100%|██████████| 96/96 [00:38<00:00,  2.52it/s]


Epoch 18: Val Macro-F1=0.3537, Micro-F1=0.3974, Acc=0.3974, Recall=[0.17692308 0.8        0.216     ]


Training Epoch 19: 100%|██████████| 447/447 [02:51<00:00,  2.61it/s]
Validating Epoch 19: 100%|██████████| 96/96 [00:37<00:00,  2.54it/s]

Epoch 19: Val Macro-F1=0.3593, Micro-F1=0.3948, Acc=0.3948, Recall=[0.15384615 0.7372549  0.296     ]
✅ Training complete.



