In [2]:
import os
gpu_ids = [4]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
import torch
import cv2
import pandas as pd
import numpy as np
from transformers import VideoMAEFeatureExtractor, VideoMAEModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pickle
import multiprocessing

# Set multiprocessing start method to 'spawn'
multiprocessing.set_start_method('spawn', force=True)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained VideoMAE model and the feature extractor
fine_tuned_video_mae = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(device)
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")

# Load your fine-tuned model weights
fine_tuned_video_mae.load_state_dict(torch.load(
    '/data/home/huixian/Documents/Homeworks/535_project/mosei_code/best_classifier_CE_final_0.5796.pth'),
    strict=False)

fine_tuned_video_mae.eval()

class FeatureSizeMutator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(FeatureSizeMutator, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

# ---- Updated Dataset class: no split filtering ----
class VideoDataset(Dataset):
    def __init__(self, csv_path, clip_dir, feature_extractor, target_dim=512, transform=False):
        self.csv_data = pd.read_csv(csv_path)
        self.clip_dir = clip_dir
        self.feature_extractor = feature_extractor
        self.target_dim = target_dim
        self.transform = transform
        self.size_mutator = FeatureSizeMutator(input_dim=768, output_dim=self.target_dim).to(device)

    def __len__(self):
        return len(self.csv_data)

    def __getitem__(self, idx):
        row = self.csv_data.iloc[idx]
        clip_filename = row['clip_filename_y']
        sentiment_label = row['sentiment_label']
        clip_path = os.path.join(self.clip_dir, clip_filename)
        video_id = row['video_id']
        clip_features = self.extract_video_features(clip_path)
        return clip_features, sentiment_label, video_id

    def extract_video_features(self, video_path, clip_len=16):
        cap = cv2.VideoCapture(video_path)
        frames = []
        clip_features = []

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame[:, :, ::-1]
            if self.transform:
                frame = cv2.flip(frame, 1)
            frames.append(frame)
        cap.release()

        for i in range(0, len(frames), clip_len):
            clip = frames[i:i + clip_len]
            if len(clip) < clip_len:
                clip += [clip[-1]] * (clip_len - len(clip))
            inputs = self.feature_extractor(images=clip, return_tensors="pt").to(device)
            with torch.no_grad():
                feature_vector = fine_tuned_video_mae(**inputs).last_hidden_state.mean(dim=1)
            clip_features.append(feature_vector)

        if len(clip_features) > 0:
            aggregated_features = torch.stack(clip_features).mean(dim=0).squeeze()
        else:
            aggregated_features = torch.zeros(768).to(device)

        return self.size_mutator(aggregated_features)

# ---- Define paths ----
clip_dir = '/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Clip/Clips_16frames'
csv_path = '/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Labels/new_sentiment_split_2.csv'

# ---- Load full dataset (all splits) ----
full_dataset = VideoDataset(csv_path=csv_path, clip_dir=clip_dir, feature_extractor=feature_extractor, target_dim=1068)
full_loader = DataLoader(full_dataset, batch_size=32, shuffle=False, num_workers=0)

# ---- Collect features by video_id ----
features_by_video_id = {}

for batch_idx, (features, labels, video_ids) in enumerate(full_loader):
    for feature, label, video_id in zip(features, labels, video_ids):
        if video_id not in features_by_video_id:
            features_by_video_id[video_id] = []
        features_by_video_id[video_id].append(feature.detach().cpu().numpy())

# ---- Save to pickle ----
with open('video_features_by_video_id_all_splits.pkl', 'wb') as f:
    pickle.dump(features_by_video_id, f)

print(f"✅ Extracted features for {len(features_by_video_id)} unique video IDs across all splits.")


  fine_tuned_video_mae.load_state_dict(torch.load(


✅ Extracted features for 755 unique video IDs across all splits.


In [4]:
import pickle

# Load the pickle file
with open('pooled_video_features_by_video_id.pkl', 'rb') as f:
    features_by_video_id = pickle.load(f)

# Inspect one entry (for example, the first key)
for video_id, features in features_by_video_id.items():
    print(f"Video ID: {video_id}")
    print(f"Number of feature vectors: {len(features)}")
    print(f"Feature vector shape (first one): {features[0].shape}")
    print(f"First feature vector:\n{features[0]}")
      # Remove this if you want to inspect more entries


Video ID: 1tuMFJnlXJc
Number of feature vectors: 1068
Feature vector shape (first one): ()
First feature vector:
-0.060404568910598755
Video ID: CZigX1ntOsI
Number of feature vectors: 1068
Feature vector shape (first one): ()
First feature vector:
-0.0662238746881485
Video ID: G4HMKRdIva0
Number of feature vectors: 1068
Feature vector shape (first one): ()
First feature vector:
-0.06494846194982529
Video ID: e7_2U4lm6TE
Number of feature vectors: 1068
Feature vector shape (first one): ()
First feature vector:
-0.05840817466378212
Video ID: MOfEl_1dh-g
Number of feature vectors: 1068
Feature vector shape (first one): ()
First feature vector:
-0.0736188068985939
Video ID: xgAwddSkrOo
Number of feature vectors: 1068
Feature vector shape (first one): ()
First feature vector:
-0.06759286671876907
Video ID: tymso_pAxhk
Number of feature vectors: 1068
Feature vector shape (first one): ()
First feature vector:
-0.06665563583374023
Video ID: xUz5cCA-oTo
Number of feature vectors: 1068
Feature v

In [6]:
import pickle
import numpy as np

# Load the pickle file
with open('video_features_by_video_id_all_splits.pkl', 'rb') as f:
    features_by_video_id = pickle.load(f)

# Dictionary to store pooled features
pooled_features_by_video_id = {}

# Apply mean pooling across all vectors for each video_id
for video_id, feature_list in features_by_video_id.items():
    feature_array = np.stack(feature_list)  # shape: (num_clips, 1068)
    pooled_feature = feature_array.mean(axis=0)  # shape: (1068,)
    pooled_features_by_video_id[video_id] = pooled_feature

# Optionally save the pooled results to a new pickle file
with open('pooled_video_features_by_video_id.pkl', 'wb') as f:
    pickle.dump(pooled_features_by_video_id, f)

print("Pooled features stored by video_id.")


Pooled features stored by video_id.


In [7]:
import pickle

# Load the pooled features pickle file
with open('pooled_video_features_by_video_id.pkl', 'rb') as f:
    pooled_features_by_video_id = pickle.load(f)

# Count and display
num_videos = len(pooled_features_by_video_id)
print(f"Total number of unique video IDs: {num_videos}\n")

# Print feature vector for each video_id
for video_id, feature_vector in pooled_features_by_video_id.items():
    print(f"Video ID: {video_id}")
    print(f"Feature vector shape: {feature_vector.shape}")
    print(f"Feature vector (first 10 values): {feature_vector[:10]}\n")  # print first 10 values for brevity


Total number of unique video IDs: 755

Video ID: AmGocfFQfVE
Feature vector shape: (1068,)
Feature vector (first 10 values): [-0.28054422  0.23151323  0.41358086 -0.0352246  -0.25289178 -0.01532813
 -0.1506592  -0.1771845   0.19026688  0.48809192]

Video ID: gR4gM_WXesQ
Feature vector shape: (1068,)
Feature vector (first 10 values): [-0.1638539   0.22058095  0.39495203 -0.08049608 -0.23570003 -0.01421413
 -0.19233166 -0.10762765  0.12630367  0.496147  ]

Video ID: kZfcQ4a0kx4
Feature vector shape: (1068,)
Feature vector (first 10 values): [-0.14707781  0.20357898  0.43356997 -0.10799874 -0.27823618 -0.00389364
 -0.16126564 -0.10338495  0.09596101  0.4381565 ]

Video ID: c_FJuhSte8Q
Feature vector shape: (1068,)
Feature vector (first 10 values): [-0.21931212  0.22391966  0.431094   -0.05899051 -0.25376338 -0.05708244
 -0.18777046 -0.14045465  0.10794505  0.46477297]

Video ID: XzVapdEr_GY
Feature vector shape: (1068,)
Feature vector (first 10 values): [-0.13683228  0.17578666  0.3638914