# Feature Extraction: EgoVLP & PerceptionEncoder

This notebook extracts video features from CaptainCook4D videos using new backbones:
- **EgoVLP**: Egocentric Video-Language Pretraining (this notebook currently loads the CLIP ViT-B/32 vision encoder as a stand-in)
- **PerceptionEncoder**: Video transformer for perceptual understanding (this notebook currently loads VideoMAE-base as a stand-in)

## Workflow:
1. Mount Google Drive and extract videos
2. Set up feature extraction models
3. Extract features for all videos
4. Upload to HuggingFace for team access

## Prerequisites:
- Upload `captain_cook_4d_gopro_resized.zip` to Google Drive at `MyDrive/AML_Project/`
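- Set Colab secrets: `WANDB_API_KEY`, `HF_TOKEN` (`HF_TOKEN` is required by the upload step; `WANDB_API_KEY` is not read by any cell in this notebook)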
- Run all cells sequentially


## 1. Configuration


In [None]:
# Configuration - Update HF_DATASET_REPO with your repository name!
#
# Feature-stream switches: each flag gates the matching model-loading,
# extraction and upload cells further down.
EXTRACT_EGOVLP = True
EXTRACT_PERCEPTION = True
UPLOAD_TO_HF = True

# Destination dataset repository on the HuggingFace Hub.
HF_DATASET_REPO = "your-username/captaincook4d-features"  # UPDATE THIS!
# Input archive on Google Drive and the local directory it is unpacked into.
VIDEOS_ZIP_PATH = '/content/drive/MyDrive/AML_Project/captain_cook_4d_gopro_resized.zip'
VIDEOS_EXTRACT_PATH = '/content/videos'
# Root directory under which each backbone writes its .npy feature files.
OUTPUT_BASE = '/content/data/features'

# True while the repository name is still the template value. Checked here so
# the problem is visible before the (long) extraction runs, not after it.
HF_REPO_IS_PLACEHOLDER = HF_DATASET_REPO.startswith("your-username/")

print(f"Extract EgoVLP: {EXTRACT_EGOVLP}")
print(f"Extract PerceptionEncoder: {EXTRACT_PERCEPTION}")
print(f"HF Repository: {HF_DATASET_REPO}")

if UPLOAD_TO_HF and HF_REPO_IS_PLACEHOLDER:
    print("⚠ HF_DATASET_REPO still has the placeholder value - update it before the upload step")


## 2. Mount Google Drive


In [None]:
from google.colab import drive

# Mount point under which Google Drive becomes visible to this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


## 3. Extract Videos from Zip


In [None]:
import zipfile
import os
from pathlib import Path

# Extract videos
# Fail early with an actionable message if the archive is not where the
# configuration says it is (e.g. Drive not mounted or wrong folder).
if not Path(VIDEOS_ZIP_PATH).is_file():
    raise FileNotFoundError(
        f"Video archive not found at {VIDEOS_ZIP_PATH} - is Google Drive mounted and the zip uploaded?"
    )

print(f"Extracting videos from {VIDEOS_ZIP_PATH}...")
os.makedirs(VIDEOS_EXTRACT_PATH, exist_ok=True)

with zipfile.ZipFile(VIDEOS_ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(VIDEOS_EXTRACT_PATH)

# Count extracted videos.
# Archives created on macOS carry "__MACOSX/" folders and "._name.mp4"
# resource-fork stubs; they match *.mp4 but are not decodable videos, so they
# are excluded. Sorting makes the processing order reproducible across runs.
video_files = sorted(
    candidate
    for candidate in Path(VIDEOS_EXTRACT_PATH).rglob('*.mp4')
    if '__MACOSX' not in candidate.parts and not candidate.name.startswith('._')
)
print(f"Extracted {len(video_files)} video files")

# Verify expected count
assert len(video_files) == 384, f"Expected 384 videos, found {len(video_files)}"
print("✓ Video extraction verified")


## 4. Install Dependencies


In [None]:
# Install the packages used by the cells below into the kernel's environment.
# `%pip` (rather than `!pip`) targets the running kernel; `-q` keeps the output short.
# NOTE(review): none of these are version-pinned, so results may drift between
# runs as new releases appear - consider pinning once a working set is known.

# Deep-learning stack (torch is imported by the model-loading cells).
%pip install -q torch torchvision torchaudio
# Model hub libraries; transformers provides the CLIP and VideoMAE classes used below.
%pip install -q transformers timm
# Client used by the final upload cell.
%pip install -q huggingface_hub
# Provides the cv2 module used for frame decoding.
%pip install -q opencv-python-headless
# tqdm is used for progress bars; ftfy/regex are not imported directly by any visible cell.
%pip install -q ftfy regex tqdm
# Not imported directly by any visible cell.
%pip install -q einops

print("✓ Dependencies installed")


## 5. EgoVLP Feature Extraction


In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from transformers import CLIPProcessor, CLIPModel
import cv2
import numpy as np

# EgoVLP backbone setup. A CLIP ViT-B/32 checkpoint is loaded as a proxy for an
# egocentric video encoder; only its vision tower is kept for feature extraction.
if not EXTRACT_EGOVLP:
    print("Skipping EgoVLP extraction")
else:
    print("Loading EgoVLP model...")
    _clip_checkpoint = "openai/clip-vit-base-patch32"

    # Keep just the image branch of CLIP, switched to inference mode.
    egovlp_model = CLIPModel.from_pretrained(_clip_checkpoint).vision_model
    egovlp_processor = CLIPProcessor.from_pretrained(_clip_checkpoint)
    egovlp_model.eval()

    # Prefer the GPU when the runtime has one.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    egovlp_model = egovlp_model.to(device)
    print(f"✓ EgoVLP model loaded on {device}")


In [None]:
def extract_egovlp_features(video_path, model, processor, device):
    """Extract one feature vector per whole second of a video.

    For every full second of the video, the frame at the middle of that second
    is decoded, preprocessed with ``processor`` and passed through ``model``;
    the ``pooler_output`` of that forward pass is the feature for the segment.

    Args:
        video_path: Location of the video file (str or path-like).
        model: Vision model whose forward output exposes ``pooler_output``.
        processor: Preprocessor called as ``processor(images=..., return_tensors="pt")``.
        device: Torch device the preprocessed tensors are moved to.

    Returns:
        ``np.ndarray`` of shape ``(num_segments, feature_dim)``. Segments whose
        frame could not be decoded are all-zero rows with the same width and
        dtype as the real features. An empty array is returned when the video
        is shorter than one second or reports a non-positive frame rate.

    Raises:
        IOError: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(str(video_path))
    # None marks a segment whose frame could not be read; it is replaced by a
    # zero vector once the real feature width is known.
    features = []
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps > 0 else 0

        # Sample 1 frame per second
        num_segments = int(duration)

        for seg_idx in range(num_segments):
            # Seek to middle of segment
            frame_idx = int((seg_idx + 0.5) * fps)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()

            if not ret:
                features.append(None)
                continue

            # OpenCV decodes to BGR; the processor is given RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = processor(images=frame_rgb, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                # pooler_output has the encoder's hidden size (768 for the
                # CLIP ViT-B/32 vision tower), not the 512-dim projected
                # CLIP embedding - so the width is taken from the data.
                features.append(outputs.pooler_output.cpu().numpy()[0])
    finally:
        # Always free the decoder, including when preprocessing/inference raises.
        cap.release()

    if not features:
        return np.array(features)

    template = next((feat for feat in features if feat is not None), None)
    if template is None:
        # No frame decoded at all: fall back to the model's declared hidden
        # size when available (512 keeps the previous default otherwise).
        fallback_dim = getattr(getattr(model, "config", None), "hidden_size", 512)
        template = np.zeros(fallback_dim, dtype=np.float32)

    zero_feat = np.zeros_like(template)
    return np.stack([zero_feat if feat is None else feat for feat in features])

print("✓ EgoVLP extraction function defined")


In [None]:
if EXTRACT_EGOVLP:
    from tqdm.auto import tqdm

    # One .npy file per video is written below OUTPUT_BASE/egovlp.
    output_dir = Path(OUTPUT_BASE) / 'egovlp'
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Extracting EgoVLP features for {len(video_files)} videos...")

    failed_videos = []
    for video_path in tqdm(video_files, desc="EgoVLP extraction"):
        # Get video ID from path
        video_id = video_path.stem
        output_path = output_dir / f"{video_id}.npy"

        # Skip if already extracted (lets an interrupted run resume)
        if output_path.exists():
            continue

        try:
            features = extract_egovlp_features(video_path, egovlp_model, egovlp_processor, device)
            if features.size == 0:
                # Do not persist an empty array: the resume check above would
                # then treat the video as done forever.
                raise ValueError("no features extracted (unreadable or shorter than 1s)")

            # Write to a side file and rename, so a session that dies mid-write
            # never leaves a truncated .npy that the resume check would accept.
            tmp_path = output_path.with_name(output_path.name + '.part')
            with open(tmp_path, 'wb') as fh:
                np.save(fh, features)
            tmp_path.replace(output_path)
        except Exception as e:
            failed_videos.append(video_id)
            print(f"Error processing {video_id}: {e}")

    # Verify extraction
    extracted_files = sorted(output_dir.glob('*.npy'))
    print(f"✓ EgoVLP: Extracted features for {len(extracted_files)} videos")
    if failed_videos:
        print(f"  {len(failed_videos)} videos failed: {failed_videos}")

    # Load one file to check dimensions (guarded: there may be none)
    if extracted_files:
        sample_feat = np.load(extracted_files[0])
        print(f"  Feature shape example: {sample_feat.shape}")
    else:
        print("  No feature files found - nothing to inspect")
else:
    print("Skipping EgoVLP extraction")


## 6. PerceptionEncoder Feature Extraction


In [None]:
from transformers import VideoMAEModel, VideoMAEImageProcessor

# PerceptionEncoder backbone setup. A VideoMAE-base checkpoint serves as the
# transformer-based video encoder for this feature stream.
if not EXTRACT_PERCEPTION:
    print("Skipping PerceptionEncoder extraction")
else:
    print("Loading PerceptionEncoder model...")
    _videomae_checkpoint = "MCG-NJU/videomae-base"

    # Encoder in inference mode plus its matching frame preprocessor.
    perception_model = VideoMAEModel.from_pretrained(_videomae_checkpoint)
    perception_processor = VideoMAEImageProcessor.from_pretrained(_videomae_checkpoint)
    perception_model.eval()

    # Prefer the GPU when the runtime has one.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    perception_model = perception_model.to(device)
    print(f"✓ PerceptionEncoder model loaded on {device}")


In [None]:
def extract_perception_features(video_path, model, processor, device, num_frames=16):
    """Extract one clip-level feature vector per whole second of a video.

    Each full second of the video is treated as a segment. ``num_frames``
    frames are sampled at evenly spaced offsets within the segment, passed
    through ``processor`` and ``model``, and the model's ``last_hidden_state``
    is mean-pooled over its token axis (dim=1) to give the segment feature.

    Args:
        video_path: Location of the video file (str or path-like).
        model: Video model whose forward output exposes ``last_hidden_state``.
        processor: Preprocessor called as ``processor(frames, return_tensors="pt")``.
        device: Torch device the preprocessed tensors are moved to.
        num_frames: Number of frames sampled per one-second segment.

    Returns:
        ``np.ndarray`` with one row per segment. An empty array is returned
        when the video is shorter than one second or reports a non-positive
        frame rate.

    Raises:
        IOError: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(str(video_path))
    features = []
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps > 0 else 0

        # Sample 1-second segments
        num_segments = int(duration)

        for seg_idx in range(num_segments):
            # Collect the frames of this segment (OpenCV yields BGR; convert to RGB)
            segment_frames = []
            for i in range(num_frames):
                frame_idx = int((seg_idx + i / num_frames) * fps)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cap.read()

                if ret:
                    segment_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # The model is always fed exactly num_frames frames: unreadable
            # ones are replaced by black frames matching the decoded size
            # (224x224 when nothing in the segment could be decoded).
            missing = num_frames - len(segment_frames)
            if missing > 0:
                if segment_frames:
                    pad_frame = np.zeros_like(segment_frames[0])
                else:
                    pad_frame = np.zeros((224, 224, 3), dtype=np.uint8)
                segment_frames.extend([pad_frame] * missing)

            # Process frames
            inputs = processor(segment_frames, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Extract features
            with torch.no_grad():
                outputs = model(**inputs)
                # Mean-pool the token sequence into a single vector per segment
                feat = outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]
                features.append(feat)
    finally:
        # Always free the decoder, including when preprocessing/inference raises.
        cap.release()

    return np.array(features)

print("✓ PerceptionEncoder extraction function defined")


In [None]:
if EXTRACT_PERCEPTION:
    # Imported here as well: the only other import of tqdm lives inside the
    # EgoVLP cell's `if EXTRACT_EGOVLP:` branch, so with that flag off this
    # cell would otherwise fail with NameError.
    from tqdm.auto import tqdm

    # One .npy file per video is written below OUTPUT_BASE/perceptionencoder.
    output_dir = Path(OUTPUT_BASE) / 'perceptionencoder'
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Extracting PerceptionEncoder features for {len(video_files)} videos...")

    failed_videos = []
    for video_path in tqdm(video_files, desc="PerceptionEncoder extraction"):
        video_id = video_path.stem
        output_path = output_dir / f"{video_id}.npy"

        # Skip if already extracted (lets an interrupted run resume)
        if output_path.exists():
            continue

        try:
            features = extract_perception_features(video_path, perception_model, perception_processor, device)
            if features.size == 0:
                # Do not persist an empty array: the resume check above would
                # then treat the video as done forever.
                raise ValueError("no features extracted (unreadable or shorter than 1s)")

            # Write to a side file and rename, so a session that dies mid-write
            # never leaves a truncated .npy that the resume check would accept.
            tmp_path = output_path.with_name(output_path.name + '.part')
            with open(tmp_path, 'wb') as fh:
                np.save(fh, features)
            tmp_path.replace(output_path)
        except Exception as e:
            failed_videos.append(video_id)
            print(f"Error processing {video_id}: {e}")

    # Verify extraction
    extracted_files = sorted(output_dir.glob('*.npy'))
    print(f"✓ PerceptionEncoder: Extracted features for {len(extracted_files)} videos")
    if failed_videos:
        print(f"  {len(failed_videos)} videos failed: {failed_videos}")

    # Load one file to check dimensions (guarded: there may be none)
    if extracted_files:
        sample_feat = np.load(extracted_files[0])
        print(f"  Feature shape example: {sample_feat.shape}")
    else:
        print("  No feature files found - nothing to inspect")
else:
    print("Skipping PerceptionEncoder extraction")


## 7. Upload Features to HuggingFace


In [None]:
if UPLOAD_TO_HF:
    from huggingface_hub import HfApi, create_repo, login
    from google.colab import userdata

    # Refuse to run against the template repository name: the upload could
    # only fail (or target the wrong account) with it.
    if HF_DATASET_REPO.startswith("your-username/"):
        raise ValueError(
            f"HF_DATASET_REPO is still the placeholder '{HF_DATASET_REPO}' - "
            "set it to your own dataset repository in the configuration cell"
        )

    # Login to HuggingFace with the token stored in the Colab secrets panel
    hf_token = userdata.get('HF_TOKEN')
    if not hf_token:
        raise ValueError("Colab secret HF_TOKEN is empty - add a write-enabled HuggingFace token")
    login(token=hf_token)

    # Create repository if it doesn't exist. Kept best-effort: a failure here
    # is reported and the upload is still attempted against the repo.
    try:
        create_repo(repo_id=HF_DATASET_REPO, repo_type="dataset", exist_ok=True)
        print(f"✓ Repository {HF_DATASET_REPO} ready")
    except Exception as e:
        print(f"Note: {e}")

    api = HfApi()

    # (enabled flag, sub-folder under OUTPUT_BASE and in the repo, display name)
    upload_plan = [
        (EXTRACT_EGOVLP, 'egovlp', 'EgoVLP'),
        (EXTRACT_PERCEPTION, 'perceptionencoder', 'PerceptionEncoder'),
    ]

    uploaded = []
    for enabled, subdir, label in upload_plan:
        if not enabled:
            continue

        feature_dir = Path(OUTPUT_BASE) / subdir
        if not feature_dir.exists():
            continue
        if not any(feature_dir.glob('*.npy')):
            print(f"No {label} feature files in {feature_dir} - skipping upload")
            continue

        print(f"Uploading {label} features...")
        api.upload_folder(
            folder_path=str(feature_dir),
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            path_in_repo=f"{subdir}/"
        )
        print(f"✓ {label} features uploaded")
        uploaded.append(label)

    # Only claim success when something was actually pushed.
    if uploaded:
        print(f"\n✅ All features uploaded to: https://huggingface.co/datasets/{HF_DATASET_REPO}")
    else:
        print("\nNo features were uploaded (nothing enabled or no feature files found)")
else:
    print("Skipping HuggingFace upload")
