In [1]:
!pip install moviepy librosa --quiet


In [1]:
import os
import math
import tempfile

import numpy as np
import pandas as pd

import cv2
from moviepy.editor import VideoFileClip

import librosa

import torch
import torch.nn as nn
from torchvision import models, transforms

# Kaggle dataset locations: input videos, the annotation table, and the
# writable directory where results are stored.
VIDEO_DIR = "/kaggle/input/engagementvids"
META_PATH = "/kaggle/input/metadata/metadata.csv.txt"
OUTPUT_DIR = "/kaggle/working"

# Quick sanity check that both input datasets are mounted: list their contents.
for listing_dir in (VIDEO_DIR, "/kaggle/input/metadata"):
    print(os.listdir(listing_dir))


error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evalu

['sub01.mp4', 'sub04.mp4', 'sub06.mp4', 'sub02.mp4', 'sub03.mp4', 'sub05.mp4']
['metadata.csv.txt']


In [2]:
# Load the per-segment annotation table.
df = pd.read_csv(META_PATH)

print("Raw metadata:")
display(df)

# Text columns: cast to str and trim surrounding whitespace.
for col in ("subject", "file", "segment"):
    df[col] = df[col].astype(str).str.strip()

# File names may contain stray inner spaces ('sub01.m p4'); remove them all.
df["file"] = df["file"].str.replace(" ", "", regex=False)

# Numeric columns: anything that cannot be parsed becomes NaN ...
numeric_cols = ["start_s", "end_s", "self_report"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# ... and rows with a NaN in any of them are discarded.
df = df.dropna(subset=numeric_cols).reset_index(drop=True)

print("\nCleaned metadata:")
display(df)


Raw metadata:


Unnamed: 0,subject,file,segment,start_s,end_s,self_report
0,sub01,sub01.mp4,high,3,31,5
1,sub01,sub01.mp4,low,33,82,2
2,sub01,sub01.mp4,talk,82,107,3
3,sub01,sub01.mp4,talk,107,146,4
4,sub01,sub01.mp4,read,146,180,2
5,sub01,sub01.mp4,idle,180,211,1
6,sub02,sub02.mp4,high,2,48,5
7,sub02,sub02.mp4,mid,48,63,3
8,sub02,sub02.mp4,high,63,88,4
9,sub02,sub02.mp4,read,88,113,3



Cleaned metadata:


Unnamed: 0,subject,file,segment,start_s,end_s,self_report
0,sub01,sub01.mp4,high,3,31,5
1,sub01,sub01.mp4,low,33,82,2
2,sub01,sub01.mp4,talk,82,107,3
3,sub01,sub01.mp4,talk,107,146,4
4,sub01,sub01.mp4,read,146,180,2
5,sub01,sub01.mp4,idle,180,211,1
6,sub02,sub02.mp4,high,2,48,5
7,sub02,sub02.mp4,mid,48,63,3
8,sub02,sub02.mp4,high,63,88,4
9,sub02,sub02.mp4,read,88,113,3


In [3]:
def score_to_engagement(score):
    """
    Map a 1-5 self-report score to a 3-level engagement class.

    Mapping: 1, 2 -> "low"; 3 -> "mid"; 4, 5 -> "high".
    Integer-valued floats (e.g. 4.0) are accepted because they compare
    equal to the corresponding int.

    Raises:
        ValueError: if `score` is not one of 1..5 (this includes NaN, None,
            fractional values and out-of-scale values). Previously such
            values were silently labelled "high", which corrupts the labels.
    """
    if score in (1, 2):
        return "low"
    if score == 3:
        return "mid"
    if score in (4, 5):
        return "high"
    raise ValueError(
        f"self_report score must be an integer from 1 to 5, got {score!r}"
    )

# Derive the 3-level label from each segment's self-report score.
df["engagement_class"] = df["self_report"].apply(score_to_engagement)

# Segment length in seconds (end minus start).
df["duration"] = df["end_s"].sub(df["start_s"])

summary_cols = [
    "subject", "file", "segment",
    "start_s", "end_s",
    "self_report", "engagement_class", "duration",
]
print(df[summary_cols])


   subject       file segment  start_s  end_s  self_report engagement_class  \
0    sub01  sub01.mp4    high        3     31            5             high   
1    sub01  sub01.mp4     low       33     82            2              low   
2    sub01  sub01.mp4    talk       82    107            3              mid   
3    sub01  sub01.mp4    talk      107    146            4             high   
4    sub01  sub01.mp4    read      146    180            2              low   
5    sub01  sub01.mp4    idle      180    211            1              low   
6    sub02  sub02.mp4    high        2     48            5             high   
7    sub02  sub02.mp4     mid       48     63            3              mid   
8    sub02  sub02.mp4    high       63     88            4             high   
9    sub02  sub02.mp4    read       88    113            3              mid   
10   sub02  sub02.mp4    talk      113    146            4             high   
11   sub02  sub02.mp4    idle      146    180       

In [4]:
# ========= AUDIO HELPERS ========= #

def extract_audio_segment(video_path, start_s, end_s, sr=22050):
    """
    Extract audio between start_s and end_s from video.

    The segment's audio is written to a temporary WAV file and read back
    with librosa, which resamples it to `sr`.

    Args:
        video_path: path to the video file.
        start_s: segment start, in seconds.
        end_s: segment end, in seconds.
        sr: target sample rate passed to librosa.load.

    Returns:
        (y, sr) as in librosa.load.

    Raises:
        ValueError: if the video has no audio track.
    """
    source = VideoFileClip(video_path)
    try:
        clip = source.subclip(start_s, end_s)
        if clip.audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track in video: {video_path}")

        # Use a temp directory rather than an open NamedTemporaryFile so the
        # WAV can be (re)opened by name on every platform; the directory and
        # its contents are removed when the block exits.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = os.path.join(tmp_dir, "segment.wav")
            clip.audio.write_audiofile(wav_path, verbose=False, logger=None)
            y, sr = librosa.load(wav_path, sr=sr)
    finally:
        # Always release the ffmpeg readers, even when extraction fails.
        # The subclip shares its readers with the source clip.
        source.close()

    return y, sr


def extract_audio_features(y, sr):
    """
    Compute summary audio features (MFCC, spectral, chroma).

    Layout of the returned vector (mean and std interleaved per feature):
      - 13 MFCCs                      -> 26 values
      - spectral centroid             ->  2 values
      - spectral bandwidth            ->  2 values
      - spectral rolloff              ->  2 values
      - zero-crossing rate            ->  2 values
      - 12 chroma bins                -> 24 values
    for a total of 58 values.

    Args:
        y: 1-D audio signal (as returned by librosa.load). May be empty.
        sr: sample rate of `y`.

    Returns:
        (features_array, feature_names_list). For an empty signal the array
        is all zeros but has the SAME length and names as for a real signal,
        so rows built from it stay aligned with every other row.
    """
    n_mfcc = 13
    n_chroma = 12

    # Build the names up front so the fallback and the normal path can
    # never disagree on the schema.
    names = []
    for i in range(n_mfcc):
        names += [f"mfcc_{i+1}_mean", f"mfcc_{i+1}_std"]
    for stem in ("spec_centroid", "spec_bw", "rolloff", "zcr"):
        names += [f"{stem}_mean", f"{stem}_std"]
    for i in range(n_chroma):
        names += [f"chroma_{i+1}_mean", f"chroma_{i+1}_std"]

    if y is None or len(y) == 0:
        # fallback: zeros with the regular feature layout
        return np.zeros(len(names), dtype=np.float32), names

    feats = []

    # 1. MFCCs (per-coefficient mean/std over time)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    for mean_val, std_val in zip(mfcc.mean(axis=1), mfcc.std(axis=1)):
        feats += [mean_val, std_val]

    # 2-5. Spectral centroid, bandwidth, rolloff and zero-crossing rate
    #      (global mean/std of each track), in the same order as `names`.
    tracks = (
        librosa.feature.spectral_centroid(y=y, sr=sr),
        librosa.feature.spectral_bandwidth(y=y, sr=sr),
        librosa.feature.spectral_rolloff(y=y, sr=sr),
        librosa.feature.zero_crossing_rate(y),
    )
    for track in tracks:
        feats += [track.mean(), track.std()]

    # 6. Chroma STFT (per-bin mean/std over time); n_chroma is passed
    #    explicitly so the number of bins matches the names built above.
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=n_chroma)
    for mean_val, std_val in zip(chroma.mean(axis=1), chroma.std(axis=1)):
        feats += [mean_val, std_val]

    return np.array(feats, dtype=np.float32), names


In [5]:
# ========= VIDEO FEATURE EXTRACTOR ========= #

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# ImageNet-pretrained ResNet18, switched to inference mode.
resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
resnet18.eval()

# Keep every child module except the last one (the classification head),
# so the network outputs 512-dim features.
backbone_layers = list(resnet18.children())[:-1]
feature_extractor = nn.Sequential(*backbone_layers)
feature_extractor.to(device)
feature_extractor.eval()

# Image preprocessing: array -> PIL -> 224x224 -> tensor -> per-channel
# normalisation with the mean/std values below.
preprocess = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)


Using device: cuda


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 181MB/s]


In [6]:
def extract_video_features(video_path, start_s, end_s, num_frames=16):
    """
    Sample frames between start_s and end_s, get ResNet18 features,
    and average them into one 512-d vector.

    Args:
        video_path: path to the video file.
        start_s: segment start, in seconds.
        end_s: segment end, in seconds.
        num_frames: number of evenly spaced frame positions to sample.

    Returns:
        (features_array, feature_names_list). A zero vector of length 512
        (named vfeat_0..vfeat_511) is returned when the video cannot be
        opened, reports no fps/frame count, the segment is empty, or no
        frame could be read.
    """
    fallback_dim = 512

    def _zero_features():
        # Single definition of the "nothing usable" result.
        return (
            np.zeros(fallback_dim, dtype=np.float32),
            [f"vfeat_{i}" for i in range(fallback_dim)],
        )

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Could not open video:", video_path)
            return _zero_features()

        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if fps <= 0 or total_frames <= 0:
            return _zero_features()

        # Convert the time window to frame indices, clamped to the video.
        first_frame = max(0, int(start_s * fps))
        last_frame = min(total_frames - 1, int(end_s * fps))
        if last_frame <= first_frame:
            return _zero_features()

        # Evenly spaced positions; np.unique drops duplicates that appear
        # when the segment has fewer frames than num_frames, so no frame is
        # decoded twice or over-weighted in the average.
        indices = np.unique(
            np.linspace(first_frame, last_frame, num_frames).astype(int)
        )

        frame_feats = []
        with torch.no_grad():
            for frame_idx in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ok, frame = cap.read()
                if not ok:
                    # Unreadable frame: skip it and average the rest.
                    continue

                # OpenCV decodes to BGR; convert to RGB before preprocessing.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                inp = preprocess(frame_rgb).unsqueeze(0).to(device)

                feat = feature_extractor(inp)
                feat = feat.view(feat.size(0), -1)  # (1, 512)
                frame_feats.append(feat.cpu().numpy()[0])
    finally:
        # Release the capture on every path, including exceptions raised
        # while decoding or running the network.
        cap.release()

    if len(frame_feats) == 0:
        return _zero_features()

    segment_feat = np.stack(frame_feats, axis=0).mean(axis=0)

    names = [f"vfeat_{i}" for i in range(len(segment_feat))]
    return segment_feat.astype(np.float32), names


In [7]:
# ========= BUILD THE PER-SEGMENT FEATURE TABLE ========= #
# One output row per metadata row: identifying columns, then duration,
# audio features, video features and a one-hot of the segment type.
all_rows = []
feature_names_audio = None
feature_names_video = None

segment_types = ["high", "mid", "low", "read", "talk", "idle"]
segment_one_hot_names = [f"segment_{s}" for s in segment_types]

# Identifying / label columns (everything else in features_df is a feature).
meta_cols = ["subject", "file", "segment", "start_s", "end_s",
             "self_report", "engagement_class"]

for idx, row in df.iterrows():
    subject = row["subject"]
    file_name = row["file"]
    segment_type = row["segment"]
    start_s = float(row["start_s"])
    end_s = float(row["end_s"])
    score = int(row["self_report"])
    engagement_class = row["engagement_class"]
    duration = row["duration"]

    video_path = os.path.join(VIDEO_DIR, file_name)

    print(f"[{idx+1}/{len(df)}] {subject} {file_name} {segment_type} {start_s}-{end_s}")

    # ---- AUDIO FEATURES ---- #
    try:
        y, sr = extract_audio_segment(video_path, start_s, end_s)
        audio_feats, audio_names = extract_audio_features(y, sr)
    except Exception as e:
        print("Audio error:", e)
        # Record no audio values for this row; they are zero-filled after
        # the loop under the real audio column names. A fixed-size
        # placeholder here could have a different length than the real
        # feature vector and shift every later value into the wrong column.
        audio_feats, audio_names = np.zeros(0, dtype=np.float32), []

    if feature_names_audio is None and len(audio_names) > 0:
        feature_names_audio = audio_names

    # ---- VIDEO FEATURES ---- #
    try:
        video_feats, video_names = extract_video_features(video_path, start_s, end_s, num_frames=16)
    except Exception as e:
        print("Video error:", e)
        video_feats = np.zeros(512, dtype=np.float32)
        video_names = [f"vfeat_{i}" for i in range(len(video_feats))]

    if feature_names_video is None:
        feature_names_video = video_names

    # ---- SEGMENT TYPE ONE-HOT ---- #
    segment_one_hot = np.array(
        [1 if segment_type == s else 0 for s in segment_types],
        dtype=np.float32,
    )

    row_dict = {
        "subject": subject,
        "file": file_name,
        "segment": segment_type,
        "start_s": start_s,
        "end_s": end_s,
        "self_report": score,
        "engagement_class": engagement_class
    }

    # Key every value by ITS OWN name (not by position against the first
    # row's column list), so a row with a different feature layout can never
    # be written under the wrong column names.
    row_dict["duration"] = np.float32(duration)
    row_dict.update(zip(audio_names, audio_feats))
    row_dict.update(zip(video_names, video_feats))
    row_dict.update(zip(segment_one_hot_names, segment_one_hot))

    all_rows.append(row_dict)

features_df = pd.DataFrame(all_rows)

# All non-metadata columns, in the order they were first produced.
feature_cols = [c for c in features_df.columns if c not in meta_cols]

# Rows whose extraction failed have gaps; keep the zero-fill convention for
# failures but say so, instead of hiding it.
n_missing = int(features_df[feature_cols].isna().sum().sum())
if n_missing:
    print(f"Warning: zero-filling {n_missing} feature values from segments whose extraction failed")
    features_df[feature_cols] = features_df[feature_cols].fillna(0.0)

print("Features dataframe shape:", features_df.shape)
display(features_df.head())


[1/34] sub01 sub01.mp4 high 3.0-31.0
[2/34] sub01 sub01.mp4 low 33.0-82.0
[3/34] sub01 sub01.mp4 talk 82.0-107.0
[4/34] sub01 sub01.mp4 talk 107.0-146.0
[5/34] sub01 sub01.mp4 read 146.0-180.0
[6/34] sub01 sub01.mp4 idle 180.0-211.0
[7/34] sub02 sub02.mp4 high 2.0-48.0
[8/34] sub02 sub02.mp4 mid 48.0-63.0
[9/34] sub02 sub02.mp4 high 63.0-88.0
[10/34] sub02 sub02.mp4 read 88.0-113.0
[11/34] sub02 sub02.mp4 talk 113.0-146.0
[12/34] sub02 sub02.mp4 idle 146.0-180.0
[13/34] sub03 sub03.mp4 high 3.0-39.0
[14/34] sub03 sub03.mp4 high 39.0-73.0
[15/34] sub03 sub03.mp4 read 73.0-108.0
[16/34] sub03 sub03.mp4 talk 108.0-155.0
[17/34] sub03 sub03.mp4 idle 155.0-182.0
[18/34] sub04 sub04.mp4 mid 3.0-25.0
[19/34] sub04 sub04.mp4 low 25.0-52.0
[20/34] sub04 sub04.mp4 high 52.0-64.0
[21/34] sub04 sub04.mp4 low 64.0-94.0
[22/34] sub04 sub04.mp4 read 94.0-124.0
[23/34] sub04 sub04.mp4 talk 124.0-162.0
[24/34] sub04 sub04.mp4 idle 162.0-188.0
[25/34] sub05 sub05.mp4 high 2.0-53.0
[26/34] sub05 sub05.mp

Unnamed: 0,subject,file,segment,start_s,end_s,self_report,engagement_class,duration,mfcc_1_mean,mfcc_1_std,...,vfeat_508,vfeat_509,vfeat_510,vfeat_511,segment_high,segment_mid,segment_low,segment_read,segment_talk,segment_idle
0,sub01,sub01.mp4,high,3.0,31.0,5,high,28.0,-330.282166,38.383305,...,0.166632,0.802956,0.132247,0.196261,1.0,0.0,0.0,0.0,0.0,0.0
1,sub01,sub01.mp4,low,33.0,82.0,2,low,49.0,-377.832336,44.851612,...,0.210868,0.912421,0.103162,0.284469,0.0,0.0,1.0,0.0,0.0,0.0
2,sub01,sub01.mp4,talk,82.0,107.0,3,mid,25.0,-379.541992,44.595966,...,0.134236,0.938361,0.154042,0.236088,0.0,0.0,0.0,0.0,1.0,0.0
3,sub01,sub01.mp4,talk,107.0,146.0,4,high,39.0,-383.676666,37.099281,...,0.141579,1.036656,0.223603,0.341036,0.0,0.0,0.0,0.0,1.0,0.0
4,sub01,sub01.mp4,read,146.0,180.0,2,low,34.0,-378.465607,40.46032,...,0.127626,0.803469,0.11245,0.308088,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# Write the feature table to the output directory, without the row index.
features_csv_path = os.path.join(OUTPUT_DIR, "featuresN.csv")
features_df.to_csv(
    features_csv_path,
    index=False,
)

print("Saved features to:", features_csv_path)


Saved features to: /kaggle/working/featuresN.csv
