In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before
# each cell executes, so edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

# Root folder for all data files, resolved relative to the working directory.
data_dir = Path("../data")

# Input: folder that is scanned for the raw files.
raw_dir = data_dir.joinpath("raw")
# Output: pickle file the resulting DataFrame is written to.
output_path = data_dir.joinpath("file_features.pkl")

In [None]:
import logging
import pandas as pd
import numpy as np

# Configure the root logger so that records of level INFO and above are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
from paddel import settings

### Load video paths

In [None]:
# One row per entry found directly inside raw_dir.
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so
# the entries are sorted to keep the row order (and therefore the index
# labels) reproducible between runs and machines.
df = pd.DataFrame(sorted(raw_dir.iterdir()), columns=["video_path"])

### Drop non-video files

In [None]:
from paddel.preprocessing.video import is_video

# Boolean mask, True for rows whose path is_video() accepts.
# Series.map calls is_video exactly once per path and also works on an empty
# frame; np.vectorize without `otypes` evaluates the first element twice to
# infer the output type and raises ValueError on size-0 input.
# astype(bool) guarantees a boolean dtype even when the Series is empty.
is_video_mask = df["video_path"].map(is_video).astype(bool)

# Labels of the rows to keep (name preserved in case other cells use it).
indexes_to_keep = df.index[is_video_mask.to_numpy()]

# Note: the surviving rows keep their original index labels.
df = df.loc[indexes_to_keep]

### Filename features

In [None]:
from paddel.preprocessing import extract_filename_features

# One feature record per video, in the same order as the rows of df.
# A plain list is used so that from_records() never does label-based lookups
# on a Series whose index has gaps.
records = [extract_filename_features(path) for path in df["video_path"]]

# from_records() labels its rows 0..n-1, whereas df still carries the labels
# of the rows that survived the video filter. pd.concat(axis=1) aligns on
# labels, so the features are re-labelled with df's index first; otherwise
# they would be attached to the wrong videos and NaN rows would appear.
filename_features = pd.DataFrame.from_records(records)
filename_features.index = df.index

df = pd.concat([df, filename_features], axis=1)

# Discard rows where any of these features equals -1
# (presumably the extractor's marker for a value it could not determine;
# confirm against extract_filename_features).
has_invalid_feature = (df[["group", "hand", "handedness"]] == -1).any(axis=1)

# .copy() so that later column assignments operate on an independent frame.
df = df.loc[~has_invalid_feature].copy()

### Video framerate

In [None]:
from paddel.preprocessing.video import extract_video_framerate

# Apply extract_video_framerate to every path and store the result per row.
video_paths = df["video_path"]
df["framerate"] = video_paths.apply(extract_video_framerate)

### Landmarks

This step can take a while, depending on the hardware used.

In [None]:
from paddel.preprocessing import extract_landmarks

# Element-wise wrapper; otypes="O" keeps each result as a Python object.
extract_all_landmarks = np.vectorize(extract_landmarks, otypes="O")
df["landmarks"] = extract_all_landmarks(df["video_path"])

# Length of each landmarks entry, converted to seconds via the framerate.
landmark_counts = df["landmarks"].apply(len)
df["landmark_count"] = landmark_counts
df["detection_time"] = df["landmark_count"] / df["framerate"]

# Remove rows whose detection time is below the configured minimum.
min_seconds = settings.preprocessing.min_detection_seconds
too_short = df["detection_time"] < min_seconds
df.drop(df[too_short].index, inplace=True)

### Save DataFrame

In [None]:
# Serialize the final DataFrame to output_path in pickle format.
# Pickle files should only be loaded back from trusted sources.
df.to_pickle(output_path)