%% https://pytorch.org/get-started/previous-versions/#v212
pytorch==2.1.2 cpu

pip install -U openmim
mim install mmengine
pip install "mmcv==2.1.0"
mim install "mmdet==3.2.0"
mim install "mmpose==1.3.2"

In [17]:
import json
import os
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import HTML, display, Video, JSON
from mmpose.apis import MMPoseInferencer

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

import joblib  # or pickle if you prefer

from HandPoseFeatureGenerator import HandPoseFeatureGenerator


# Step 1: Video Input

For that I am using a subset of the "Moments in Time Dataset" (http://moments.csail.mit.edu/)
We are looking at the classes: "cycling", "running", "eating" and "drinking".

In [14]:
Video("data/videos/drinking/yt--j3v4BGjotM_96.mp4", embed=True, html_attributes="controls loop")

In [15]:
Video("data/videos/eating/yt-kT7fE7RY54g_140.mp4", embed=True, html_attributes="controls loop")

# Step 2: Hand Pose Detection

In [4]:
# for file in folder
video_folder = "data/videos"
inferencer = MMPoseInferencer('hand')


def process_folder(folder_path, class_name):
    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".mp4"):
            video_path = os.path.join(folder_path, filename)

            result_generator = inferencer(video_path, out_dir=f'data/results_hands/{class_name}')
            _ = [result for result in result_generator]

        else:
            continue


process_folder(f"{video_folder}/cycling", "cycling")
process_folder(f"{video_folder}/drinking", "drinking")
process_folder(f"{video_folder}/eating", "eating")
process_folder(f"{video_folder}/running", "running")

Loads checkpoint by http backend from path: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-hand5_pt-aic-coco_210e-256x256-74fb594_20230320.pth
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmdet_nano_8xb32-300e_hand-267f9c8f.pth




  0%|          | 0/125 [00:00<?, ?it/s]

07/30 09:53:24 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T23a4hKJJGg_312.mp4
07/30 09:53:34 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-t2PfvQjK0oE_35.mp4
07/30 09:53:44 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T3-DFkhQENk_12.mp4
07/30 09:53:55 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-t3ivWNbEC1w_67.mp4
07/30 09:54:00 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T3tD4lxC3Fw_10.mp4
07/30 09:54:12 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T4D-rPtUMjI_74.mp4
07/30 09:54:23 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visuali

  0%|          | 0/125 [00:00<?, ?it/s]

07/30 10:15:40 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--h-AQuIda5I_67.mp4
07/30 10:15:51 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--j3v4BGjotM_205.mp4
07/30 10:16:03 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--j3v4BGjotM_96.mp4
07/30 10:16:13 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--n9tx4fXzRY_12.mp4
07/30 10:16:23 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--n9tx4fXzRY_15.mp4
07/30 10:16:34 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--qro6BdiBC8_4.mp4
07/30 10:16:43 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/v

  0%|          | 0/125 [00:00<?, ?it/s]

07/30 10:38:50 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-KHhKroulAwo_62.mp4
07/30 10:39:01 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-khJqtSnBQRw_13.mp4
07/30 10:39:11 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-khJqtSnBQRw_33.mp4
07/30 10:39:24 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-kjNU-dbG3ek_779.mp4
07/30 10:39:38 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-koFOIvG6xDI_746.mp4
07/30 10:39:53 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-kT7fE7RY54g_140.mp4
07/30 10:40:06 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizatio

  0%|          | 0/125 [00:00<?, ?it/s]

07/30 11:03:25 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt--mWS0O4HGuw_129.mp4
07/30 11:03:39 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt--mWS0O4HGuw_263.mp4
07/30 11:03:51 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-m-P8ge77FY4_296.mp4
07/30 11:04:03 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-McD6_oOWs-M_1555.mp4
07/30 11:04:15 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-mcWc1_mbzHE_881.mp4
07/30 11:04:26 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-mcWc1_mbzHE_903.mp4
07/30 11:04:37 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/v

In [None]:
Video("data/results_hands/drinking/visualizations/yt--j3v4BGjotM_96.mp4", embed=True, html_attributes="controls loop")

In [18]:
JSON("data/results_hands/drinking/predictions/yt--j3v4BGjotM_96.json")

<IPython.core.display.JSON object>

In [None]:
Video("data/results_hands/eating/visualizations/yt-kT7fE7RY54g_140.mp4", embed=True, html_attributes="controls loop")


In [19]:
JSON("data/results_hands/eating/predictions/yt-kT7fE7RY54g_140.json")

<IPython.core.display.JSON object>

# Step 3: Feature Encoding

In [3]:
extractor = HandPoseFeatureGenerator()

video_pred_list = []
label_list = []
#for class_name in ["cycling", "drinking", "eating", "running"]:
for class_name in ["drinking", "eating"]:
    class_folder = f"data/results_hands/{class_name}/predictions"
    for filename in tqdm(os.listdir(class_folder)):
        if filename.endswith(".json"):
            video_path = os.path.join(class_folder, filename)
            video_pred_list.append(video_path)
            label_list.append(class_name)

df = extractor.create_feature_dataframe(video_pred_list, labels=label_list)
df = df.fillna(0)

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Original frames: 21395
Frames after dropping empty instances: 8948
Dropped 12447 empty frames


In [4]:
html_str = df.head().to_html()  # raw HTML as a string

display(HTML(html_str))

Unnamed: 0,thumb_extension,index_extension,middle_extension,ring_extension,pinky_extension,fingers_extended_count,avg_finger_extension,pinch_distance,is_pinching,frame_id,video_path,video_filename,label
83,0.120726,0.086208,0.044016,0.069763,0.09566,0.0,0.083275,26.49018,1.0,83,data/results_hands/drinking/predictions\yt--h-AQuIda5I_67.json,predictions\yt--h-AQuIda5I_67.json,drinking
86,0.125407,0.068744,0.025455,0.076375,0.087199,0.0,0.076636,25.030371,1.0,86,data/results_hands/drinking/predictions\yt--h-AQuIda5I_67.json,predictions\yt--h-AQuIda5I_67.json,drinking
87,0.113716,0.172928,0.0,0.0,0.056301,0.0,0.068589,0.0,0.0,87,data/results_hands/drinking/predictions\yt--h-AQuIda5I_67.json,predictions\yt--h-AQuIda5I_67.json,drinking
90,0.819596,0.558644,0.597635,0.511396,0.382543,4.0,0.573963,67.097155,0.0,0,data/results_hands/drinking/predictions\yt--j3v4BGjotM_205.json,predictions\yt--j3v4BGjotM_205.json,drinking
91,0.762597,0.0,0.0,0.419247,0.0,1.0,0.236369,0.0,0.0,1,data/results_hands/drinking/predictions\yt--j3v4BGjotM_205.json,predictions\yt--j3v4BGjotM_205.json,drinking


# Step 4: ML Model Training

In [8]:
X = df.drop(columns=["label"])
y = df["label"]

# Drop non‑informative IDs:
X = X.drop(columns=[c for c in X.columns if c.lower().endswith(("frame_id", "video_path", "video_filename"))],
           errors="ignore")

# Drop rows with NaNs
na_mask = X.notna().all(axis=1)  # True for rows with *no* NaNs
X = X[na_mask].copy()
y = y[na_mask].copy()

# Train / Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build a pipeline:  StandardScaler ➜ linear SVM

clf = make_pipeline(
    StandardScaler(),
    MLPClassifier(max_iter=500, early_stopping=True)
)

clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred).round(3))
print(classification_report(y_test, y_pred))

joblib.dump(clf, "data/models/feedforward_model.joblib")


Accuracy: 0.636
              precision    recall  f1-score   support

    drinking       0.62      0.75      0.68       916
      eating       0.67      0.51      0.58       874

    accuracy                           0.64      1790
   macro avg       0.64      0.63      0.63      1790
weighted avg       0.64      0.64      0.63      1790



['data/models/feedforward_model.joblib']

In [11]:
result = permutation_importance(
    clf, X_test, y_test,
    scoring="accuracy",  # global accuracy
    n_repeats=15,
    random_state=42,
)

imp = pd.Series(result.importances_mean, index=X.columns)
print(imp.sort_values(ascending=False).head(20))

is_pinching               0.065587
thumb_extension           0.062495
index_extension           0.050987
fingers_extended_count    0.048827
middle_extension          0.045102
ring_extension            0.035084
pinch_distance            0.033929
avg_finger_extension      0.033259
pinky_extension           0.018808
dtype: float64


# Step 5: Gesture Prediction

See SimpleLiveActionClassifier.py