%% https://pytorch.org/get-started/previous-versions/#v212
pytorch==2.1.2 cpu

pip install -U openmim
mim install mmengine
pip install "mmcv==2.1.0"
mim install "mmdet==3.2.0"
mim install "mmpose==1.3.2"

In [1]:
import os
from tqdm.notebook import tqdm
from mmpose.apis import MMPoseInferencer

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib   # or pickle if you prefer

from HandPoseFeatureGenerator import HandPoseFeatureGenerator


# Step 1: Video Input

For that I am using a subset of the "Moments in Time Dataset" (http://moments.csail.mit.edu/)
We are looking at the classes: "cycling", "running", "eating" and "drinking".

# Step 2: Hand Pose Detection

In [4]:
# for file in folder
video_folder = "data/videos"
inferencer = MMPoseInferencer('hand')

def process_folder(folder_path, class_name):
    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".mp4"):
            video_path = os.path.join(folder_path, filename)

            #result_generator = inferencer(video_path, pred_out_dir=f'data/results/predictions/{class_name}')
            result_generator = inferencer(video_path, out_dir=f'data/results_hands/{class_name}')
            _ = [result for result in result_generator]

        else:
            continue

process_folder(f"{video_folder}/cycling", "cycling")
process_folder(f"{video_folder}/drinking", "drinking")
process_folder(f"{video_folder}/eating", "eating")
process_folder(f"{video_folder}/running", "running")

Loads checkpoint by http backend from path: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-hand5_pt-aic-coco_210e-256x256-74fb594_20230320.pth
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmdet_nano_8xb32-300e_hand-267f9c8f.pth




  0%|          | 0/125 [00:00<?, ?it/s]

07/30 09:53:24 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T23a4hKJJGg_312.mp4
07/30 09:53:34 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-t2PfvQjK0oE_35.mp4
07/30 09:53:44 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T3-DFkhQENk_12.mp4
07/30 09:53:55 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-t3ivWNbEC1w_67.mp4
07/30 09:54:00 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T3tD4lxC3Fw_10.mp4
07/30 09:54:12 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visualizations\yt-T4D-rPtUMjI_74.mp4
07/30 09:54:23 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/cycling/visuali

  0%|          | 0/125 [00:00<?, ?it/s]

07/30 10:15:40 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--h-AQuIda5I_67.mp4
07/30 10:15:51 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--j3v4BGjotM_205.mp4
07/30 10:16:03 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--j3v4BGjotM_96.mp4
07/30 10:16:13 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--n9tx4fXzRY_12.mp4
07/30 10:16:23 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--n9tx4fXzRY_15.mp4
07/30 10:16:34 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/visualizations\yt--qro6BdiBC8_4.mp4
07/30 10:16:43 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/drinking/v

  0%|          | 0/125 [00:00<?, ?it/s]

07/30 10:38:50 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-KHhKroulAwo_62.mp4
07/30 10:39:01 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-khJqtSnBQRw_13.mp4
07/30 10:39:11 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-khJqtSnBQRw_33.mp4
07/30 10:39:24 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-kjNU-dbG3ek_779.mp4
07/30 10:39:38 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-koFOIvG6xDI_746.mp4
07/30 10:39:53 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizations\yt-kT7fE7RY54g_140.mp4
07/30 10:40:06 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/eating/visualizatio

  0%|          | 0/125 [00:00<?, ?it/s]

07/30 11:03:25 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt--mWS0O4HGuw_129.mp4
07/30 11:03:39 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt--mWS0O4HGuw_263.mp4
07/30 11:03:51 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-m-P8ge77FY4_296.mp4
07/30 11:04:03 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-McD6_oOWs-M_1555.mp4
07/30 11:04:15 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-mcWc1_mbzHE_881.mp4
07/30 11:04:26 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/visualizations\yt-mcWc1_mbzHE_903.mp4
07/30 11:04:37 - mmengine - [4m[97mINFO[0m - the output video has been saved at data/results_hands/running/v

# Step 3: Feature Encoding

In [2]:
extractor = HandPoseFeatureGenerator()

video_pred_list = []
label_list = []
#for class_name in ["cycling", "drinking", "eating", "running"]:
for class_name in ["drinking", "eating"]:
    class_folder = f"data/results_hands/{class_name}/predictions"
    for filename in tqdm(os.listdir(class_folder)):
        if filename.endswith(".json"):
            video_path = os.path.join(class_folder, filename)
            video_pred_list.append(video_path)
            label_list.append(class_name)

df = extractor.create_feature_dataframe(video_pred_list, labels=label_list)
df = df.fillna(0)
print(df.head())

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'frame_id': 0, 'instances': [{'keypoints': [[400.546875, 334.6875], [390.0, 333.515625], [387.65625, 337.03125], [385.3125, 349.921875], [387.65625, 342.890625], [400.546875, 342.890625], [399.375, 349.921875], [399.375, 349.921875], [399.375, 349.921875], [409.921875, 342.890625], [400.546875, 349.921875], [407.578125, 347.578125], [400.546875, 349.921875], [411.09375, 345.234375], [409.921875, 349.921875], [409.921875, 349.921875], [397.03125, 353.4375], [407.578125, 349.921875], [407.578125, 353.4375], [409.921875, 349.921875], [401.71875, 355.78125]], 'keypoint_scores': [0.4644227921962738, 0.43923962116241455, 0.41723084449768066, 0.40075793862342834, 0.4110822379589081, 0.41515976190567017, 0.4035833477973938, 0.3822660446166992, 0.38808387517929077, 0.44911110401153564, 0.38341355323791504, 0.38389405608177185, 0.40536317229270935, 0.4556429386138916, 0.4153382182121277, 0.4077465534210205, 0.41827860474586487, 0.40538230538368225, 0.40953385829925537, 0.40242236852645874, 0.41

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



{'frame_id': 0, 'instances': [{'keypoints': [[427.5, 355.78125], [423.984375, 348.75], [422.8125, 344.0625], [425.15625, 340.546875], [415.78125, 333.515625], [432.1875, 341.71875], [429.84375, 338.203125], [427.5, 334.6875], [418.125, 334.6875], [435.703125, 342.890625], [434.53125, 338.203125], [426.328125, 333.515625], [351.328125, 333.515625], [438.046875, 345.234375], [434.53125, 342.890625], [422.8125, 337.03125], [419.296875, 342.890625], [436.875, 351.09375], [433.359375, 346.40625], [432.1875, 345.234375], [429.84375, 342.890625]], 'keypoint_scores': [0.5500683188438416, 0.5254911184310913, 0.45243704319000244, 0.3940100073814392, 0.35640156269073486, 0.496803879737854, 0.4737575054168701, 0.3688969612121582, 0.28410324454307556, 0.4610479474067688, 0.39013826847076416, 0.2798492908477783, 0.1770155131816864, 0.43303531408309937, 0.343168169260025, 0.2690073251724243, 0.2005230337381363, 0.393959105014801, 0.3270178437232971, 0.2544415295124054, 0.18038463592529297], 'bbox': [

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Original frames: 21395
Frames after dropping empty instances: 8948
Dropped 12447 empty frames
    thumb_extension  index_extension  middle_extension  ring_extension  \
83         0.120726         0.086208          0.044016        0.069763   
86         0.125407         0.068744          0.025455        0.076375   
87         0.113716         0.172928          0.000000        0.000000   
90         0.819596         0.558644          0.597635        0.511396   
91         0.762597         0.000000          0.000000        0.419247   

    pinky_extension  fingers_extended_count  avg_finger_extension  \
83         0.095660                     0.0              0.083275   
86         0.087199                     0.0              0.076636   
87         0.056301                     0.0              0.068589   
90         0.382543                     4.0              0.573963   
91         0.000000                     1.0              0.236369   

    pinch_distance  is_pinching  frame_id  \
8

# Step 4: ML Model Training

In [3]:
X = df.drop(columns=["label"])
y = df["label"]

# Drop non‑informative IDs:
X = X.drop(columns=[c for c in X.columns if c.lower().endswith(("frame_id", "video_path", "video_filename"))], errors="ignore")

# Drop rows with NaNs
na_mask = X.notna().all(axis=1)     # True for rows with *no* NaNs
X = X[na_mask].copy()
y = y[na_mask].copy()


# Train / Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Build a pipeline:  StandardScaler ➜ linear SVM

clf = make_pipeline(
    StandardScaler(),
    MLPClassifier(max_iter=500, early_stopping=True)
)

clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred).round(3))
print(classification_report(y_test, y_pred))

joblib.dump(clf, "feedforward_model.joblib")


Accuracy: 0.629
              precision    recall  f1-score   support

    drinking       0.61      0.75      0.67       916
      eating       0.66      0.50      0.57       874

    accuracy                           0.63      1790
   macro avg       0.64      0.63      0.62      1790
weighted avg       0.63      0.63      0.62      1790



['feedforward_model.joblib']

# Step 5: Gesture Prediction