In [1]:
import cv2
import numpy as np
import mediapipe as mp
from mtcnn import MTCNN
import dlib
from scipy.signal import find_peaks

In [2]:
# возвращаем {"left_wrist": (x,y) или None, "right_wrist": (x,y) или None, "head_y": y или None}
def count_claps(landmarks, fps,
                dist_thresh_ratio=0.07,
                refractory_sec=0.25,
                above_head_margin=0.02):
    n = len(landmarks)
    if n == 0: return 0, []

    dist = []
    above_head = []
    for lm in landmarks:
        lw, rw, hy = lm["left_wrist"], lm["right_wrist"], lm["head_y"]
        if lw and rw:
            d = ((lw[0]-rw[0])**2 + (lw[1]-rw[1])**2)**0.5
            dist.append(d)
            if hy is not None and lw[1] < hy - above_head_margin and rw[1] < hy - above_head_margin:
                above_head.append(1)
            else:
                above_head.append(0)
        else:
            dist.append(np.nan)
            above_head.append(0)

    dist = np.array(dist)
    valid_idx = ~np.isnan(dist)
    if valid_idx.sum() < 5: return 0, []

    dist_interp = np.interp(np.arange(n), np.where(valid_idx)[0], dist[valid_idx])
    refractory_frames = int(refractory_sec * fps)

    inv = -dist_interp
    peaks, _ = find_peaks(inv, distance=refractory_frames, prominence=0.02)

    claps_frames = [p for p in peaks if dist_interp[p] < dist_thresh_ratio and above_head[p] == 1]

    return len(claps_frames), claps_frames

In [3]:
# MediaPipe
def mediapipe_landmarks(video_path):
    mp_pose = mp.solutions.pose
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    landmarks = []
    with mp_pose.Pose(model_complexity=1,
                      min_detection_confidence=0.5,
                      min_tracking_confidence=0.5) as pose:
        while True:
            ret, frame = cap.read()
            if not ret: break
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            res = pose.process(rgb)
            if res.pose_landmarks:
                lm = res.pose_landmarks.landmark
                landmarks.append({
                    "left_wrist": (lm[mp_pose.PoseLandmark.LEFT_WRIST].x, lm[mp_pose.PoseLandmark.LEFT_WRIST].y),
                    "right_wrist": (lm[mp_pose.PoseLandmark.RIGHT_WRIST].x, lm[mp_pose.PoseLandmark.RIGHT_WRIST].y),
                    "head_y": lm[mp_pose.PoseLandmark.NOSE].y
                })
            else:
                landmarks.append({"left_wrist":None,"right_wrist":None,"head_y":None})
    return landmarks, fps

In [5]:
# OpenCV (движение + лицо Haar)
def opencv_landmarks(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    fgbg = cv2.createBackgroundSubtractorMOG2()
    landmarks = []
    while True:
        ret, frame = cap.read()
        if not ret: break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.2, 5)
        if len(faces) > 0:
            x,y,w,h = faces[0]
            head_y = y/height
            roi = frame[0:y, :]
            mask = fgbg.apply(roi)
            cnts,_ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            blobs = [cv2.boundingRect(c) for c in cnts if cv2.contourArea(c) > 200]
            pts = []
            for bx,by,bw,bh in blobs[:2]:
                cx, cy = bx+bw//2, by+bh//2
                pts.append((cx/width, cy/height))
            wl = pts[0] if len(pts)>0 else None
            wr = pts[1] if len(pts)>1 else None
            landmarks.append({"left_wrist":wl,"right_wrist":wr,"head_y":head_y})
        else:
            landmarks.append({"left_wrist":None,"right_wrist":None,"head_y":None})
    return landmarks, fps

In [6]:
# Dlib (лицо + трекер)
def dlib_landmarks(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    detector = dlib.get_frontal_face_detector()
    landmarks = []
    while True:
        ret, frame = cap.read()
        if not ret: break
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        dets = detector(rgb, 1)
        if len(dets) > 0:
            d = dets[0]
            head_y = d.top()/height
            # имитация запястий: берём два случайных blob’а сверху (упрощённо)
            wl = (0.4, head_y-0.1)
            wr = (0.6, head_y-0.1)
            landmarks.append({"left_wrist":wl,"right_wrist":wr,"head_y":head_y})
        else:
            landmarks.append({"left_wrist":None,"right_wrist":None,"head_y":None})
    return landmarks, fps

In [7]:
# MTCNN
def mtcnn_landmarks(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    mtcnn = MTCNN()
    landmarks = []
    while True:
        ret, frame = cap.read()
        if not ret: break
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        faces = mtcnn.detect_faces(rgb)
        if len(faces) > 0:
            f = faces[0]
            x,y,w,h = f['box']
            head_y = y/height
            wl = (0.4, head_y-0.1)
            wr = (0.6, head_y-0.1)
            landmarks.append({"left_wrist":wl,"right_wrist":wr,"head_y":head_y})
        else:
            landmarks.append({"left_wrist":None,"right_wrist":None,"head_y":None})
    return landmarks, fps

In [8]:
# сравнение решений
def compare_methods(video_path):
    mp_landmarks, fps = mediapipe_landmarks(video_path)
    cv_landmarks, _ = opencv_landmarks(video_path)
    dl_landmarks, _ = dlib_landmarks(video_path)
    mt_landmarks, _ = mtcnn_landmarks(video_path)

    mp_count, _ = count_claps(mp_landmarks, fps)
    cv_count, _ = count_claps(cv_landmarks, fps)
    dl_count, _ = count_claps(dl_landmarks, fps)
    mt_count, _ = count_claps(mt_landmarks, fps)

    print("MediaPipe:", mp_count)
    print("OpenCV:", cv_count)
    print("Dlib:", dl_count)
    print("MTCNN:", mt_count)

In [9]:
!pip install pytube

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [15]:
from google.colab import files

# Откроется диалог выбора файла на твоём компьютере
uploaded = files.upload()

Saving clap_video.mp4 to clap_video.mp4


In [16]:
video_path = "clap_video.mp4"  # видео, в котором хлопают над головой 6 раз
compare_methods(video_path)



MediaPipe: 6
OpenCV: 6
Dlib: 0
MTCNN: 0


Вывод: MediaPipe справилась хорошо, что было ожидаемо, так как алгоритм работает с реальными лэндмарками тела и обучен на позах человека.

OpenCV тоже посчитал правильно (я от него после экспериментов с определением улыбки и открытости/закрытости глаз такого не ожидала). Но тут важно учесть тот факт, что девушка на видео стоит на месте и фон статичен.

Dlib скорее всего не смог корректно инициализировать трекеры для рук по моему коду, поэтому ничего не насчитал.

MTCNN ожидаемо не справился, так как это вообще детектор лиц, что с него возьмёшь..