# HAND GESTURE-BASED INTERACTION

---

Group members:
*   Ada Yılmaz
*   Ceren Şahin
*   Sima Adleyba
*   Selen Naz Gürsoy

### Installing necessary libraries and models

In [1]:
# Install MediaPipe (-q keeps pip's output quiet)
%pip install -q mediapipe

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Download a model that can recognize 7 hand gestures: 👍, 👎, ✌️, ☝️, ✊, 👋, 🤟
!wget -q https://storage.googleapis.com/mediapipe-models/gesture_recognizer/gesture_recognizer/float16/1/gesture_recognizer.task

In [2]:
#download test images from pixabay
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded, so `urllib.request.urlretrieve`
# could fail with AttributeError.
import urllib.request

# Sample images used to try out the gesture recognizer.
IMAGE_FILENAMES = ['thumbs_down.jpg', 'victory.jpg', 'thumbs_up.jpg', 'pointing_up.jpg']

# Download each image into the working directory under its own file name.
for name in IMAGE_FILENAMES:
    url = f'https://storage.googleapis.com/mediapipe-tasks/gesture_recognizer/{name}'
    urllib.request.urlretrieve(url, name)

### Functions for visualization

In [3]:
from matplotlib import pyplot as plt
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2

2024-12-10 20:13:00.651066: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
#some functions to visualize the gesture recognition results.
import math

# Switch off every axes spine, tick mark, and tick label in matplotlib's
# defaults for the figures drawn below.
plt.rcParams.update({
    setting: False
    for setting in (
        'axes.spines.top',
        'axes.spines.right',
        'axes.spines.left',
        'axes.spines.bottom',
        'xtick.labelbottom',
        'xtick.bottom',
        'ytick.labelleft',
        'ytick.left',
        'xtick.labeltop',
        'xtick.top',
        'ytick.labelright',
        'ytick.right',
    )
})

# Short aliases for the MediaPipe solution modules used by the helpers below.
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands


def display_one_image(image, title, subplot, titlesize=16):
    """Draw `image` in one subplot cell and return the next cell.

    Args:
        image: image data passed straight to `plt.imshow`.
        title: text placed above the image; skipped when empty.
        subplot: `(rows, cols, index)` triple selecting the cell to draw in.
        titlesize: font size for the title; the title padding is derived
            from it.

    Returns:
        The same `(rows, cols, index)` triple with `index` advanced by one.
    """
    plt.subplot(*subplot)
    plt.imshow(image)

    if len(title) > 0:
        plt.title(
            title,
            fontsize=int(titlesize),
            color='black',
            fontdict={'verticalalignment': 'center'},
            pad=int(titlesize / 1.5),
        )

    next_index = subplot[2] + 1
    return (subplot[0], subplot[1], next_index)


def display_batch_of_images_with_gestures_and_hand_landmarks(images, results):
    """Show a grid of images annotated with gesture labels and hand landmarks.

    Args:
        images: iterable of objects exposing `numpy_view()` (presumably
            `mp.Image` instances - confirm against the caller).
        results: iterable of `(top_gesture, multi_hand_landmarks)` pairs, one
            per image. `top_gesture` must expose `category_name` and `score`;
            `multi_hand_landmarks` is an iterable of hands, each an iterable
            of landmarks with `x`, `y` and `z`.

    Returns:
        None. The figure is rendered with `plt.show()`. When `images` is
        empty nothing is drawn.
    """
    # Images and labels.
    images = [image.numpy_view() for image in images]
    if not images:
        # Without this guard rows would be 0 and the column computation below
        # would raise ZeroDivisionError.
        return

    top_gestures = [top_gesture for (top_gesture, _) in results]
    multi_hand_landmarks_list = [multi_hand_landmarks for (_, multi_hand_landmarks) in results]

    # Auto-squaring: this will drop data that does not fit into square or square-ish rectangle.
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows

    # Size and spacing.
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE / cols * rows))
    else:
        plt.figure(figsize=(FIGSIZE / rows * cols, FIGSIZE))

    # The title size depends only on the grid shape, so compute it once.
    dynamic_titlesize = FIGSIZE * SPACING / max(rows, cols) * 40 + 3

    # Display gestures and hand landmarks.
    shown = rows * cols
    for i, (image, gesture) in enumerate(zip(images[:shown], top_gestures[:shown])):
        title = f"{gesture.category_name} ({gesture.score:.2f})"
        # Draw on a copy so the caller's image data is left untouched.
        annotated_image = image.copy()

        for hand_landmarks in multi_hand_landmarks_list[i]:
            # draw_landmarks is given a NormalizedLandmarkList proto, so
            # rebuild one from the landmark objects.
            hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
            hand_landmarks_proto.landmark.extend([
                landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z)
                for landmark in hand_landmarks
            ])

            mp_drawing.draw_landmarks(
                annotated_image,
                hand_landmarks_proto,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style())

        subplot = display_one_image(annotated_image, title, subplot, titlesize=dynamic_titlesize)

    # Layout.
    plt.tight_layout()
    plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

### Preview the images

In [5]:
import cv2
import math

# Preview size in pixels used by resize_and_show(): landscape images are
# scaled to DESIRED_WIDTH wide, all others to DESIRED_HEIGHT tall.
DESIRED_HEIGHT = 480
DESIRED_WIDTH = 480

def resize_and_show(image, name):
    """Scale `image` to the preview size and show it in an OpenCV window.

    The longer side is scaled to DESIRED_WIDTH / DESIRED_HEIGHT and the other
    side follows the aspect ratio (rounded down). The call blocks until a key
    is pressed, then closes all OpenCV windows.

    Args:
        image: array whose `shape` starts with `(height, width)`.
        name: title of the window the image is shown in.
    """
    height, width = image.shape[:2]

    if height >= width:
        # Portrait or square: fix the height, derive the width.
        target_size = (math.floor(width / (height / DESIRED_HEIGHT)), DESIRED_HEIGHT)
    else:
        # Landscape: fix the width, derive the height.
        target_size = (DESIRED_WIDTH, math.floor(height / (width / DESIRED_WIDTH)))

    preview = cv2.resize(image, target_size)

    cv2.imshow(name, preview)
    cv2.waitKey(0)           # block until any key is pressed
    cv2.destroyAllWindows()  # then close the preview window

# Example usage: read every downloaded sample image and preview it in turn.
images = dict(zip(IMAGE_FILENAMES, map(cv2.imread, IMAGE_FILENAMES)))

for name, image in images.items():
    # cv2.imread yields None when a file cannot be read.
    if image is None:
        print(f"Error: Could not read image {name}")
        continue
    print(f"Displaying: {name}")
    resize_and_show(image, name)


Displaying: thumbs_down.jpg
Displaying: victory.jpg
Displaying: thumbs_up.jpg
Displaying: pointing_up.jpg


### Gesture Detection Functions
After checking how the functions above worked, we implemented corrected, more robust versions of the gesture-detection functions.

In [1]:
import time

# Global variables for tracking gestures and cooldown

# The currently detected gesture (None when no gesture is active)
current_gesture = None

# Time (time.time() seconds) after which reset_gesture() clears current_gesture
gesture_reset_time = 0

# Cooldown variables for scrolling
# Reference (x, y) positions of the thumb and index fingertips that
# detect_scroll() measures displacement against; None until first stored.
previous_thumb_tip = None
previous_index_tip = None
# Time of the most recent detected gesture; written by every detect_* function.
last_gesture_time = 0
# Seconds that must pass after a detected gesture before detect_scroll()
# will report a scroll again.
COOLDOWN_PERIOD = 1.5

# Reset current gesture when it expires
def reset_gesture():
    """Clear `current_gesture` once the clock has passed `gesture_reset_time`."""
    global current_gesture

    expired = time.time() > gesture_reset_time
    if expired:
        current_gesture = None

# Gesture detection functions
def detect_peace_sign(hand_landmarks):
    """Return True when the landmarks look like a peace / victory sign.

    The check requires the index and middle fingertips to be raised
    (smaller y) above their own MCP joints and above the ring and pinky MCP
    joints, the two raised tips to be more than 0.1 apart horizontally, and
    the ring and pinky tips to sit more than 0.02 below their MCP joints.

    Args:
        hand_landmarks: object whose `.landmark` sequence is indexed by
            `mp_hands.HandLandmark` members; entries expose `.x` and `.y`.

    Side effects:
        On a positive detection, sets the global `last_gesture_time` to the
        current time so the scroll cooldown applies.
    """
    global last_gesture_time

    points = hand_landmarks.landmark
    joint = mp_hands.HandLandmark

    # Fingertips.
    index_tip = points[joint.INDEX_FINGER_TIP]
    middle_tip = points[joint.MIDDLE_FINGER_TIP]
    ring_tip = points[joint.RING_FINGER_TIP]
    pinky_tip = points[joint.PINKY_TIP]

    # MCP joints.
    index_mcp = points[joint.INDEX_FINGER_MCP]
    middle_mcp = points[joint.MIDDLE_FINGER_MCP]
    ring_mcp = points[joint.RING_FINGER_MCP]
    pinky_mcp = points[joint.PINKY_MCP]

    # Index and middle tips must be above their own MCPs and above the
    # ring/pinky MCPs.
    raised = (
        index_tip.y < index_mcp.y
        and middle_tip.y < middle_mcp.y
        and all(
            tip.y < lowered_mcp.y
            for tip in (index_tip, middle_tip)
            for lowered_mcp in (ring_mcp, pinky_mcp)
        )
    )
    if not raised:
        return False

    # The two raised fingers must be spread apart.
    if not abs(index_tip.x - middle_tip.x) > 0.1:
        return False

    # Ring and pinky must be folded: tips below their MCP joints.
    folded = (ring_tip.y > ring_mcp.y + 0.02) and (pinky_tip.y > pinky_mcp.y + 0.02)
    if not folded:
        return False

    # Record the detection time (used for the scroll cooldown).
    last_gesture_time = time.time()
    return True



def detect_thumbs_up(hand_landmarks, margin=0.05):
    """Return True when the landmarks look like a thumbs-up gesture.

    The thumb tip must be above (smaller y than) every other fingertip by
    more than `margin` and above the thumb MCP joint, and the MCP joints of
    the other fingers must be stacked index, middle, ring, pinky from top to
    bottom.

    Args:
        hand_landmarks: object whose `.landmark` sequence is indexed by
            `mp_hands.HandLandmark` members; entries expose `.y`.
        margin: minimum y gap by which the thumb tip must clear each of the
            other fingertips.

    Side effects:
        On a positive detection, sets the global `last_gesture_time` to the
        current time so the scroll cooldown applies.
    """
    global last_gesture_time

    points = hand_landmarks.landmark
    joint = mp_hands.HandLandmark

    thumb_tip = points[joint.THUMB_TIP]
    thumb_mcp = points[joint.THUMB_MCP]

    other_tips = (
        points[joint.INDEX_FINGER_TIP],
        points[joint.MIDDLE_FINGER_TIP],
        points[joint.RING_FINGER_TIP],
        points[joint.PINKY_TIP],
    )

    index_mcp = points[joint.INDEX_FINGER_MCP]
    middle_mcp = points[joint.MIDDLE_FINGER_MCP]
    ring_mcp = points[joint.RING_FINGER_MCP]
    pinky_mcp = points[joint.PINKY_MCP]

    # Thumb tip should be above the other fingertips and its own MCP joint.
    thumb_tip_up = (
        all(thumb_tip.y + margin < tip.y for tip in other_tips)
        and thumb_tip.y < thumb_mcp.y
    )

    # Other fingers should be in order from top to bottom.
    other_fingers_ordered = index_mcp.y < middle_mcp.y < ring_mcp.y < pinky_mcp.y

    if thumb_tip_up and other_fingers_ordered:
        # Record the detection time (used for the scroll cooldown).
        last_gesture_time = time.time()
        return True
    return False


def detect_thumbs_down(hand_landmarks, margin=0.05):
    """Return True when the landmarks look like a thumbs-down gesture.

    The thumb tip must be below (larger y than) every other fingertip by
    more than `margin`, and the MCP joints of the other fingers must be
    stacked pinky, ring, middle, index from top to bottom.

    Args:
        hand_landmarks: object whose `.landmark` sequence is indexed by
            `mp_hands.HandLandmark` members; entries expose `.y`.
        margin: minimum y gap by which the thumb tip must sit below each of
            the other fingertips.

    Side effects:
        On a positive detection, sets the global `last_gesture_time` to the
        current time so the scroll cooldown applies.
    """
    global last_gesture_time

    points = hand_landmarks.landmark
    joint = mp_hands.HandLandmark

    thumb_tip = points[joint.THUMB_TIP]
    fingertips = [
        points[joint.INDEX_FINGER_TIP],
        points[joint.MIDDLE_FINGER_TIP],
        points[joint.RING_FINGER_TIP],
        points[joint.PINKY_TIP],
    ]

    index_mcp = points[joint.INDEX_FINGER_MCP]
    middle_mcp = points[joint.MIDDLE_FINGER_MCP]
    ring_mcp = points[joint.RING_FINGER_MCP]
    pinky_mcp = points[joint.PINKY_MCP]

    # The thumb tip has to be the lowest fingertip by a clear margin.
    thumb_is_lowest = all(thumb_tip.y > tip.y + margin for tip in fingertips)
    if not thumb_is_lowest:
        return False

    # MCP joints reversed compared with thumbs-up: index lowest, pinky highest.
    mcp_reversed = index_mcp.y > middle_mcp.y > ring_mcp.y > pinky_mcp.y
    if not mcp_reversed:
        return False

    # Record the detection time (used for the scroll cooldown).
    last_gesture_time = time.time()
    return True

def detect_scroll(hand_landmarks, threshold=0.1, dominance_ratio=4.0):
    """Detect a horizontal or vertical scroll from fingertip movement.

    The thumb and index fingertip positions are compared with the stored
    reference positions (`previous_thumb_tip` / `previous_index_tip`). The
    average displacement of the two tips counts as a scroll when it exceeds
    `threshold` on one axis and is more than `dominance_ratio` times the
    displacement on the other axis.

    Args:
        hand_landmarks: object whose `.landmark` sequence is indexed by
            `mp_hands.HandLandmark` members; entries expose `.x` and `.y`.
        threshold: minimum absolute average displacement on an axis.
        dominance_ratio: how many times larger the movement on one axis must
            be than on the other.

    Returns:
        `(True, direction)` with direction in "left", "right", "up", "down"
        when a scroll is detected, otherwise `(False, None)`.

    Side effects:
        Stores the reference positions on the first call after they are
        unset and after every detected scroll; a detected scroll also sets
        `last_gesture_time`. Nothing is updated while the cooldown is active.
    """
    global previous_thumb_tip, previous_index_tip, last_gesture_time

    joint = mp_hands.HandLandmark
    thumb = hand_landmarks.landmark[joint.THUMB_TIP]
    index = hand_landmarks.landmark[joint.INDEX_FINGER_TIP]
    thumb_now = (thumb.x, thumb.y)
    index_now = (index.x, index.y)

    # Ignore everything until the cooldown since the last gesture has elapsed.
    now = time.time()
    if now - last_gesture_time < COOLDOWN_PERIOD:
        return False, None

    # First usable frame: only remember where the fingertips are.
    if previous_thumb_tip is None or previous_index_tip is None:
        previous_thumb_tip = thumb_now
        previous_index_tip = index_now
        return False, None

    # Average the thumb and index displacement per axis for robustness.
    dx = ((thumb_now[0] - previous_thumb_tip[0]) + (index_now[0] - previous_index_tip[0])) / 2
    dy = ((thumb_now[1] - previous_thumb_tip[1]) + (index_now[1] - previous_index_tip[1])) / 2

    # Only a clearly dominant axis counts; horizontal is checked first.
    if abs(dx) > threshold and abs(dx) > dominance_ratio * abs(dy):
        direction = "right" if dx > 0 else "left"
    elif abs(dy) > threshold and abs(dy) > dominance_ratio * abs(dx):
        direction = "down" if dy > 0 else "up"
    else:
        return False, None

    # A scroll was recognized: move the reference points and start the cooldown.
    previous_thumb_tip = thumb_now
    previous_index_tip = index_now
    last_gesture_time = now
    return True, direction

# Instagram-like Interface

In [2]:
import tkinter as tk
from PIL import Image, ImageTk
import os
import cv2
import random
import mediapipe as mp
import time

# Initialize MediaPipe Hands
# `hands` is the tracker that detect_gestures() feeds each RGB frame into;
# both confidence thresholds are set to 0.7.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7)
mp_drawing = mp.solutions.drawing_utils

# Initialize Tkinter
# Main application window, 1024x768; all widgets below are placed at
# absolute coordinates inside it.
root = tk.Tk()
root.geometry("1024x768")
root.title("Gesture-Controlled Instagram")

# Load Images
# Collect every .jpg/.png file in the photo folder. The extension check is
# case-insensitive so files such as "IMG_0001.JPG" are not silently skipped.
photo_folder = "Images/"
photo_files = [
    os.path.join(photo_folder, f)
    for f in os.listdir(photo_folder)
    if f.lower().endswith((".jpg", ".png"))
]
# Every photo is resized to the 400x400 display area up front.
photos = [Image.open(photo).resize((400, 400)) for photo in photo_files]
random.shuffle(photos)  # Shuffle photos for randomness

# Initialize State
# Index into `photos` of the picture currently shown.
current_photo_index = 0
# Indices (into the shuffled `photos` list) the user liked / disliked / saved.
liked_photos = set()
disliked_photos = set()
saved_photos = set()
# NOTE(review): these share their names with the globals detect_scroll()
# uses; if both cells run in one namespace this resets them - confirm intended.
previous_thumb_tip = None
previous_index_tip = None
# Time of the last scroll attempt that passed the cooldown check in scroll().
last_scroll_time = 0
SCROLL_COOLDOWN = 1.5  # Cooldown in seconds

# NOTE: the emoji in the user-facing strings below were mis-encoded
# (UTF-8 bytes decoded with the wrong charset); they are restored here.

# Display Area
photo_label = tk.Label(root, width=400, height=400)
photo_label.place(x=312, y=50)  # Center the photo in the window

# Feedback Label: shows the result of the most recent action.
feedback_label = tk.Label(root, text="Perform gestures to interact!", font=("Helvetica", 14), bg="#f0f0f0")
feedback_label.place(x=312, y=460)

# Buttons (Top Right)
# The handlers are defined further down, so each command is wrapped in a
# lambda that looks the handler up only when the button is invoked.
like_button = tk.Button(root, text="Like ❤️", font=("Helvetica", 14), command=lambda: handle_like())
like_button.place(x=800, y=50, width=100, height=50)

dislike_button = tk.Button(root, text="Dislike 👎", font=("Helvetica", 14), command=lambda: handle_dislike())
dislike_button.place(x=800, y=110, width=100, height=50)

save_button = tk.Button(root, text="Save ✌️", font=("Helvetica", 14), command=lambda: handle_save())
save_button.place(x=800, y=170, width=100, height=50)

# Cursor Label: red dot that detect_gestures() moves with the index fingertip.
cursor_label = tk.Label(root, text="⬤", fg="red", font=("Helvetica", 20))
cursor_label.place(x=0, y=0)

# Webcam Feed Area (Center Bottom)
webcam_frame = tk.Label(root, bg="black")
webcam_frame.place(x=312, y=500, width=400, height=200)

# Legend (Top Left)
legend = tk.Label(root, text=(
    "Legend:\n"
    "👍 -> Like Picture\n"
    "👎 -> Dislike Picture\n"
    "✌️ -> Save Picture\n"
    "☝️ -> Cursor\n"
    "🤏 -> Click\n"
    "👆 -> Scroll Right or Down (Next Picture)\n"
    "👆 -> Scroll Left or Up (Previous Picture)"
), font=("Helvetica", 10), justify="left", bg="#f0f0f0")
legend.place(x=20, y=50)

# Functions
def update_photo():
    """Show the photo at `current_photo_index`; do nothing if it is out of range."""
    if not (0 <= current_photo_index < len(photos)):
        return

    tk_image = ImageTk.PhotoImage(photos[current_photo_index])
    photo_label.configure(image=tk_image)
    # Also store the image on the widget (presumably to keep a reference so
    # the Tk image is not garbage-collected).
    photo_label.image = tk_image

def handle_like():
    """Record the current photo as liked and confirm it in the feedback label."""
    # The index is only read here, so no `global` declaration is needed.
    liked_photos.add(current_photo_index)
    # The heart emoji was mis-encoded in the original string; restored.
    feedback_label.config(text="Liked ❤️")

def handle_dislike():
    """Record the current photo as disliked and confirm it in the feedback label."""
    # The index is only read here, so no `global` declaration is needed.
    disliked_photos.add(current_photo_index)
    # The thumbs-down emoji was mis-encoded in the original string; restored.
    feedback_label.config(text="Disliked 👎")

def handle_save():
    """Record the current photo as saved and confirm it in the feedback label."""
    # The index is only read here, so no `global` declaration is needed.
    saved_photos.add(current_photo_index)
    # The victory-hand emoji was mis-encoded in the original string; restored.
    feedback_label.config(text="Saved ✌️")

def scroll(direction):
    """Move to the previous or next photo, rate-limited by SCROLL_COOLDOWN.

    Args:
        direction: "up" or "left" go to the previous photo; "down" or
            "right" go to the next one. Any other value is ignored.

    A request that cannot move (already at the first/last photo, or an
    unknown direction) returns without redrawing and without starting a new
    cooldown, so it does not block the next valid scroll.
    """
    global current_photo_index, last_scroll_time

    current_time = time.time()
    if current_time - last_scroll_time < SCROLL_COOLDOWN:
        return

    if direction in ("up", "left") and current_photo_index > 0:
        current_photo_index -= 1
        feedback_label.config(text="Scrolled Up/Left ⬆️⬅️")
    elif direction in ("down", "right") and current_photo_index < len(photos) - 1:
        current_photo_index += 1
        feedback_label.config(text="Scrolled Down/Right ⬇️➡️")
    else:
        # Nothing changed: skip the redraw and leave the cooldown untouched.
        return

    update_photo()
    last_scroll_time = current_time

def is_click_gesture(hand_landmarks):
    """Return True when the index fingertip and thumb tip are pinched together.

    The tips count as pinched when their straight-line distance over x, y
    and z is below 0.05.

    Args:
        hand_landmarks: object whose `.landmark` sequence is indexed by
            `mp_hands.HandLandmark` members; entries expose `.x`, `.y`, `.z`.
    """
    points = hand_landmarks.landmark
    index_tip = points[mp_hands.HandLandmark.INDEX_FINGER_TIP]
    thumb_tip = points[mp_hands.HandLandmark.THUMB_TIP]

    # Per-axis offsets between the two fingertips.
    dx = index_tip.x - thumb_tip.x
    dy = index_tip.y - thumb_tip.y
    dz = index_tip.z - thumb_tip.z

    gap = (dx ** 2 + dy ** 2 + dz ** 2) ** 0.5
    return gap < 0.05

# Gesture Detection Integration
# Open capture device 0 (presumably the default webcam); detect_gestures()
# reads one frame from it per call and it is released after the main loop.
cap = cv2.VideoCapture(0)

def detect_gestures():
    """Process one webcam frame, act on detected gestures, and reschedule.

    Each call reads a frame from `cap`, mirrors it, runs the MediaPipe hand
    tracker, moves the on-screen cursor to the index fingertip, triggers the
    matching action (like / dislike / save / click / scroll) and refreshes
    the webcam preview. It then schedules itself again with `root.after`:
    after 100 ms when no frame could be read, otherwise after 10 ms.
    """
    ret, frame = cap.read()
    if not ret:
        # No frame available: retry a little later.
        root.after(100, detect_gestures)
        return

    # Mirror the frame and convert OpenCV's BGR to the RGB that both
    # MediaPipe and PIL are given below.
    frame = cv2.flip(frame, 1)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Draw on the RGB frame: that is the image shown in the preview.
            # Drawing on the BGR `frame` never reached the screen.
            mp_drawing.draw_landmarks(frame_rgb, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Cursor Control: map the index fingertip's landmark coordinates
            # onto the window size.
            index_tip = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
            cursor_x = int(index_tip.x * root.winfo_width())
            cursor_y = int(index_tip.y * root.winfo_height())
            cursor_label.place(x=cursor_x, y=cursor_y)

            # Detect Gestures: the first matching gesture wins.
            if detect_thumbs_up(hand_landmarks):
                handle_like()
            elif detect_thumbs_down(hand_landmarks):
                handle_dislike()
            elif detect_peace_sign(hand_landmarks):
                handle_save()
            elif is_click_gesture(hand_landmarks):
                # Simulate a button click on whichever button is under the cursor.
                for widget in [like_button, dislike_button, save_button]:
                    widget_x = widget.winfo_x()
                    widget_y = widget.winfo_y()
                    widget_width = widget.winfo_width()
                    widget_height = widget.winfo_height()

                    if (widget_x <= cursor_x <= widget_x + widget_width
                            and widget_y <= cursor_y <= widget_y + widget_height):
                        widget.invoke()
            else:
                detected, direction = detect_scroll(hand_landmarks)
                if detected:
                    scroll(direction)

    # Update Webcam Feed: scale to 400 px wide, keeping the aspect ratio.
    aspect_ratio = frame.shape[1] / frame.shape[0]
    resized_width = 400
    resized_height = int(resized_width / aspect_ratio)
    frame_resized = cv2.resize(frame_rgb, (resized_width, resized_height))

    imgtk = ImageTk.PhotoImage(Image.fromarray(frame_resized))
    # Also store the image on the widget (presumably to keep a reference so
    # the Tk image is not garbage-collected).
    webcam_frame.imgtk = imgtk
    webcam_frame.configure(image=imgtk)

    root.after(10, detect_gestures)

# Start Application
# Show the first photo, start the per-frame gesture loop, then hand control
# to Tk. The try/finally makes sure the camera is released even if the main
# loop is interrupted or raises.
try:
    update_photo()
    detect_gestures()
    root.mainloop()
finally:
    # Release Camera
    cap.release()
    cv2.destroyAllWindows()


2024-12-10 20:14:07.323774: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1733850852.513290  252949 gl_context.cc:357] GL version: 2.1 (2.1 INTEL-22.5.10), renderer: Intel(R) Iris(TM) Plus Graphics OpenGL Engine
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1733850852.580856  253266 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733850852.635394  253266 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733850862.857608  253266 landmark_projection_calculator.cc:186] Using NORM_RECT witho