# Project Overview

## Detecção de objetos

Há um grande interesse de utilizar técnicas de machine learning para detecção de objetos. Rastreamento de objetos em tempo real (real-time object tracking).

Há diversas aplicações como:
- Realidade aumentada
    - https://www.youtube.com/watch?v=tJbtp1Cv1gY
    - https://www.youtube.com/watch?v=aWI39cCNo7I
    
- Criação de filtros (estilo snapchat e TikTok)
- Detecção de poses:
    - [Conseguimos controlar um avatar de acordo com os nosso movimentos] (https://www.youtube.com/watch?v=8Va3_jwYOJU)
    - [Melhorar a performance de atletas] (https://www.youtube.com/watch?v=-LkMOGvbn_c)

## MediaPipe

Modulo desenvolvido pela Google. Consiste em um módulo de detecção.  
Facilidade de implementar soluções sofisticadas de forma simples e ágil.  
[Diversas soluções](https://google.github.io/mediapipe/)
- Detecção de faces
- Poses
- Detecção de mãos
- E muito mais

[Projeto Github](https://github.com/google/mediapipe)

# Install and load Libraries

In [None]:
!pip install opencv-python
!pip install mediapipe

In [16]:
# Data and math operations
import re
from math import hypot
import numpy as np
import pandas as pd

# Image processing
import cv2
import mediapipe as mp

# Image visualizations
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Audio Speech
import pyttsx3

# For opening images via URL
from PIL.Image import open as open_image
from urllib.request import urlopen

# Display images in Jupyter Notebook
from IPython.display import Image

# macOS alternative to pycaw
import osascript

# Control the flow of time
import time

# For docstrings
from typing import List

## Defines

In [3]:
# mediapipe instances
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands

# Landmark styles
HAND_CONNECTIONS = mp_hands.HAND_CONNECTIONS
DEFAULT_LANDMARK_POINTS = mp_drawing_styles.get_default_hand_landmarks_style()
DEFAULT_LANDMARK_CONNECTIONS = mp_drawing_styles.get_default_hand_connections_style()
BASIC_LANDMARK_POINTS_SPEC = mp_drawing.DrawingSpec(color=(22,150,210), thickness=2, circle_radius=2)
BASIC_LANDMARK_CONNECTION_SPEC = mp_drawing.DrawingSpec(color=(253,191,17), thickness=1, circle_radius=2)

# Mapping of handpoints
_hand_point_names = [ 
                     ['THUMB_'         + i for i in ['CMC','MCP','IP','TIP']],
                     ['INDEX_FINGER_'  + i for i in ['MCP','PIP','DIP','TIP']],
                     ['MIDDLE_FINGER_' + i for i in ['MCP','PIP','DIP','TIP']],
                     ['RING_FINGER_'   + i for i in ['MCP','PIP','DIP','TIP']],
                     ['PINKY_'         + i for i in ['MCP','PIP','DIP','TIP']]
                    ]
_hand_point_names  = ['WRIST'] + [value for sublist in _hand_point_names for value in sublist]

HAND_POINT_MAPPING = {index:_hand_point_names[index] for index in range(21)}

# Terminal ASCII colors
WHITE = '\033[39m'
CYAN = '\033[36m'
GREEN = '\033[32m'

# Custom functions

## Draw landmarks

In [4]:
# Draw landmarks in image with mediapipe default style
def draw_landmarks(image, landmarks, connections) -> None:
    mp_drawing.draw_landmarks(image, 
                landmarks,
                connections,
                DEFAULT_LANDMARK_POINTS,
                DEFAULT_LANDMARK_CONNECTIONS)

## Map hand positions

In [5]:
# Use points coordinates to map hand positions
def map_hand_coord(hand_position:list) -> dict():
    _hand_coord = [(i[1], i[2]) for i in hand_position]
    return {HAND_POINT_MAPPING[index]:coord for index, coord in enumerate(_hand_coord)}

## Get current hand in loop

In [6]:
# Extract str value in mp_hands_results.multi_handedness object
get_current_hand = lambda multi_handedness: re.findall('"([^"]*)"', str(multi_handedness))[0]

## Get hand coordinates

In [7]:
def find_hand_position(image, hand_landmarks):
    '''
    Get hand points coordinates in image from a landmark object.
    Parameters:
    @ image (numpy.ndarray): coordinates of point 1
    @ hand_landmarks (mediapipe.framework.formats.landmark_pb2.NormalizedLandmarkList): hand landmark object to extract results
    Returns:
        list: coordinates for each hand point
    '''
    height, width, channel = image.shape 
    landmark_results = []
    
    for _id, landmark in enumerate(hand_landmarks.landmark):
        x_coord, y_coord = int(landmark.x * width), int(landmark.y * height)
        landmark_results.append([_id, x_coord, y_coord]
        
    return landmark_results

## Calculate distance between two points

In [8]:
def get_points_distance(coord_1, coord_2, draw_points=None):
    '''
    Calculate distance with `hypot` between two hand points.
    Parameters:
    @ coord_1 (tuple): coordinates of point 1
    @ coord_2 (tuple): coordinates of point 2
    @ draw_points (numpy.ndarray): image array to draw annotations
    Returns:
        float: distance between the two points
    '''
    if draw_points:
        # Color points
        cv2.circle(draw_points, coord_1, 10, (0,200,235), cv2.FILLED)
        cv2.circle(draw_points, coord_2, 10, (0,200,235), cv2.FILLED)
        # Create distance line
        cv2.line(draw_points, coord_1, coord_2, (198,138,9), 2) 
    
    x_1, y_1 = coord_1
    x_2, y_2 = coord_2
    
    # Calculate distance with hypotenuse
    return hypot(x_2 - x_1, y_2 - y_1)

# Webcam Test

In [21]:
# Webcam input
cam = cv2.VideoCapture(0)


# Create mediapipe Hands object
hands = mp_hands.Hands(model_complexity=0, min_detection_confidence=0.8, min_tracking_confidence=0.9)

# Run webcam
while cam.isOpened():
    
    # Read image frame
    success, image = cam.read()
    if not success:
        print("Ignoring empty camera frame.")
        continue
        
    # Improve performance = False
    image.flags.writeable = True
    
    # Flip the image horizontally for a selfie-view display.
    image = cv2.flip(image, 1)
    
    # Get image dimensions
    h, w, _ = image.shape
    
    # Process image in RGB color scale
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    
    # Create hand map
    hand_map = {}
    
    # Compute data for any Hand(s) found
    if results.multi_hand_landmarks:
        for index, hand_landmarks in enumerate(results.multi_hand_landmarks):
            
            # Draw the hand points and connections on the image
            draw_landmarks(image, hand_landmarks, HAND_CONNECTIONS)
            
            # Get current hand points coordinates
            current_hand = get_current_hand(results.multi_handedness[index])
            hand_position = find_hand_position(image, hand_landmarks)
            hand_map[current_hand] = map_hand_coord(hand_position)

            # Change volume
#             if 'Left' in hand_map.keys():
#                 # Select fingers
#                 finger_1 = hand_map['Left']['THUMB_TIP']
#                 finger_2 = hand_map['Left']['INDEX_FINGER_TIP']
#                 # Calculate distante and draw in frame
#                 finger_distance = get_points_distance(finger_1, finger_2, draw_points=True)
#                 # Bar position
#                 tx, ty = int(w*0.05), int(h*0.15) # top    x, y coord
#                 bx, by = int(w*0.08), int(h*0.35) # bottom x, y coord
#                 hand_screen_range = [int(h*0.05), int(h*0.40)] # 3 - 40 % of screen height
#                 volume_range = [0, 100]  # System volume range
#                 bar_range = [by, ty]     # Bar full range in screen
#                 # True value for volume level
#                 volume_level = np.interp(finger_distance, hand_screen_range, volume_range)
#                 bar_level = np.interp(volume_level, volume_range, bar_range)
                
#                 # Draw volume bar
#                 ## Empty bar (0,200,235) (198,138,9)
#                 cv2.rectangle(image, (tx, ty), (bx, by),
#                               (20, 20, 20), -1)
#                 ## Filling bar
#                 cv2.rectangle(image,
#                               (tx, int(bar_level)),
#                               (bx, by),
#                               (240, 240, 240), cv2.FILLED)
#                 ## Volume text
#                 cv2.putText(image,
#                             f'{int(volume_level)}%', 
#                             (tx-10, ty-10), 
#                             cv2.FONT_ITALIC,1 , (240, 240, 240), 2)
            
#                 # Change system volume level
#                 set_system_volume(volume_level)
            
    # Show video
    cv2.imshow('MediaPipe Hands', image)
    # Use ESC key to close webcam
    if cv2.waitKey(5) & 0xFF == 27:
        break

# Release video capture
cam.release()
# Memory dump
cv2.destroyAllWindows()
# fix window not closing bug on macOS 10.15
cv2.waitKey(1)

-1

In [41]:
h, w, _ = image.shape
h, w

(720, 1280)

In [74]:
# landmark for left hand
hand_connections = mp.solutions.holistic.HAND_CONNECTIONS

landmark_test = results.multi_hand_landmarks[0].landmark
height, width, channel = image.shape 

landmark_results = []
    
for _id, landmark in enumerate(landmark_test):
    x_coord, y_coord = int(landmark.x * width), int(landmark.y * height)
    landmark_results.append([_id, x_coord, y_coord])

def get_angle_between_vectors(u, v):
    dot_product = np.dot(u, v)
    norm = np.linalg.norm(u) * np.linalg.norm(v)
    return np.arccos(dot_product / norm)


angles_list = []
for connection_from in landmark_results:
    for connection_to in landmark_results:
        angle = get_angle_between_vectors(connection_from, connection_to)
        # If the angle is not null we store it else we store 0
        if angle:
            angles_list.append(angle)
        else:
            angles_list.append(0)

len(angles_list)
#np.array(landmark_test).reshape((21, 3))

  return np.arccos(dot_product / norm)


441

## Hand Model

In [15]:
class HandModel(object):
    """
    Params
        landmarks: List of positions
    Args
        connections: List of tuples containing the ids of the two landmarks representing a connection
        feature_vector: list of length 21*21=441 containing the angles between all connections
    """
    def __init__(self, landmarks: List[float]):
        self.connections = mp.solutions.holistic.HAND_CONNECTIONS

        landmarks = np.array(landmarks).reshape((21, 3))
        self.feature_vector = self._get_feature_vector(landmarks)

    def _get_feature_vector(self, landmarks: np.ndarray) -> List[float]:
        """
        Params
            landmarks: numpy array of shape (21, 3)
        Return
            List of length nb_connections * nb_connections containing all the angles between the connections
        """
        connections = self._get_connections_from_landmarks(landmarks)

        angles_list = []
        for connection_from in connections:
            for connection_to in connections:
                angle = self._get_angle_between_vectors(connection_from, connection_to)
                # If the angle is not null we store it else we store 0
                if angle == angle:
                    angles_list.append(angle)
                else:
                    angles_list.append(0)
        return angles_list

    def _get_connections_from_landmarks(self, landmarks: np.ndarray) -> List[np.ndarray]:
        """
        Params
            landmarks: numpy array of shape (21, 3)
        Return
            List of vectors representing hand connections
        """
        return list(
            map(
                lambda t: landmarks[t[1]] - landmarks[t[0]],
                self.connections,
            )
        )

    @staticmethod
    def _get_angle_between_vectors(u: np.ndarray, v: np.ndarray) -> float:
        """
        Params
            u, v: 3D vectors representing two connections
        Return
            Angle between the two vectors
        """
        dot_product = np.dot(u, v)
        norm = np.linalg.norm(u) * np.linalg.norm(v)
        return np.arccos(dot_product / norm)

mediapipe.python.solution_base.SolutionOutputs

### **Sexto passo**  
Adicionando o FPS (frames per second)

In [None]:
cap = cv2.VideoCapture(0) # Mudar o número para corresponder a camera (em geral é 0 ou 1, mas pode ser outro inteiro)

mp_hands = mp.solutions.hands

# instanciando a classe
hands = mp_hands.Hands() # equivalente a mp.solutions.hands.Hands()

# Utilizando o drawing_utils para desenhar as mãos
mp_draw = mp.solutions.drawing_utils
DrawingSpec = mp.solutions.drawing_utils.DrawingSpec

tempo_anterior = 0
tempo_corrente = 0

while True:
    sucesso, img = cap.read()
    
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)
    
    tempo_corrente = time.time()
    fps = 1 / (tempo_corrente - tempo_anterior)
    
    tempo_anterior = tempo_corrente
    
    if (results.multi_hand_landmarks):
        for hand_landmark in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(
                img,
                hand_landmark,
                mp_hands.HAND_CONNECTIONS)
    
    cv2.putText(img, # Onde colocar
               str(int(fps)), # convertendo pra string o fps
                (10, 70), # posição na imagem
                cv2.FONT_HERSHEY_PLAIN, # a fonte
                3, # tamanho
                (255, 0, 255), # cor
                5 # grossura
               )
    cv2.imshow("Imagem", img)
    cv2.waitKey(1)

### **Setimo passo**  
Ok! Conseguimos detectar as mãos e sabemos colocar as conecções entre os landmarks.  
Porém para conseguir desenvolver soluções mais complexas, precisamos descobrir qual a posição de cada landmark.
Sabemos que o `results.multi_hand_landmarks)` retorna uma lista!  
Cada elemento da lista é uma mão! E cada posição da lista refere-se ao landmark.  
Então será podemos criar um for loop para tanto desenhar a mão, como extrair os resultados?  

In [None]:
cap = cv2.VideoCapture(0) # Mudar o número para corresponder a camera (em geral é 0 ou 1, mas pode ser outro inteiro)

mp_hands = mp.solutions.hands

# instanciando a classe
hands = mp_hands.Hands() # equivalente a mp.solutions.hands.Hands()

# Utilizando o drawing_utils para desenhar as mãos
mp_draw = mp.solutions.drawing_utils
DrawingSpec = mp.solutions.drawing_utils.DrawingSpec

tempo_anterior = 0
tempo_corrente = 0

while True:
    sucesso, img = cap.read()
    
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = hands.process(img_rgb)
    
    tempo_corrente = time.time()
    fps = 1 / (tempo_corrente - tempo_anterior)
    
    tempo_anterior = tempo_corrente
    
    if (results.multi_hand_landmarks):
        for hand_number, hand_landmark in enumerate(results.multi_hand_landmarks):
            for _id, landmark in enumerate(hand_landmark.landmark):
                print(hand_number, _id, landmark)
            
            
            
            mp_draw.draw_landmarks(
                img,
                hand_landmark,
                mp_hands.HAND_CONNECTIONS)
    
    cv2.putText(img, # Onde colocar
               str(int(fps)), # convertendo pra string o fps
                (10, 70), # posição na imagem
                cv2.FONT_HERSHEY_PLAIN, # a fonte
                3, # tamanho
                (255, 0, 255), # cor
                5 # grossura
               )
    cv2.imshow("Imagem", img)
    cv2.waitKey(1)