<a href="https://colab.research.google.com/github/annabelle1217/computer-vision-mediapipe/blob/main/colab/finger_count_decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install and Import Dependencies

In [1]:
!pip install mediapipe opencv-python 



In [2]:
from mediapipe import solutions as mp
import cv2
from google.protobuf.json_format import MessageToDict
from IPython.display import display, Javascript, Image
from google.colab.patches import cv2_imshow
from google.colab.output import eval_js
from PIL import Image
import numpy as np
import base64
import html
import time
import io
import os

### Finger Count Detector

In [3]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the browser-side webcam streaming code into the notebook output.

  The displayed JavaScript defines `stream_frame(label, imgData)`, which is
  what the Python side later invokes once per frame. On its first call it
  builds the DOM (status label, <video> fed by getUserMedia, an absolutely
  positioned <img> overlay, and a red "click to stop" instruction). Each call:
    * writes `label` into the status element and `imgData` into the overlay
      image (when they are non-empty),
    * waits for the next animation frame, draws the video onto a 640x480
      canvas and resolves with a JPEG data URL (quality 0.8),
    * returns {'create', 'show', 'capture', 'img'}; the first three are
      millisecond timings, 'img' is the data URL.
  Clicking the video, the overlay or the instruction sets a `shutdown` flag;
  the next `stream_frame` call then tears the DOM down, stops the camera
  track and returns '' instead of a dict. If the click lands while a capture
  is already pending, that capture resolves with an empty 'img' string.

  Returns: None (the script is registered as a display side effect).
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Displaying the Javascript object is what makes the functions above
  # available to later eval_js calls in this output frame.
  display(js)

# Helper Functions for video frame extraction and drawing landmarks
def js_to_image(js_reply):
  """
  Turn a webcam data URL into an OpenCV image.

  Params: js_reply: data-URL string ("<header>,<base64 payload>") as
          returned in the 'img' field by the JavaScript side
  Returns: 3-channel BGR image as decoded by cv2.imdecode
  """
  # The base64 payload is the part right after the first comma
  payload = js_reply.split(',')[1]
  raw_jpeg = base64.b64decode(payload)
  # View the compressed bytes as a flat uint8 buffer, then let OpenCV decode
  # it as a colour image (IMREAD_COLOR has the value 1)
  encoded_buffer = np.frombuffer(raw_jpeg, dtype=np.uint8)
  return cv2.imdecode(encoded_buffer, cv2.IMREAD_COLOR)

def drawing_array_to_bytes(drawing_array):
  """
  Encode an image array as a PNG data URL.

  input: drawing_array: array interpreted as an RGB image by PIL
  output: string of the form 'data:image/png;base64,<payload>', suitable
          for assigning to an <img> element's src
  """
  png_buffer = io.BytesIO()
  Image.fromarray(drawing_array, 'RGB').save(png_buffer, format='png')
  payload = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
  return 'data:image/png;base64,' + payload
  
def video_frame(label, bytes):
  """
  Send the current label/overlay to the browser and fetch the next frame.

  Params: label: text (HTML) shown in the status element; '' leaves it as is
          bytes: overlay image as a data-URL string; '' leaves the overlay
                 unchanged
  Returns: whatever the JavaScript `stream_frame` resolves to: a dict with
           'create', 'show', 'capture' and 'img' keys, or '' once the user
           has stopped the stream
  """
  # Local import so this cell does not depend on an edit to the import cell.
  import json

  # json.dumps yields a valid, fully escaped JavaScript string literal, so
  # quotes, backslashes or newlines in the arguments can no longer break (or
  # inject code into) the evaluated expression. str() mirrors the implicit
  # conversion that str.format applied before.
  call = 'stream_frame({}, {})'.format(json.dumps(str(label)),
                                       json.dumps(str(bytes)))
  return eval_js(call)

In [4]:
# Start streaming video from webcam
video_stream()
# Label for video
label_html = 'Finger Count Detection'

# Overlay sent back to the browser; empty until the first frame is processed
drawing_lm = ''
# MediaPipe hand-landmark indices of the five fingertips (thumb first)
tip_id = [4, 8, 12, 16, 20]
pTime = 0


def _count_raised_fingers(lm_list, hand_label, tip_ids):
    """
    Decide for each fingertip whether the finger is raised.

    Params: lm_list: [index, x_px, y_px] entries, one per hand landmark
            hand_label: handedness label reported by MediaPipe
            tip_ids: fingertip landmark indices, thumb first
    Returns: list of 0/1 flags in the same order as tip_ids
    """
    thumb_tip, *finger_tips = tip_ids
    # Thumb: compare the tip with the joint just below it along x.
    # When using front camera, the handedness is mirrored
    thumb_dx = lm_list[thumb_tip][1] - lm_list[thumb_tip - 1][1]
    thumb_up = thumb_dx > 0 if hand_label == "Left" else thumb_dx < 0
    flags = [int(thumb_up)]
    # Other four fingers: raised when the tip is above (smaller y than) the
    # landmark two indices below it
    flags.extend(int(lm_list[tip][2] < lm_list[tip - 2][2])
                 for tip in finger_tips)
    return flags


with mp.hands.Hands(max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:

    while True:
        js_reply = video_frame(label_html, drawing_lm)

        # '' means the stream was stopped. An empty 'img' means the stop
        # click arrived while a capture was pending; there is no frame to
        # decode, so stop instead of crashing in js_to_image.
        if not js_reply or not js_reply["img"]:
            break

        # Convert JS response to OpenCV Image
        image = js_to_image(js_reply["img"])

        # Recolor Feed
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        results = hands.process(image)

        lm_list = []

        if results.multi_hand_landmarks:
            # Frame size is constant for the whole frame; read it once
            h, w = image.shape[:2]
            for hand_landmark in results.multi_hand_landmarks:
                # Landmark coordinates are scaled by the frame size to pixels
                for idx, lm in enumerate(hand_landmark.landmark):
                    cx, cy = int(lm.x*w), int(lm.y*h)
                    lm_list.append([idx, cx, cy])

                mp.drawing_utils.draw_landmarks(
                    image, hand_landmark, mp.hands.HAND_CONNECTIONS)

        # Detect finger count
        total_fingers = 0
        if len(lm_list) != 0:
            hand_label = results.multi_handedness[0].classification[0].label
            fingers = _count_raised_fingers(lm_list, hand_label, tip_id)
            total_fingers = fingers.count(1)

        # Display result
        cv2.rectangle(image, (20, 20), (140, 220), (0, 255, 0), cv2.FILLED)
        cv2.putText(image, str(total_fingers), (30, 175), cv2.FONT_HERSHEY_PLAIN,
                    10, (255, 0, 0), 25)

        # Display framerate (guard against a zero interval between frames)
        cTime = time.time()
        elapsed = cTime - pTime
        fps = 1/elapsed if elapsed > 0 else 0
        pTime = cTime
        cv2.putText(image, f"FPS: {int(fps)}", (500, 50), cv2.FONT_HERSHEY_PLAIN,
                    2, (255, 0, 0), 2)

        # Convert the annotated frame into bytes
        lm_bytes = drawing_array_to_bytes(image)

        # Update overlay so next frame gets the new annotations
        drawing_lm = lm_bytes

<IPython.core.display.Javascript object>

<!-- ![image](https://user-images.githubusercontent.com/60457367/123659592-715aba80-d865-11eb-9acd-a15b719c2fe9.png) -->