In [2]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import os
import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2
import numpy as np
import torchvision
import torch
import torch.nn as nn

In [5]:
# Build the MediaPipe hand-landmark detector from the task bundle in the
# working directory; it reports at most two hands per image.
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(
    base_options=base_options,
    num_hands=2,
)
detector = vision.HandLandmarker.create_from_options(options)

I0000 00:00:1733370641.704062  557511 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1733370641.731734  559564 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733370641.752965  559568 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [19]:
# Hand-landmark indices of interest. Presumably the MediaPipe fingertip
# landmarks (thumb through pinky) -- TODO confirm; index 8 is the one that
# locate_index reads.
finger_points = [
    4,
    8,
    12,
    16,
    20,
]

In [20]:
# Select the compute device. The second GPU ("cuda:1") is preferred, but it is
# only requested when at least two CUDA devices are visible; on a single-GPU
# machine "cuda:1" does not exist and moving the model to it would fail.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

# ViT-B/32 backbone with its classification head replaced by a small MLP that
# emits a single logit (768 -> 256 -> 1).
model = torchvision.models.vit_b_32(weights=torchvision.models.ViT_B_32_Weights.DEFAULT)
model.heads = nn.Sequential(
    nn.Linear(768, 256),
    nn.GELU(),
    nn.BatchNorm1d(256),
    nn.Linear(256, 1),
)
model = model.to(device)

In [21]:
# Restore the fine-tuned weights. The checkpoint is deserialized onto the CPU
# first; load_state_dict then copies the values into the existing parameters.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
weights_path = "touch_vit_models/model_weights2.1_complete.pth"
model.load_state_dict(
    torch.load(weights_path, map_location=torch.device('cpu'))
)

<All keys matched successfully>

In [8]:
# Preprocessing preset that ships with the ImageNet-1k ViT-B/32 weights; used
# as the default transform in process_img.
_vit_weights = torchvision.models.ViT_B_32_Weights.IMAGENET1K_V1
vit_transforms = _vit_weights.transforms()

In [6]:
def locate_index(img, frame_width=None, frame_height=None, margin=100,
                 landmark_index=8, hand_detector=None):
    """Return a square crop window around one landmark of the first detected hand.

    Args:
        img: Image passed to the detector's ``detect`` method (an ``mp.Image``
            in this notebook).
        frame_width: Frame width in pixels used to scale the normalized x
            coordinate. Defaults to ``img.width`` when available, else 1920.
        frame_height: Frame height in pixels used to scale the normalized y
            coordinate. Defaults to ``img.height`` when available, else 1080.
        margin: Half-size of the window in pixels (100 reproduces the
            original ``20 + 80`` padding).
        landmark_index: Index of the landmark to centre on (8 by default).
        hand_detector: Object exposing ``detect(img)``; defaults to the
            notebook-level ``detector``.

    Returns:
        ``(x_min, x_max, y_min, y_max)`` as plain Python ints, each clamped to
        the frame so they are always safe to use as slice bounds.

    Raises:
        IndexError: If the detector reports no hand in the image.
    """
    if hand_detector is None:
        hand_detector = detector
    if frame_width is None:
        frame_width = getattr(img, "width", 1920)
    if frame_height is None:
        frame_height = getattr(img, "height", 1080)

    detection_result = hand_detector.detect(img)
    if not detection_result.hand_landmarks:
        raise IndexError("no hand detected in image")

    # Landmark coordinates are normalized, so scale them to pixels.
    tip = detection_result.hand_landmarks[0][landmark_index]
    x = int(tip.x * frame_width)
    y = int(tip.y * frame_height)

    # Clamp BOTH edges into the frame: a landmark predicted outside the image
    # would otherwise yield a negative upper bound, which wraps around when
    # used as a slice index.
    x_min = min(max(x - margin, 0), frame_width)
    x_max = max(min(x + margin, frame_width), 0)
    y_min = min(max(y - margin, 0), frame_height)
    y_max = max(min(y + margin, frame_height), 0)

    return x_min, x_max, y_min, y_max

In [9]:
def process_img(img, transforms=vit_transforms):
    """Turn an image array into a single-item batch for the model.

    The array's axes are reordered with ``transpose((2, 0, 1))`` (last axis
    moved to the front), the result is wrapped in a tensor and passed through
    ``transforms``, and a leading axis of size one is added to the output.
    """
    channels_first = img.transpose((2, 0, 1))
    transformed = transforms(torch.tensor(channels_first))
    return transformed[np.newaxis]

In [10]:
# Open the default camera and fail loudly if it is unavailable; otherwise the
# prediction loop would only ever receive empty frames.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open camera 0")

# Request 1920x1080 MJPG at 30 FPS. These are requests only: each cap.set call
# returns a bool, and the driver may still deliver a different format.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'))
cap.set(cv2.CAP_PROP_FPS, 30)

True

In [11]:
window_name = "Touch prediction"
disp_x = 480 * 2
disp_y = 384 * 2
aspect_ratio = 480 / 384
# Show a black placeholder so the window exists before the first frame arrives.
disp_frame = np.zeros((disp_y, disp_x, 3), dtype=np.uint8)
cv2.imshow(window_name, disp_frame)

# Inference mode is required: the head contains BatchNorm1d, which cannot
# compute batch statistics from the single-sample batches used here.
model.eval()

# Give up after this many consecutive failed reads instead of spinning forever.
MAX_FAILED_READS = 100
failed_reads = 0
last_error = None

try:
    while True:
        ret, camera_frame = cap.read()
        if not ret or camera_frame is None:
            failed_reads += 1
            if failed_reads >= MAX_FAILED_READS:
                print(f"Camera returned no frame {MAX_FAILED_READS} times in a row; stopping.")
                break
            # Keep servicing the GUI so 'q' still works while the camera is silent.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            continue
        failed_reads = 0

        rgb_frame = cv2.cvtColor(camera_frame, cv2.COLOR_BGR2RGB)
        img = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

        try:
            x_min, x_max, y_min, y_max = (int(v) for v in locate_index(img))

            # NOTE(review): the crop comes from the BGR camera frame, as in the
            # original cell -- confirm this matches the channel order used in
            # training.
            crop = camera_frame[y_min:y_max, x_min:x_max]
            if crop.size > 0:
                with torch.no_grad():
                    logit = model(process_img(crop).to(device))
                # Same decision as round(sigmoid(logit)) == 1.
                is_touch = torch.sigmoid(logit).item() > 0.5

                cv2.rectangle(camera_frame, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

                text = "Touch" if is_touch else "No Contact"
                text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
                text_x = x_min
                # Position text above the bounding box, but never above the frame.
                text_y = max(y_min - 10, text_size[1] + 4)
                cv2.rectangle(
                    camera_frame,
                    (text_x, text_y - text_size[1] - 4),  # Text background
                    (text_x + text_size[0] + 4, text_y + 4),
                    (0, 0, 255),
                    -1,
                )
                cv2.putText(
                    camera_frame,
                    text,
                    (text_x, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6,
                    (255, 255, 255),
                    2,
                )
        except IndexError:
            # locate_index found no hand in this frame: show the plain frame.
            pass
        except Exception as exc:
            # Keep the video running, but report each distinct failure once
            # instead of silently hiding it.
            message = f"{type(exc).__name__}: {exc}"
            if message != last_error:
                print(f"Prediction failed: {message}")
                last_error = message

        cv2.imshow(window_name, camera_frame)
        keycode = cv2.waitKey(1) & 0xFF

        if keycode == ord('q'):
            break
finally:
    # Close the window even if the cell is interrupted. The capture device is
    # left open so this cell can be re-run without re-creating `cap`.
    cv2.destroyAllWindows()



In [31]:
# Manual cleanup cell: close any OpenCV windows that are still open.
cv2.destroyAllWindows()

In [22]:
# Scan samples 0..28 of the "no_contact" set and stop at the first image in
# which the detector reports more than one hand. After the loop `out` holds
# the result for the last image that was processed.
# NOTE(review): the dataset location is an absolute, machine-specific path.
for i in range(29):
    sample_path = f"/Users/brianchen/Research/grasp-detection/touch_dataset_new/no_contact/sample_0_{i}.jpeg"
    camera_frame = cv2.imread(sample_path)
    if camera_frame is None:
        # cv2.imread returns None for a missing or unreadable file; skip it
        # instead of crashing in cvtColor.
        print(f"Skipping unreadable image: {sample_path}")
        continue
    rgb_frame = cv2.cvtColor(camera_frame, cv2.COLOR_BGR2RGB)
    img = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
    out = detector.detect(img)
    if len(out.hand_landmarks) > 1:
        break

In [24]:
# Display the detection result left over from the sample scan above.
out

HandLandmarkerResult(handedness=[[Category(index=1, score=0.981307864189148, display_name='Left', category_name='Left')], [Category(index=0, score=0.9776174426078796, display_name='Right', category_name='Right')]], hand_landmarks=[[NormalizedLandmark(x=0.3320728540420532, y=0.25143375992774963, z=7.14436438897792e-08, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.3079882860183716, y=0.4028165638446808, z=-0.03159911185503006, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.25680476427078247, y=0.5206488370895386, z=-0.046158984303474426, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.20127686858177185, y=0.5825384855270386, z=-0.06043625995516777, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.14470000565052032, y=0.6261320114135742, z=-0.07364130765199661, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.15390534698963165, y=0.4507398009300232, z=-0.00015102715406101197, visibility=0.0, presence=0.0), NormalizedLandmark(x=0.0877712294459343, y=0.512856