In [1]:
import os
import numpy as np
import pandas as pd
from utils import model_utils

from signlens.params import *
from signlens.preprocessing.data import load_data_subset_csv, load_glossary, load_video_list_json, load_landmarks_json, load_relevant_data_subset_per_landmark_type_from_json
from signlens.preprocessing.preprocess import group_pad_sequences, encode_labels, decode_labels, pad_and_preprocess_sequence, reshape_processed_data_to_tf
from utils.model_utils import load_model
from utils.video_utils import process_video_to_landmarks_json, draw_landmarks_on_image
from utils.plot_landmarks import plot_landmarks_2D, plot_interactive_landmark_frames_2D_from_dict

%load_ext autoreload

%autoreload 2


2024-03-24 17:01:43.171381: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the evaluation subset from the CSV at TRAIN_TEST_CSV_PATH (presumably provided by
# the `from signlens.params import *` wildcard import — TODO confirm). `balanced=True`
# asks the loader to balance the classes; its log output reports each filtering step.
test_data = load_data_subset_csv(balanced=True, csv_path=TRAIN_TEST_CSV_PATH)

[34mLoading data subset from train_test.csv[0m
    ℹ️ Filtered sequences with missing frames. Size reduced from 17233 to 17233 (100.00%)
    ℹ️ Filtered on n_frames = 100. Size reduced from 17233 to 17233 (100.00%)
    ℹ️ Filtered on n_classes = 10. Size reduced from 17233 to 690 (4.00%)
    ℹ️ Balanced data, with average of 34.5 elements per class. Size reduced from 690 to 345 (50.00%)
✅ Loaded 345 rows (2.00% of the original 17233 rows) from the dataset.


In [3]:
# Show the loaded subset as the cell output (pandas truncates long frames in the display).
test_data

Unnamed: 0,path,participant_id,sequence_id,sign,n_frames,n_frames2,file_path
0,train_landmark_files_noface/22343/1963578447.p...,22343,1963578447,book,27,27,/home/bfrisque/code/benoitfrisque/signlens/raw...
1,train_landmark_files_noface/34503/2196332453.p...,34503,2196332453,who,18,18,/home/bfrisque/code/benoitfrisque/signlens/raw...
2,train_landmark_files_noface/22343/2148840772.p...,22343,2148840772,all,25,25,/home/bfrisque/code/benoitfrisque/signlens/raw...
3,train_landmark_files_noface/36257/3595687867.p...,36257,3595687867,before,11,11,/home/bfrisque/code/benoitfrisque/signlens/raw...
4,train_landmark_files_noface/37055/1793144608.p...,37055,1793144608,yes,27,27,/home/bfrisque/code/benoitfrisque/signlens/raw...
...,...,...,...,...,...,...,...
340,train_landmark_files_noface/61333/1335890724.p...,61333,1335890724,fine,29,29,/home/bfrisque/code/benoitfrisque/signlens/raw...
341,train_landmark_files_noface/62590/1083816707.p...,62590,1083816707,go,16,16,/home/bfrisque/code/benoitfrisque/signlens/raw...
342,train_landmark_files_noface/62590/541074180.pa...,62590,541074180,book,18,18,/home/bfrisque/code/benoitfrisque/signlens/raw...
343,train_landmark_files_noface/16069/728399657.pa...,16069,728399657,go,7,7,/home/bfrisque/code/benoitfrisque/signlens/raw...


# Model prediction

In [32]:
# Load the saved model identified by `model_name`; the second return value is not needed here.
model_name = "model 20240322-173411"
model, _ = load_model(model_name)

# Take the landmark files of the first two rows of the test subset and turn them
# into padded model input.
sample_paths = test_data["file_path"].iloc[:2]
processed_data = group_pad_sequences(sample_paths)

# The input is handed to predict() wrapped in a one-element list.
prediction = model.predict([processed_data])

# Convert the raw model output into labels and their associated probabilities.
# NOTE(review): in the recorded run the labels are ['who', 'all'] while rows 0 and 1 of
# test_data are labelled 'book' and 'who' — worth checking before trusting this model.
word, proba = decode_labels(prediction)

(word, proba)


[34m
Load latest model from local registry...[0m
[34m
Load latest model from disk...[0m
✅ Model loaded from local disk /home/bfrisque/code/benoitfrisque/signlens/training_outputs/model 20240322-173411


  self.pid = os.fork()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step


(['who', 'all'], array([0.37138933, 0.16871023], dtype=float32))

# Video conversion to landmarks

In [5]:
# Load the video index and display it; per the rendered output each row carries the
# sign label, source metadata and a local `video_path`.
videos = load_video_list_json()
videos

Unnamed: 0,sign,bbox,fps,frame_end,frame_start,instance_id,signer_id,source,split,url,variation_id,video_id,video_path
0,airplane,"[605, 21, 1721, 1076]",25,-1,1,10,4,signschool,train,https://signstock.blob.core.windows.net/signsc...,0,01726,/home/bfrisque/code/benoitfrisque/signlens/raw...
1,table,"[374, 52, 810, 720]",25,-1,1,13,38,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,0,56556,/home/bfrisque/code/benoitfrisque/signlens/raw...
2,see,"[85, 15, 230, 192]",25,-1,1,8,11,signingsavvy,train,https://www.signingsavvy.com/signs/mp4/8/8396.mp4,0,50125,/home/bfrisque/code/benoitfrisque/signlens/raw...
3,who,"[165, 4, 472, 370]",25,-1,1,14,88,aslsignbank,train,https://aslsignbank.haskins.yale.edu/dictionar...,0,66778,/home/bfrisque/code/benoitfrisque/signlens/raw...
4,dog,"[417, 61, 834, 720]",25,-1,1,7,38,startasl,train,https://s3-us-west-1.amazonaws.com/files.start...,1,17086,/home/bfrisque/code/benoitfrisque/signlens/raw...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1471,arm,"[205, 37, 489, 370]",25,-1,1,3,89,aslsignbank,val,https://aslsignbank.haskins.yale.edu/dictionar...,0,65094,/home/bfrisque/code/benoitfrisque/signlens/raw...
1472,say,"[104, 0, 528, 480]",25,-1,1,4,13,asldeafined,val,https://media.asldeafined.com/vocabulary/14687...,0,49430,/home/bfrisque/code/benoitfrisque/signlens/raw...
1473,ear,"[296, 36, 879, 720]",25,-1,1,1,118,aslbrick,train,http://aslbricks.org/New/ASL-Videos/ear.mp4,0,69306,/home/bfrisque/code/benoitfrisque/signlens/raw...
1474,closet,"[64, 14, 260, 240]",25,-1,1,6,26,spreadthesign,train,https://media.spreadthesign.com/video/mp4/13/3...,0,11284,/home/bfrisque/code/benoitfrisque/signlens/raw...


In [26]:
# Select the local path of the first video labelled 'book'.
# (.iloc[0] raises IndexError if no video carries that label.)
is_book = videos["sign"] == 'book'
video_path = videos.loc[is_book, "video_path"].iloc[0]
video_path


'/home/bfrisque/code/benoitfrisque/signlens/raw_data/WLASL/videos/07070.mp4'

In [27]:
# Run landmark extraction on the selected video. In the recorded run this writes a
# landmarks JSON and an annotated video to disk (see the messages printed by the call).
# frame_interval=1 presumably means every frame is processed — TODO confirm in video_utils.
json_data = process_video_to_landmarks_json(
    video_path,
    json_output=True,
    save_annotated_video=True,
    show_preview=True,
    frame_interval=1,
)

✅ Landmarks saved to '/home/bfrisque/code/benoitfrisque/signlens/processed_data/landmarks_videos/07070_landmarks.json'
✅ Annotated video saved to '/home/bfrisque/code/benoitfrisque/signlens/processed_data/landmarks_videos/07070_annotated.mp4'


In [28]:
# Build the landmarks path from the selected video instead of hard-coding a
# user-specific absolute path, so this cell follows `video_path` and is not tied to
# one machine's home directory.
# Assumes the layout visible in the recorded outputs:
#   <project>/raw_data/WLASL/videos/<video_id>.mp4
#   <project>/processed_data/landmarks_videos/<video_id>_landmarks.json
# NOTE(review): if signlens.params exposes a constant for the landmarks output
# directory, prefer it over this derivation.
video_id = os.path.splitext(os.path.basename(video_path))[0]
project_root = os.path.normpath(
    os.path.join(os.path.dirname(video_path), os.pardir, os.pardir, os.pardir)
)
landmarks_json_path = os.path.join(
    project_root, 'processed_data', 'landmarks_videos', f'{video_id}_landmarks.json'
)

# Load the extracted landmarks, pad/preprocess them, and reshape them into the
# tensor handed to the model (the recorded run shows shape (1, 100, 225)).
landmarks = load_landmarks_json(landmarks_json_path)
data_processed = pad_and_preprocess_sequence(landmarks)
data_tf = reshape_processed_data_to_tf(data_processed)
data_tf.shape

TensorShape([1, 100, 225])

In [29]:
# Load the saved model identified by `model_name`; the second return value is discarded.
# NOTE(review): this repeats the model name and load from the "Model predict" cell
# earlier in the notebook; `word` and `proba` from that cell are overwritten here.
model_name = "model 20240322-173411"
model, _ = load_model(model_name)

# Run the model on the tensor built from the video landmarks.
prediction = model.predict(data_tf)

# Convert the raw model output into a label and its associated probability.
# NOTE(review): the recorded run yields 'who' (~0.19) although the video was selected
# with sign == 'book' — the prediction does not match the expected label.
word, proba = decode_labels(prediction)

word, proba


[34m
Load latest model from local registry...[0m
[34m
Load latest model from disk...[0m
✅ Model loaded from local disk /home/bfrisque/code/benoitfrisque/signlens/training_outputs/model 20240322-173411
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step


(['who'], array([0.18926598], dtype=float32))

## Check landmarks from JSON

In [30]:
# Reload the same landmarks JSON, this time through the per-landmark-type loader;
# the result is passed to the dict-based plotting helper in the next cell.
landmarks_dict = load_relevant_data_subset_per_landmark_type_from_json(landmarks_json_path)

In [31]:
# Render the interactive 2D landmark viewer (a widget with a frame-index slider in the
# recorded output). The trailing `;` suppresses the repr of the call's return value.
plot_interactive_landmark_frames_2D_from_dict(landmarks_dict);

interactive(children=(SelectionSlider(description='Frame index', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…

## Annotate landmarks on video

### Previous version of mediapipe (used in this project)

In [20]:
# Run the landmark extraction again with preview and annotated-video output; json_output
# and frame_interval are left at the function's defaults. This overwrites `json_data`
# from the earlier extraction cell.
# NOTE(review): the recorded output names 07074, whereas the `video_path` displayed
# earlier in the notebook is 07070.mp4, and this cell's execution count (20) precedes the
# cell that sets video_path (26) — the saved output comes from an out-of-order run.
json_data = process_video_to_landmarks_json(video_path, show_preview=True, save_annotated_video=True)

✅ Landmarks saved to '/home/bfrisque/code/benoitfrisque/signlens/processed_data/landmarks_videos/07074_landmarks.json'
✅ Annotated video saved to '/home/bfrisque/code/benoitfrisque/signlens/processed_data/landmarks_videos/07074_annotated.mp4'


### New version of mediapipe (Tasks API) - hand landmarker (not used)

In [None]:
# !wget -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task

In [None]:
# #@markdown We implemented some functions to visualize the hand landmark detection results. <br/> Run the following cell to activate the functions.

# from mediapipe import solutions
# from mediapipe.framework.formats import landmark_pb2
# import numpy as np

# MARGIN = 10  # pixels
# FONT_SIZE = 1
# FONT_THICKNESS = 1
# HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

# def draw_landmarks_on_image_new(rgb_image, detection_result):
#   hand_landmarks_list = detection_result.hand_landmarks
#   handedness_list = detection_result.handedness
#   annotated_image = np.copy(rgb_image)

#   # Loop through the detected hands to visualize.
#   for idx in range(len(hand_landmarks_list)):
#     hand_landmarks = hand_landmarks_list[idx]
#     handedness = handedness_list[idx]

#     # Draw the hand landmarks.
#     hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
#     hand_landmarks_proto.landmark.extend([
#       landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
#     ])
    
#     solutions.drawing_utils.draw_landmarks(
#       annotated_image,
#       hand_landmarks_proto,
#       solutions.hands.HAND_CONNECTIONS,
#       solutions.drawing_styles.get_default_hand_landmarks_style(),
#       solutions.drawing_styles.get_default_hand_connections_style())

#     # Get the top left corner of the detected hand's bounding box.
#     height, width, _ = annotated_image.shape
#     x_coordinates = [landmark.x for landmark in hand_landmarks]
#     y_coordinates = [landmark.y for landmark in hand_landmarks]
#     text_x = int(min(x_coordinates) * width)
#     text_y = int(min(y_coordinates) * height) - MARGIN

#     # Draw handedness (left or right hand) on the image.
#     cv2.putText(annotated_image, f"{handedness[0].category_name}",
#                 (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
#                 FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

#   return annotated_image

In [None]:
# import cv2
# import numpy as np
# import mediapipe as mp
# from mediapipe import solutions
# from mediapipe.framework.formats import landmark_pb2
# from mediapipe.tasks import python
# from mediapipe.tasks.python import vision

# base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
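# # NOTE(review): running_mode is never passed to HandLandmarkerOptions below, so the
# # detector is created without it; the loop calls detect() and frame_timestamp_ms is unused.
# running_mode =  vision.RunningMode('VIDEO')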
# options = vision.HandLandmarkerOptions(base_options=base_options,
#                                        num_hands=2)
# detector = vision.HandLandmarker.create_from_options(options)

# cap = cv2.VideoCapture(video_path)

# # Load the frame rate of the video using OpenCV's CAP_PROP_FPS
# fps = cap.get(cv2.CAP_PROP_FPS)

# while cap.isOpened():
#     ret, frame = cap.read()
#     if not ret:
#         break
    
#     # Convert the frame to RGB
#     rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
#     # Convert the frame to MediaPipe's Image object
#     image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
    
#     # Get the timestamp for the current frame
#     frame_timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)

#     # STEP 4: Detect hand landmarks from the input image.
#     detection_result = detector.detect(image)

#     # STEP 5: Process the detection result. In this case, visualize it.
#     annotated_image = draw_landmarks_on_image_new(image.numpy_view(), detection_result)
#     annotated_image_color = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

#     cv2.imshow('Annotated Image', annotated_image_color)
   
    
#     if cv2.waitKey(1) & 0xFF == ord('q'):
#          break

# # Release the video capture object and close all windows
# cap.release()
# cv2.destroyAllWindows()
