In [1]:
actions = [
    "aleff",
    "bb",
    "taa",
    "thaa",
    "jeem"
]
len(actions)

5

In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

h, w = 128, 128
mean = [0.43216, 0.394666, 0.37645]
std = [0.22803, 0.22145, 0.216989]

transformer = transforms.Compose([
    transforms.Resize((w, h)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

testing_model = torchvision.models.vgg16(pretrained=False)
num_ftrs = testing_model.classifier[-1].in_features
testing_model.classifier[-1] = nn.Linear(num_ftrs, len(actions))

best_weights = torch.load("semi-final/weights_img_c5_v1.pth")
testing_model.load_state_dict(best_weights)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
import mediapipe as mp

class Predictor:
    def __init__(self, model, device, transformer):
        self.model = model
        self.device = device
        self.transformer = transformer
    
    def predict(self, image):
        image = self.transformer(image)
        image = torch.stack([image])

        with torch.no_grad():
            self.model.eval()
            res = self.model(image)
            best_res = res.argmax()
        
        return best_res.item()

class Utils:
    hands = mp.solutions.hands.Hands(static_image_mode=False, min_detection_confidence=0.7, min_tracking_confidence=0.7, max_num_hands=1)
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    @staticmethod
    def get_bbox(img, color_type):
        image = img.copy()
        if color_type.lower() == "bgr":
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False
        results = Utils.hands.process(image)
        image.flags.writeable = True
        Utils.draw_styled_landmarks(image, results)

        if results.multi_hand_landmarks is None:
            return image, (0, 0, 0, 0)
        
        h, w, c = image.shape
        x_max, x_min, y_max, y_min = 0, w, 0, h
        hand_landmarks = results.multi_hand_landmarks[0]
        for lm in hand_landmarks.landmark:
            x, y = int(lm.x * w), int(lm.y * h)
            if x > x_max:
                x_max = x
            if x < x_min:
                x_min = x
            if y > y_max:
                y_max = y
            if y < y_min:
                y_min = y

        if x_max + 20 < w:
            x_max += 20
        else:
            x_max = w
        
        if y_max + 20 < h:
            y_max += 20
        else:
            y_max = h
        
        if x_min - 20 > 0:
            x_min -= 20
        else:
            x_min = 0
        
        if y_min - 20 > 0:
            y_min -= 20
        else:
            y_min = 0

        return image, (x_min, y_min, x_max, y_max) # left, top, right, bot

    @staticmethod
    def crop_hand(image, bbox, size=(512, 512)):
        segmented_image = np.zeros_like(image)
        if bbox != (0, 0, 0, 0):
            left, top, right, bot = bbox
            segmented_image[top:bot, left:right] = image[top:bot, left:right]
            image = image[top:bot, left:right]
        image = cv2.resize(image, size)
        return image, segmented_image
    
    @staticmethod
    def draw_styled_landmarks(image, results):
        # Draw right hand connections
        if results.multi_hand_landmarks != None:
            for handLandmarks in results.multi_hand_landmarks:
                Utils.mp_drawing.draw_landmarks(image, handLandmarks, Utils.mp_hands.HAND_CONNECTIONS)

predictor = Predictor(testing_model, device, transformer)

## **Normal Real-time**

In [32]:
import numpy as np
from PIL import Image
import cv2

sentence = []
predictions = []

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    drawed_landmarks, bbox = Utils.get_bbox(frame, "rgb")
    # print(bbox)
    cropped_img, segmented_image = Utils.crop_hand(frame, bbox)
    if bbox != (0, 0, 0, 0):
        sign_idx = predictor.predict(Image.fromarray(cropped_img))
        predictions.append(sign_idx)
        predictions = predictions[-16:]
        print(predictions)
        if np.unique(predictions[-2:])[0] == sign_idx:
            if len(sentence) > 0 and actions[sign_idx] != sentence[-1]:
                sentence.append(actions[sign_idx])
        else:
            sentence.append(actions[sign_idx])
        sentence = sentence[-5:]
    
    cv2.rectangle(drawed_landmarks, (0,0), (640, 40), (245, 117, 16), -1)
    cv2.putText(drawed_landmarks, ' '.join(sentence), (3,30), 
                   cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    cv2.imshow("Segmentation", segmented_image)
    cv2.imshow("Landmarks", drawed_landmarks)
    cv2.imshow("Hand", cropped_img)

    if cv2.waitKey(50) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

[1]
[1, 1]
[1, 1, 1]
[1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 2]
[1, 1, 1, 1, 1, 1, 2, 1]
[1, 1, 1, 1, 1, 1, 2, 1, 1]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1]
[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]
[1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]
[1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]
[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
[1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [28]:
cap.release()
cv2.destroyAllWindows()

## **Final Real-time**

### **Make decession while processing**

In [152]:
import numpy as np
from PIL import Image
import cv2

sentence = []
predictions = []
predictions_count = dict()
counter = 0

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    drawed_landmarks, bbox = Utils.get_bbox(frame, "rgb")
    cropped_img, segmented_image = Utils.crop_hand(frame, bbox)
    if bbox != (0, 0, 0, 0):
        sign_idx = predictor.predict(Image.fromarray(cropped_img))
        if sign_idx in predictions_count:
            predictions_count[sign_idx] += 1
        else:
            predictions_count[sign_idx] = 0
        counter += 1
    
    if counter == 16:
        counter = 0
        if len(predictions_count) > 0:
                sign_idx = max(predictions_count.items(), key=operator.itemgetter(1))[0]
                predictions.append(sign_idx)
                predictions = predictions[-16:]
        if len(predictions_count) > 0 and np.unique(predictions[-2:])[0] == sign_idx:
            if len(sentence) > 0 and actions[sign_idx] != sentence[-1]:
                sentence.append(actions[sign_idx])
            else:
                sentence.append(actions[sign_idx])
        elif len(predictions_count) > 0:
            sentence.append(actions[sign_idx])
        sentence = sentence[-5:]
        print(predictions_count)
        predictions_count.clear()
    
    
    cv2.rectangle(drawed_landmarks, (0,0), (640, 40), (245, 117, 16), -1)
    cv2.putText(drawed_landmarks, ' '.join(sentence), (3,30), 
                   cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    cv2.putText(drawed_landmarks, str(counter), (0, 85+1*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 250, 150), 2, cv2.LINE_8)
    cv2.imshow("Segmentation", segmented_image)
    cv2.imshow("Landmarks", drawed_landmarks)
    cv2.imshow("Hand", cropped_img)

    if cv2.waitKey(50) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

{3: 1, 2: 13}
{2: 4, 3: 10}
{3: 3, 2: 11}
{2: 11, 1: 3}
{1: 0, 2: 1, 4: 0, 0: 11}
{0: 15}
{0: 3, 4: 0, 2: 10}
{2: 3, 3: 11}
{3: 15}
{3: 3, 2: 11}
{2: 2, 4: 11, 3: 0}
{1: 10, 4: 1, 3: 2}


### **Using simple counter** -> Held "q" to close the windows

In [142]:
import numpy as np
from PIL import Image
import cv2

sentence = []
predictions = []
waiting_counter = 0

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    drawed_landmarks, bbox = Utils.get_bbox(frame, "rgb")
    cropped_img, segmented_image = Utils.crop_hand(frame, bbox)
    if bbox != (0, 0, 0, 0):
        sign_idx = predictor.predict(Image.fromarray(cropped_img))
        predictions.append(sign_idx)
        predictions = predictions[-16:]
        print(predictions)
        if np.unique(predictions[-2:])[0] == sign_idx:
            if len(sentence) > 0 and actions[sign_idx] != sentence[-1]:
                sentence.append(actions[sign_idx])
            else:
                sentence.append(actions[sign_idx])
        else:
            sentence.append(actions[sign_idx])
        sentence = sentence[-5:]
    
    cv2.rectangle(drawed_landmarks, (0,0), (640, 40), (245, 117, 16), -1)
    cv2.putText(drawed_landmarks, ' '.join(sentence), (3,30), 
                   cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    cv2.imshow("Segmentation", segmented_image)
    cv2.imshow("Landmarks", drawed_landmarks)
    cv2.imshow("Hand", cropped_img)

    while waiting_counter < 30:
        ret, frame = cap.read()
        drawed_landmarks, bbox = Utils.get_bbox(frame, "rgb")
        cropped_img, segmented_image = Utils.crop_hand(frame, bbox)
        cv2.putText(drawed_landmarks, str(waiting_counter), (0, 85+1*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 250, 150), 2, cv2.LINE_8)
        cv2.rectangle(drawed_landmarks, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(drawed_landmarks, ' '.join(sentence), (3,30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        waiting_counter += 1

        cv2.imshow("Landmarks", drawed_landmarks)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    waiting_counter = 0

    if cv2.waitKey(50) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

[1]
[1, 1]
[1, 1, 2]
[1, 1, 2, 2]
[1, 1, 2, 2, 1]
[1, 1, 2, 2, 1, 2]
[1, 1, 2, 2, 1, 2, 2]
[1, 1, 2, 2, 1, 2, 2, 2]
[1, 1, 2, 2, 1, 2, 2, 2, 3]
[1, 1, 2, 2, 1, 2, 2, 2, 3, 2]
[1, 1, 2, 2, 1, 2, 2, 2, 3, 2, 3]
[1, 1, 2, 2, 1, 2, 2, 2, 3, 2, 3, 0]
[1, 1, 2, 2, 1, 2, 2, 2, 3, 2, 3, 0, 0]
[1, 1, 2, 2, 1, 2, 2, 2, 3, 2, 3, 0, 0, 1]
[1, 1, 2, 2, 1, 2, 2, 2, 3, 2, 3, 0, 0, 1, 2]
[1, 1, 2, 2, 1, 2, 2, 2, 3, 2, 3, 0, 0, 1, 2, 3]
[1, 2, 2, 1, 2, 2, 2, 3, 2, 3, 0, 0, 1, 2, 3, 2]
[2, 2, 1, 2, 2, 2, 3, 2, 3, 0, 0, 1, 2, 3, 2, 4]
[2, 1, 2, 2, 2, 3, 2, 3, 0, 0, 1, 2, 3, 2, 4, 4]


In [132]:
cap.release()
cv2.destroyAllWindows()

### **With threads**

In [128]:
import numpy as np
from PIL import Image
import cv2
import operator
import threading
import time


def predict_task(frame, data):
    drawed_landmarks, bbox = Utils.get_bbox(frame, "rgb")
    cropped_img, segmented_image = Utils.crop_hand(frame, bbox)
    if bbox != (0, 0, 0, 0):
        sign_idx = predictor.predict(Image.fromarray(cropped_img))
    else:
        sign_idx = -1
    data.append(sign_idx)
    data.append(drawed_landmarks)
    data.append(cropped_img)
    data.append(segmented_image)

sentence = []
predictions = []
sequence = []
predictions_count = dict()

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    sequence.append(frame)
    if len(sequence) == 4:
        t1_data = t2_data = t3_data = t4_data = []
        t1 = threading.Thread(target=predict_task, args=(sequence[0], t1_data))
        t2 = threading.Thread(target=predict_task, args=(sequence[1], t2_data))
        t3 = threading.Thread(target=predict_task, args=(sequence[2], t3_data))
        t4 = threading.Thread(target=predict_task, args=(sequence[3], t4_data))

        t1.start()
        t2.start()
        t3.start()
        t4.start()

        # t1.join()
        # t2.join()
        # t3.join()
        # t4.join()

        while len(t1_data) == 0 and len(t2_data) == 0 and len(t3_data) == 0 and len(t4_data) == 0:
            time.sleep(0.1)
        # print(len(t1_data), len(t2_data), len(t3_data), len(t4_data))
        # time.sleep(0.3)
        signs_idx = [t1_data[0], t2_data[0], t3_data[0], t4_data[0]]

        for sign_idx in signs_idx:
            if sign_idx != -1 and sign_idx in predictions_count:
                predictions_count[sign_idx] += 1
            elif sign_idx != -1:
                predictions_count[sign_idx] = 0

        if len(predictions_count) > 0:
            sign_idx = max(predictions_count.items(), key=operator.itemgetter(1))[0]
            predictions.append(sign_idx)
            predictions = predictions[-16:]
        
        if len(predictions_count) > 0 and np.unique(predictions[-2:])[0] == sign_idx:
            if len(sentence) > 0 and actions[sign_idx] != sentence[-1]:
                sentence.append(actions[sign_idx])
            else:
                sentence.append(actions[sign_idx])
        elif len(predictions_count) > 0:
            sentence.append(actions[sign_idx])
        sentence = sentence[-5:]
        print(predictions_count)
        predictions_count.clear()
        sequence.clear()

        # cv2.imshow("Hand", cropped_img)
    else:
        segmented_image = drawed_landmarks = cropped_img = frame

    cv2.rectangle(frame, (0,0), (640, 40), (245, 117, 16), -1)
    cv2.putText(frame, ' '.join(sentence), (3,30), 
            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    # cv2.imshow("Segmentation", segmented_image)
    cv2.imshow("Landmarks", frame)

    if cv2.waitKey(100) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

{2: 3}
{2: 3}
{2: 3}
{1: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{1: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{3: 3}
{3: 3}
{3: 3}
{1: 3}
{1: 3}
{1: 3}
{1: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{3: 3}
{3: 3}
{3: 3}
{3: 3}
{4: 3}
{4: 3}
{4: 3}
{3: 3}
{3: 3}
{3: 3}
{2: 3}
{2: 3}
{2: 3}
{1: 3}
{1: 3}
{2: 3}
{2: 3}
{2: 3}
{1: 3}
{1: 3}
{2: 3}
{2: 3}
{3: 3}
{4: 3}
{3: 3}
{3: 3}
{3: 3}
{3: 3}
{3: 3}


In [106]:
cap.release()
cv2.destroyAllWindows()

### **Without threads**

In [129]:
import numpy as np
from PIL import Image
import cv2
import operator

sentence = []
predictions = []
sequence = []
predictions_count = dict()

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    sequence.append(frame)
    if len(sequence) == 4:
        for frame in sequence:
            drawed_landmarks, bbox = Utils.get_bbox(frame, "rgb")
            # print(bbox)
            cropped_img, segmented_image = Utils.crop_hand(frame, bbox)
            if bbox != (0, 0, 0, 0):
                sign_idx = predictor.predict(Image.fromarray(cropped_img))
                if sign_idx in predictions_count:
                    predictions_count[sign_idx] += 1
                else:
                    predictions_count[sign_idx] = 0
        if len(predictions_count) > 0:
            sign_idx = max(predictions_count.items(), key=operator.itemgetter(1))[0]
            predictions.append(sign_idx)
            predictions = predictions[-16:]
        
        if len(predictions_count) > 0 and np.unique(predictions[-2:])[0] == sign_idx:
            if len(sentence) > 0 and actions[sign_idx] != sentence[-1]:
                sentence.append(actions[sign_idx])
            else:
                sentence.append(actions[sign_idx])
        elif len(predictions_count) > 0:
            sentence.append(actions[sign_idx])
        sentence = sentence[-5:]
        print(predictions_count)
        predictions_count.clear()
        sequence.clear()

        # cv2.imshow("Hand", cropped_img)
    else:
        segmented_image = drawed_landmarks = cropped_img = frame

    cv2.rectangle(frame, (0,0), (640, 40), (245, 117, 16), -1)
    cv2.putText(frame, ' '.join(sentence), (3,30), 
            cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    # cv2.imshow("Segmentation", segmented_image)
    cv2.imshow("Landmarks", frame)

    if cv2.waitKey(50) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

{}
{}
{1: 3}
{1: 3}
{1: 3}
{1: 3}
{2: 3}
{2: 3}
{2: 3}
{1: 3}
{1: 3}
{1: 3}
{1: 3}
{1: 3}
{1: 3}
{1: 3}
{1: 3}
{1: 2, 2: 0}
{2: 3}
{2: 3}
{3: 3}
{3: 3}
{3: 3}
{4: 3}
{4: 3}
{4: 3}
{4: 3}
{4: 3}
{4: 3}
{4: 3}
{4: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{3: 0, 2: 2}
{3: 2, 2: 0}
{1: 3}
{1: 2, 2: 0}
{2: 0, 1: 2}
{1: 2, 2: 0}
{1: 3}
{1: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 3}
{3: 1, 2: 1}
{3: 3}
{2: 3}
{2: 3}
{2: 3}
{2: 2, 3: 0}


In [49]:
cap.release()
cv2.destroyAllWindows()