In [1]:
# Class labels (Arabic letters). A label's position in this list is the class
# index used to size the classifier head and to map a prediction to a letter.
actions = ["أ", "ب", "ت", "ث", "ج"]
len(actions)

5

In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Target image size fed to the network and the per-channel statistics used by
# Normalize after the image has been converted to a tensor.
h, w = 128, 128
mean = [0.43216, 0.394666, 0.37645]
std = [0.22803, 0.22145, 0.216989]

transformer = transforms.Compose([
    # Resize expects (height, width). The previous (w, h) order only worked
    # because the target is square.
    transforms.Resize((h, w)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# VGG16 without pretrained weights; its last classifier layer is replaced by
# a linear layer with one output per entry in `actions`.
testing_model = torchvision.models.vgg16(pretrained=False)
num_ftrs = testing_model.classifier[-1].in_features
testing_model.classifier[-1] = nn.Linear(num_ftrs, len(actions))

# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# machine without CUDA; load_state_dict then copies the tensors into the model.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
best_weights = torch.load("semi-final/weights_img_c5_v1.pth", map_location="cpu")
testing_model.load_state_dict(best_weights)

<All keys matched successfully>

In [5]:
import mediapipe as mp

class Predictor:
    """Single-image inference wrapper around a model and its preprocessing."""

    def __init__(self, model, transformer):
        """Store the classifier and the callable that turns an image into a tensor."""
        self.model = model
        self.transformer = transformer

    def predict(self, image):
        """Return the index of the highest-scoring model output for ``image``.

        The image is run through ``self.transformer``, given a leading batch
        dimension of size one, and passed to ``self.model`` in eval mode with
        gradient tracking disabled.
        """
        batch = self.transformer(image).unsqueeze(0)

        self.model.eval()
        with torch.no_grad():
            scores = self.model(batch)

        return scores.argmax().item()

import cv2
import mediapipe as mp
import numpy as np

class Utils:
    """Hand-detection helpers built around one shared MediaPipe Hands detector."""

    # Single detector reused by every call; it is created when the class body runs.
    hands = mp.solutions.hands.Hands(static_image_mode=False, min_detection_confidence=0.7, min_tracking_confidence=0.7, max_num_hands=1)
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    # Bounding box returned by get_bbox when no hand was detected.
    NO_HAND_BBOX = (0, 0, 0, 0)

    @staticmethod
    def _padded_span(values, limit, padding):
        """Return ``(low, high)`` covering ``values`` plus ``padding``, clamped to ``[0, limit]``."""
        low = max(min([limit, *values]) - padding, 0)
        high = min(max([0, *values]) + padding, limit)
        return low, high

    @staticmethod
    def get_bbox(img, padding=20):
        """Detect a hand in a BGR frame and return an annotated copy plus its bounding box.

        Args:
            img: Frame in BGR channel order; it is not modified.
            padding: Pixels added on every side of the landmark extent before
                clamping to the frame (default 20).

        Returns:
            ``(image, (left, top, right, bot))`` where ``image`` is a new BGR
            array with the detected landmarks drawn on it. When no hand is
            found the box is ``(0, 0, 0, 0)``.
        """
        # cvtColor allocates a new array, so the caller's frame stays untouched
        # without an explicit copy.
        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        rgb.flags.writeable = False  # read-only while the detector processes it
        results = Utils.hands.process(rgb)
        rgb.flags.writeable = True
        image = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
        Utils.draw_styled_landmarks(image, results)

        # Covers both None and an empty list of detections.
        if not results.multi_hand_landmarks:
            return image, Utils.NO_HAND_BBOX

        h, w = image.shape[:2]
        # Landmark x/y are scaled by the frame width/height to get pixel positions.
        landmarks = results.multi_hand_landmarks[0].landmark
        xs = [int(lm.x * w) for lm in landmarks]
        ys = [int(lm.y * h) for lm in landmarks]

        x_min, x_max = Utils._padded_span(xs, w, padding)
        y_min, y_max = Utils._padded_span(ys, h, padding)

        return image, (x_min, y_min, x_max, y_max) # left, top, right, bot

    @staticmethod
    def crop_hand(image, bbox, size=(512, 512)):
        """Crop ``image`` to ``bbox`` and resize the crop to ``size``.

        Args:
            image: Source frame.
            bbox: ``(left, top, right, bot)`` in pixels.
            size: Output size passed to ``cv2.resize``.

        Returns:
            ``(resized, segmented_image)``. ``segmented_image`` has the shape of
            ``image`` and is zero everywhere except inside the box. When the box
            has no area (including the ``(0, 0, 0, 0)`` "no hand" box) the whole
            frame is resized and ``segmented_image`` stays all zeros.
        """
        segmented_image = np.zeros_like(image)
        left, top, right, bot = bbox
        # An empty crop would make cv2.resize fail, so only crop boxes with area.
        if right > left and bot > top:
            segmented_image[top:bot, left:right] = image[top:bot, left:right]
            image = image[top:bot, left:right]
        image = cv2.resize(image, size)
        return image, segmented_image

    @staticmethod
    def draw_styled_landmarks(image, results):
        """Draw the landmarks and connections of every detected hand onto ``image`` in place."""
        if results.multi_hand_landmarks:
            for handLandmarks in results.multi_hand_landmarks:
                Utils.mp_drawing.draw_landmarks(image, handLandmarks, Utils.mp_hands.HAND_CONNECTIONS)

## **Final Real-time**

### **Make decisions while processing**

In [6]:
import numpy as np
from PIL import Image
import cv2
import operator
import arabic_reshaper
from bidi.algorithm import get_display

class LettersPredictor:
    """Turns per-frame sign predictions into letters and then into words."""

    def __init__(self, predictor, actions):
        """Store the frame classifier and the class-index -> letter table."""
        self.predictor = predictor        # object exposing predict(PIL image) -> class index
        self.actions = actions            # letter for each class index
        self.letters = []                 # letters of the word currently being spelled
        self.predictions = []             # most recent committed class indices (at most 16)
        self.words = []                   # finished words, already reshaped for display
        self.predictions_count = dict()   # class index -> votes since the last commit

    def _count_prediction(self, sign_idx):
        """Add one vote for ``sign_idx`` (the first sighting counts as 1)."""
        self.predictions_count[sign_idx] = self.predictions_count.get(sign_idx, 0) + 1

    def predict(self, frame):
        """Classify the hand in ``frame`` and record a vote for the predicted class.

        Returns:
            ``(hand_found, drawed_landmarks, cropped_img, segmented_image)``.
            ``hand_found`` is False when ``Utils.get_bbox`` returned the
            ``(0, 0, 0, 0)`` box; in that case no prediction is made.
        """
        drawed_landmarks, bbox = Utils.get_bbox(frame)
        cropped_img, segmented_image = Utils.crop_hand(frame, bbox)
        hand_found = bbox != (0, 0, 0, 0)
        if hand_found:
            # NOTE(review): Utils.get_bbox treats the frame as BGR, and the crop is
            # handed to PIL without a channel swap - confirm this matches the
            # channel order the model was trained on.
            sign_idx = self.predictor.predict(Image.fromarray(cropped_img))
            self._count_prediction(sign_idx)
        return hand_found, drawed_landmarks, cropped_img, segmented_image

    def compine_letters(self):
        """Close the current word and reset the per-word state.

        The collected letters are joined; a non-empty word is reshaped with
        ``arabic_reshaper`` and ``get_display`` and appended to ``self.words``.

        Returns:
            ``(self.words, word)`` where ``word`` is the word just closed, or
            ``""`` when no letters had been collected.
        """
        word = "".join(self.letters)
        if word:
            word = get_display(arabic_reshaper.reshape(word))
            self.words.append(word)

        self.letters.clear()
        self.predictions_count.clear()
        return self.words, word

    # Correctly spelled alias; the original name is kept for existing callers.
    combine_letters = compine_letters

    def predict_letter(self):
        """Commit the class with the most votes as the next letter and reset the votes.

        Does nothing except clear the (already empty) votes when no
        predictions were recorded since the last commit.
        """
        if self.predictions_count:
            # Ties go to the class that was seen first (dict iteration order).
            sign_idx = max(self.predictions_count, key=self.predictions_count.get)
            self.predictions.append(sign_idx)
            self.predictions = self.predictions[-16:]
            # NOTE(review): the previous version compared the winner with the last
            # two predictions and with the previous letter, but every branch
            # appended the letter anyway. The append is therefore unconditional
            # here (same behaviour); suppressing repeated letters, if intended,
            # is still unimplemented.
            self.letters.append(self.actions[sign_idx])
        self.predictions_count.clear()
    

In [9]:
import numpy as np
from PIL import Image, ImageFont, ImageDraw
import cv2
import arabic_reshaper
from bidi.algorithm import get_display

predictor = Predictor(testing_model, transformer)
letters_predictor = LettersPredictor(predictor, actions)

fontpath = "arial.ttf" # <== https://www.freefontspro.com/14454/arial.ttf
font = ImageFont.truetype(fontpath, 32)

FRAMES_PER_LETTER = 16   # frames with a detected hand before a letter is committed
MAX_MISSED_FRAMES = 6    # consecutive frames without a hand that close the current word
BANNER_HEIGHT = 40       # height in pixels of the text banner at the top of the frame

counter = 0              # frames with a hand since the last committed letter
discarded_frames = 0     # consecutive frames without a hand
words = []

cap = cv2.VideoCapture(0)
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        flag, drawed_landmarks, cropped_img, segmented_image = letters_predictor.predict(frame)

        if flag:
            counter += 1
            discarded_frames = 0
        else:
            discarded_frames += 1

        # The hand has been gone long enough: close the current word.
        if discarded_frames == MAX_MISSED_FRAMES:
            discarded_frames = 0
            counter = 0
            words, word = letters_predictor.compine_letters()

        # Enough frames collected: commit the most frequent prediction as a letter.
        if counter == FRAMES_PER_LETTER:
            counter = 0
            letters_predictor.predict_letter()

        # Banner spans the full frame width instead of a hard-coded 640 pixels.
        cv2.rectangle(drawed_landmarks, (0, 0), (drawed_landmarks.shape[1], BANNER_HEIGHT), (245, 117, 16), -1)

        # The word text is drawn through PIL with the TrueType font loaded above.
        img_pil = Image.fromarray(drawed_landmarks)
        draw = ImageDraw.Draw(img_pil)
        draw.text((0, 0), ' '.join(reversed(words)), font = font)
        drawed_landmarks = np.array(img_pil)

        cv2.putText(drawed_landmarks, str(counter), (0, 85+1*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 250, 150), 2, cv2.LINE_8)
        cv2.putText(drawed_landmarks, str(discarded_frames), (0, 85+2*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 250, 150), 2, cv2.LINE_8)
        cv2.imshow("Segmentation", segmented_image)
        cv2.imshow("Landmarks", drawed_landmarks)
        cv2.imshow("Hand", cropped_img)

        if cv2.waitKey(50) & 0xFF == ord("q"):
            break
finally:
    # Always free the camera and close the windows, even when the loop raises
    # or the kernel is interrupted.
    cap.release()
    cv2.destroyAllWindows()

In [8]:
# Manual cleanup cell: release the camera handle and close every OpenCV window.
# Run it if the capture-loop cell stopped without releasing the camera.
cap.release()
cv2.destroyAllWindows()