In [1]:
# Class labels for the sign classifier. Position in this list is the class
# index: the network's output layer is sized with len(actions) and predicted
# indices are looked up here to get the letter.
actions = ["أ", "ب", "ت", "ث", "ج"]
len(actions)

5

In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Target image size (height, width) and the per-channel statistics used by
# Normalize below.
h, w = 128, 128
mean = [0.43216, 0.394666, 0.37645]
std = [0.22803, 0.22145, 0.216989]

# Preprocessing applied to every PIL image before it reaches the network.
transformer = transforms.Compose([
    # Resize expects its size as (height, width). Passing (w, h) only gives
    # the right result while both sides are equal.
    transforms.Resize((h, w)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# VGG16 whose last classifier layer is replaced by a linear layer with one
# output per entry in `actions`.
testing_model = torchvision.models.vgg16(pretrained=False)
num_ftrs = testing_model.classifier[-1].in_features
testing_model.classifier[-1] = nn.Linear(num_ftrs, len(actions))

# map_location="cpu" remaps the stored tensors to the CPU so the checkpoint
# loads even when the device it was saved from is not available here.
best_weights = torch.load("weights/weights_img_c5_v2.pth", map_location="cpu")
testing_model.load_state_dict(best_weights)

<All keys matched successfully>

In [3]:
class Predictor:
    """Single-image classifier wrapper: preprocess, forward pass, arg-max."""

    def __init__(self, model, transformer):
        """Keep references to the network and the preprocessing callable.

        Args:
            model: callable module exposing ``eval()``; invoked on a batch.
            transformer: callable turning one input image into a tensor.
        """
        self.transformer = transformer
        self.model = model

    def predict(self, image):
        """Return the index of the highest-scoring model output for ``image``.

        The image is run through ``self.transformer``, given a leading batch
        dimension of size one, and passed to the model in eval mode with
        gradient tracking disabled.

        Returns:
            The arg-max over the (flattened) model output as a Python int.
        """
        # Add the batch axis the model expects: (...) -> (1, ...).
        batch = self.transformer(image).unsqueeze(0)

        self.model.eval()
        with torch.no_grad():
            scores = self.model(batch)

        return torch.argmax(scores).item()

In [8]:
import numpy as np
from PIL import Image
import cv2
import operator
import arabic_reshaper
from bidi.algorithm import get_display
import mediapipe as mp

class LettersPredictor:
    """Turn a stream of video frames into letters and words.

    Frames in which MediaPipe detects a hand are classified and tallied in
    ``predictions_count``. ``predict_letter`` converts the current tally into
    one letter, and ``compine_letters`` joins the collected letters into a
    display-ready word.

    Attributes:
        predictor: object exposing ``predict(pil_image) -> int``.
        actions: sequence mapping a class index to its letter.
        letters: letters collected for the word currently being built.
        predictions: most recent letter-level class indices (bounded length).
        words: every word produced so far, already shaped for display.
        predictions_count: class index -> number of frames voting for it
            since the last letter was emitted.
        hands: MediaPipe hand detector used to decide whether a frame counts.
    """

    # Maximum number of letter-level predictions retained in `predictions`.
    HISTORY_SIZE = 16

    def __init__(self, predictor, actions):
        """Store collaborators, reset all buffers and create the hand detector."""
        self.predictor = predictor
        self.actions = actions
        self.letters = []
        self.predictions = []
        self.words = []
        self.predictions_count = dict()
        self.hands = mp.solutions.hands.Hands(static_image_mode=False, min_detection_confidence=0.7, min_tracking_confidence=0.7, max_num_hands=1)

    def predict(self, frame):
        """Classify ``frame`` if it shows a hand and add the result to the tally.

        Returns:
            True when a hand was detected and the frame was counted,
            False when the frame was ignored.
        """
        if not self.considered_frame(frame):
            return False

        # NOTE(review): the frame goes to PIL exactly as received, while
        # considered_frame() treats the same array as BGR. If the classifier
        # was trained on RGB images the channels are swapped here - confirm
        # against the training pipeline before changing.
        sign_idx = self.predictor.predict(Image.fromarray(frame))
        self._record_prediction(sign_idx)
        return True

    def _record_prediction(self, sign_idx):
        """Add one vote for ``sign_idx``; the first vote counts as 1, not 0."""
        self.predictions_count[sign_idx] = self.predictions_count.get(sign_idx, 0) + 1

    def considered_frame(self, frame):
        """Return True when MediaPipe finds at least one hand in ``frame``.

        ``frame`` is converted with COLOR_BGR2RGB before detection, so it is
        expected in OpenCV's BGR channel order.
        """
        # cvtColor returns a new array, so the caller's frame is not modified.
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Presumably a performance hint for MediaPipe (read-only input can be
        # passed by reference) - see the MediaPipe Hands examples.
        image.flags.writeable = False
        results = self.hands.process(image)
        return results.multi_hand_landmarks is not None

    def compine_letters(self):
        """Join the collected letters into a word and reset the letter state.

        A non-empty word is reshaped and reordered for display
        (``arabic_reshaper`` + ``get_display``) and appended to ``self.words``.
        ``letters`` and ``predictions_count`` are cleared in every case.

        Returns:
            ``(self.words, word)`` where ``word`` is the display-ready word,
            or ``""`` when no letters had been collected.
        """
        word = "".join(self.letters)
        if word:
            word = get_display(arabic_reshaper.reshape(word))
            self.words.append(word)

        self.letters.clear()
        self.predictions_count.clear()
        return self.words, word

    # Correctly spelled alias; the original name is kept for existing callers.
    combine_letters = compine_letters

    def predict_letter(self):
        """Emit one letter from the current tally, then reset the tally.

        The class index with the most votes (first one seen on ties) is
        appended to ``predictions`` and its letter to ``letters``. Does
        nothing when no frame has been counted since the last call.
        """
        counts = self.predictions_count
        if not counts:
            return

        sign_idx = max(counts, key=counts.get)
        self.predictions.append(sign_idx)
        self.predictions = self.predictions[-self.HISTORY_SIZE:]

        # NOTE(review): the letter is appended unconditionally, so holding one
        # sign across several voting windows repeats it. Confirm whether
        # consecutive duplicates should be collapsed.
        self.letters.append(self.actions[sign_idx])
        counts.clear()
    

In [9]:
from PIL import Image, ImageFont, ImageDraw
import cv2
import arabic_reshaper
from bidi.algorithm import get_display

predictor = Predictor(testing_model, transformer)
letters_predictor = LettersPredictor(predictor, actions)

fontpath = "arial.ttf" # <== https://www.freefontspro.com/14454/arial.ttf
font = ImageFont.truetype(fontpath, 32)

# Tunables for the capture loop.
FRAMES_PER_LETTER = 16      # counted (hand-visible) frames that trigger a letter
IDLE_FRAMES_PER_WORD = 6    # consecutive hand-less frames that close the word
BANNER_HEIGHT = 40          # height in pixels of the strip the words are drawn on

counter = 0            # frames with a hand since the last letter / word
discarded_frames = 0   # consecutive frames without a hand
words = []

cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and the window are released even when a
# frame fails to process or the cell is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        flag = letters_predictor.predict(frame)

        if flag:
            counter += 1
            discarded_frames = 0
        else:
            discarded_frames += 1

        # The hand has been away long enough: close the current word.
        if discarded_frames == IDLE_FRAMES_PER_WORD:
            discarded_frames = 0
            counter = 0
            words, word = letters_predictor.compine_letters()

        # Enough votes collected: turn them into one letter.
        if counter == FRAMES_PER_LETTER:
            counter = 0
            letters_predictor.predict_letter()

        # Banner spans the full frame width rather than a fixed 640 pixels.
        cv2.rectangle(frame, (0, 0), (frame.shape[1], BANNER_HEIGHT), (245, 117, 16), -1)

        # The words are drawn through PIL with the TrueType font loaded above
        # (cv2.putText below only uses OpenCV's built-in Hershey fonts).
        img_pil = Image.fromarray(frame)
        draw = ImageDraw.Draw(img_pil)
        draw.text((0, 0), ' '.join(reversed(words)), font = font)
        frame = np.array(img_pil)

        cv2.putText(frame, "Predict One Letter: " + str(counter), (0, 85+1*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 250, 150), 2, cv2.LINE_8)
        cv2.putText(frame, "Compine Letters: " + str(discarded_frames), (0, 85+2*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 250, 150), 2, cv2.LINE_8)
        cv2.imshow("Frame", frame)

        # Wait up to 50 ms for a key press; "q" stops the loop.
        if cv2.waitKey(50) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup cell: releases the capture device and closes all OpenCV
# windows. Requires `cap` and `cv2` from the capture cell to be in scope.
cap.release()
cv2.destroyAllWindows()