In [1]:
actions = [
    "aleff",
    "bb",
    "taa",
    "thaa",
]
len(actions)

4

In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

h, w = 128, 128
mean = [0.43216, 0.394666, 0.37645]
std = [0.22803, 0.22145, 0.216989]

transformer = transforms.Compose([
    transforms.Resize((w, h)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# testing_model = torchvision.models.vgg16(pretrained=False)
# num_ftrs = testing_model.classifier[-1].in_features
# testing_model.classifier[-1] = nn.Linear(num_ftrs, 4)

# best_checkpoint = torch.load("weights/weights_img_c4_v1.tar")
# testing_model.load_state_dict(best_checkpoint["model_state_dict"])


testing_model = torchvision.models.vgg16(pretrained=False)
num_ftrs = testing_model.classifier[-1].in_features
testing_model.classifier[-1] = nn.Linear(num_ftrs, 4)

best_weights = torch.load("semi-final/weights_img_c4_v1.pth")
testing_model.load_state_dict(best_weights)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
from detecto.core import Model
import numpy as np
import cv2

class Predictor:
    def __init__(self, model, device, transformer):
        self.model = model.to(device)
        self.device = device
        self.transformer = transformer

    def predict(self, image):
        image = self.transformer(image)
        image = torch.stack([image]).to(device)

        with torch.no_grad():
            self.model.eval()
            res = self.model(image)
            best_res = res.argmax()

            return best_res.item()

class HandDetector:
    def __init__(self, path_to_weights, classes):
        self.model = Model(classes)
        self.model = self.model.load(path_to_weights, classes)

    def __predict(self, image):
        predictions = self.model.predict_top(image)
        labels, boxes, scores = predictions[0], predictions[1], predictions[2]

        if len(scores) > 0:
            best_pred = scores.argmax()
            label = labels[best_pred]
            box = boxes[best_pred]
            score = scores[best_pred]
            return label, box.round().int().numpy(), score.numpy()
        else:
            return "", np.zeros((0)), None
    
    def crop_hand(self, image, size=(512, 512)):
        _, bbox, _ = self.__predict(image)
        if len(bbox) == 0:
            return None
        left, top, right, bot = bbox
        image = image[top:bot, left:right]
        image = cv2.resize(image, size)
        return image

    def draw_bbox(self, image, color=(255, 0, 0), thickness=1):
        _, bbox, _ = self.__predict(image)
        if len(bbox) == 0:
            return None
        top_left, bot_right = (bbox[0], bbox[1]), (bbox[2], bbox[3])
        return cv2.rectangle(image, top_left, bot_right, color, thickness)
    
    def predict(self, image, color=(255, 0, 0), thickness=1):
        label, bbox, score = self.__predict(image)
        if len(bbox) == 0:
            return None
        top_left, bot_right = (bbox[0], bbox[1]), (bbox[2], bbox[3])
        cv2.putText(image, label + " " + str(np.round(score*100, 2))+"%", (top_left[0], top_left[1]), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 250, 150), 2, cv2.LINE_8)
        return cv2.rectangle(image, top_left, bot_right, color, thickness)



colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        prob = max(0,prob)
        # cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num%3], -1)
        # cv2.putText(output_frame, str(actions[num]), (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (173,27,54), 2, cv2.LINE_AA)
        
    return output_frame

hand_detector = HandDetector("weights\\weights_hand_detection_c4_v2.pth", classes=["aleff", "bb", "taa", "thaa"])
predictor = Predictor(testing_model, device, transformer)

In [6]:
from PIL import Image
import cv2

sentence = []
predictions = []

cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.resize(frame, (640, 480))
    cropped_image = hand_detector.crop_hand(frame)
    if cropped_image is None:
        cropped_image = frame
    else:
        sign_idx = predictor.predict(Image.fromarray(cropped_image))
        predictions.append(sign_idx)
        predictions = predictions[-16:]
        print(predictions)
        if np.unique(predictions[-2:])[0] == sign_idx:
            if len(sentence) > 0 and actions[sign_idx] != sentence[-1]:
                sentence.append(actions[sign_idx])
            else:
                sentence.append(actions[sign_idx])
        sentence = sentence[-5:]

    cv2.rectangle(frame, (0,0), (640, 40), (245, 117, 16), -1)
    cv2.putText(frame, ' '.join(sentence), (3,30), 
                   cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    cv2.imshow("Hand", frame)

    if cv2.waitKey(100) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

[2]
[2, 1]
[2, 1, 1]
[2, 1, 1, 1]
[2, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2]
[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0]
[1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0]
[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0]
[1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0]
[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0]
[1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0]
[1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0]
[2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 

In [5]:
cap.release()
cv2.destroyAllWindows()