In [2]:
import cv2
import os
import numpy as np
import mediapipe as mp
from matplotlib import pyplot as plt
import time

In [3]:
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

In [4]:
def mediapipe_detection(frame, model):
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    return image, results

In [5]:
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [6]:
def extract_landmarks(results):
    try:
        landmarks = results.pose_landmarks.landmark
    except:
        pass

    return landmarks

In [7]:
def calc_angles(a,b,c):
    a = np.array(a)
    b = np.array(b)
    c = np.array(c)

    radians = np.arctan2(c[1]-b[1], c[0]-b[0]) - np.arctan2(a[1]-b[1], a[0]-b[0])
    angle = np.abs(radians*180.0/np.pi)

    if angle>180.0:
        angle = 360.0 - angle

    return angle

In [8]:
#USING ANGLES

cam = cv2.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.8, min_tracking_confidence=0.5) as holistic:
    while cam.isOpened():
        ret, frame = cam.read()
    
        image, results = mediapipe_detection(frame, holistic)
        
        landmarks = extract_landmarks(results)
        draw_landmarks(image, results)
        
        shoulder = [landmarks[mp_holistic.PoseLandmark.LEFT_SHOULDER.value].x, landmarks[mp_holistic.PoseLandmark.LEFT_SHOULDER.value].y]
        elbow = [landmarks[mp_holistic.PoseLandmark.LEFT_ELBOW.value].x, landmarks[mp_holistic.PoseLandmark.LEFT_ELBOW.value].y]
        wrist = [landmarks[mp_holistic.PoseLandmark.LEFT_WRIST.value].x, landmarks[mp_holistic.PoseLandmark.LEFT_WRIST.value].y]

        angle = calc_angles(shoulder, elbow, wrist)

        cv2.putText(image, str(angle), tuple(np.multiply(elbow ,[640, 480]).astype(int)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 2, cv2.LINE_AA)
        cv2.imshow("Sign Language Converter", cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        #print(results.right_hand_landmarks)
        
        
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cam.release()
cv2.destroyAllWindows()

In [None]:
for i in mp_holistic.PoseLandmark:
    print(i)

In [25]:
landmarks[mp_holistic.PoseLandmark.LEFT_SHOULDER.value]

x: 0.6480722427368164
y: 0.672619640827179
z: -0.6273075342178345
visibility: 0.9991098046302795

In [26]:
landmarks[mp_holistic.PoseLandmark.LEFT_ELBOW.value]

x: 0.7651079297065735
y: 0.9478564858436584
z: -0.45650210976600647
visibility: 0.6694594621658325

In [27]:
landmarks[mp_holistic.PoseLandmark.LEFT_WRIST.value]

x: 0.831436276435852
y: 1.3195998668670654
z: -0.6808949112892151
visibility: 0.14208954572677612

In [34]:
calc_angles(shoulder, elbow, wrist)

167.0804707000996

In [131]:
DATA_PATH = os.path.join('MP_data')
actions = np.array(["thumbsUp", "thumbsDown", "thankYou"])
no_sequences = 30
sequence_length = 30

In [132]:
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

In [133]:
#Record

cam = cv2.VideoCapture(0)

with mp_holistic.Holistic(min_detection_confidence=0.8, min_tracking_confidence=0.5) as holistic:
    
    for action in actions:
        for sequence in range(no_sequences):
            for frame_no in range(sequence_length):
                
                ret, frame = cam.read()
                image, results = mediapipe_detection(frame, holistic)
                draw_landmarks(image, results)
                
                #print(results.right_hand_landmarks)
                #extract_keypoints(results)
                if frame_no == 0:
                    cv2.putText(image, "Collecting Video for {} Video number {}".format(action, sequence), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1, cv2.LINE_AA)
                    cv2.waitKey(2000)
                else:
                     cv2.putText(image, "Collecting Video for {} Video number {}".format(action, sequence), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,0,0), 1, cv2.LINE_AA)

                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_no))
                np.save(npy_path, keypoints)
                
                cv2.imshow("Input Sign Language", cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    cam.release()
                    cv2.destroyAllWindows()

    cam.release()
    cv2.destroyAllWindows()

In [94]:
cam.release()
cv2.destroyAllWindows()     ###OPTIONAL OPTIONAL

In [134]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [135]:
label_map = {label: num for num, label in enumerate(actions)}

In [136]:
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_no in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_no)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [137]:
x = np.array(sequences)
y = np.array(labels)
y = to_categorical(labels).astype(int)

In [138]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.05)

In [100]:
Y_test

array([[1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0]])

In [142]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [143]:
log_dir = os.path.join("Logs")
tb_callback = TensorBoard(log_dir=log_dir)

In [144]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation="relu", input_shape=(30, 258)))
model.add(LSTM(128, return_sequences=True, activation="relu"))
model.add(LSTM(64, return_sequences=False, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(actions.shape[0], activation="softmax")) #will return a list of probability i.e [0.7, 0.2, 0.1] where sum is 1 and action[0] which is thumbsUp is the prediction with 70% sure



In [145]:
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["categorical_accuracy"])

In [161]:
model.fit(X_train, Y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000

KeyboardInterrupt: 

In [147]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 30, 64)            82688     
                                                                 
 lstm_10 (LSTM)              (None, 30, 128)           98816     
                                                                 
 lstm_11 (LSTM)              (None, 64)                49408     
                                                                 
 dense_9 (Dense)             (None, 64)                4160      
                                                                 
 dense_10 (Dense)            (None, 32)                2080      
                                                                 
 dense_11 (Dense)            (None, 3)                 99        
                                                                 
Total params: 237,251
Trainable params: 237,251
Non-tr

In [162]:
res = model.predict(X_test)



In [180]:
np.sum(res)

1.0

In [169]:
actions[np.argmax(res[0])]

'thumbsUp'

In [171]:
actions[np.argmax(Y_test[0])]

'thumbsUp'

In [172]:
model.save("action.h5")

In [173]:
res

array([[1.0000000e+00, 3.6169749e-09, 1.6036412e-11],
       [8.6566509e-17, 1.0000000e+00, 1.4964719e-38],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 2.0073521e-28, 2.2527269e-15],
       [9.9763238e-01, 2.3675766e-03, 2.1207857e-17]], dtype=float32)

In [174]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1, cv2.LINE_AA)
        
    return output_frame

In [184]:
#PREDICTION AND TESTING

sequence = []
sentence = []
predictions = []
threshold = 0.5

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        #print(results)
        
        # Draw landmarks
        draw_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown
thumbsDown