In [1]:
import cv2
import numpy as np
import mediapipe as mp
import pickle

In [2]:
# Short aliases for the two MediaPipe solution modules used by the cells below.
mp_holistic = mp.solutions.holistic # Holistic model (Holistic class and the *_CONNECTIONS / FACEMESH_CONTOURS sets)
mp_drawing = mp.solutions.drawing_utils # Drawing utilities (draw_landmarks, DrawingSpec)

In [3]:
def mediapipe_detection(image, model):
    """Run `model` on one BGR frame and return the frame together with the result.

    Args:
        image: Frame in BGR channel order (the order OpenCV captures in).
        model: Object exposing `process(rgb_image)`, e.g. a Holistic instance.

    Returns:
        Tuple `(bgr_image, results)`: a BGR image obtained by converting the
        processed RGB copy back, and whatever `model.process` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The frame is read-only while the model looks at it, then unlocked again.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [4]:
def draw_landmarks(image, results):
    """Draw face, pose and both hand landmark sets on `image` with default styling.

    Args:
        image: Frame handed to the MediaPipe drawing utility.
        results: Holistic output exposing `face_landmarks`, `pose_landmarks`,
            `left_hand_landmarks` and `right_hand_landmarks`.
    """
    # (landmark list, connection set) pairs, drawn in this order.
    overlays = (
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS),     # face
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),      # pose
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),  # left hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS), # right hand
    )
    for landmark_list, connections in overlays:
        mp_drawing.draw_landmarks(image, landmark_list, connections)
    

In [5]:
def draw_styled_landmarks(image, results):
    """Draw the Holistic landmarks on `image` with a distinct colour scheme per body part.

    Args:
        image: Frame handed to the MediaPipe drawing utility; the caller keeps
            using the same object afterwards, so drawing happens in place.
        results: Holistic output exposing `face_landmarks`, `pose_landmarks`,
            `left_hand_landmarks` and `right_hand_landmarks`.

    Returns:
        None.

    In each call the first DrawingSpec styles the landmark points and the
    second one styles the connection lines (positional order of
    `draw_landmarks`). Colour channels are 8-bit values and must stay within
    0-255.
    """
    # draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                             # 255 is the maximum channel value (was 256, which is out of range)
                             mp_drawing.DrawingSpec(color=(80,255,121), thickness=1, circle_radius=1)
                             )
    # draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    # draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    # draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )
    

In [6]:
def extract_keypoints(results):
    """Flatten one frame of Holistic output into a single 1-D feature vector.

    Layout of the returned array, in order: pose (x, y, z, visibility per
    landmark), face, left hand, right hand (x, y, z per landmark). A part
    that is missing (falsy) is replaced by a zero block of fixed length:
    132 for pose, 468*3 for face and 21*3 for each hand.

    Args:
        results: Object exposing `pose_landmarks`, `face_landmarks`,
            `left_hand_landmarks` and `right_hand_landmarks`; each is either
            falsy or has a `.landmark` iterable of points.

    Returns:
        The concatenation of the four flattened blocks as a numpy array.
    """
    def _flatten(part, fields, missing_len):
        # Zero padding keeps the block positions fixed when a part is missing.
        if not part:
            return np.zeros(missing_len)
        rows = [[getattr(point, field) for field in fields] for point in part.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    blocks = [
        _flatten(results.pose_landmarks, xyz + ('visibility',), 132),
        _flatten(results.face_landmarks, xyz, 468 * 3),
        _flatten(results.left_hand_landmarks, xyz, 21 * 3),
        _flatten(results.right_hand_landmarks, xyz, 21 * 3),
    ]
    return np.concatenate(blocks)

In [7]:
# Forty videos worth of data
no_sequences = 40

# Videos are going to be 40 frames in length
# (this is also the time dimension of the model's input_shape)
sequence_length = 40

In [8]:
# Sign labels the classifier distinguishes. Order matters: a label's position
# here is its class index (label_map and the softmax output both follow it).
actions = np.array([
    'angry', 'cat', 'cold', 'dog', 'good',
    'happy', 'Hello', 'hot', 'hungry', 'Im full',
    'me', 'no', 'sad', 'sorry', 'Thank you',
    'tired', 'we', 'why', 'worry', 'you',
])

In [9]:
# Label -> class index, following the order of `actions`.
label_map = dict(zip(actions, range(len(actions))))

In [10]:
# Load the pickled `videos` object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only load pickles you created yourself or otherwise trust.
# NOTE(review): `videos` is not referenced by any other cell visible in this
# part of the notebook. `tf` here is just the file handle, not TensorFlow.
with open('videos.pkl', 'rb') as tf:
    videos = pickle.load(tf)

In [11]:
# Load the pickled `sequences` object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only load pickles you created yourself or otherwise trust.
# NOTE(review): `sequences` is not referenced by any other cell visible in this
# part of the notebook. `tf` here is just the file handle, not TensorFlow.
with open('sequences.pkl', 'rb') as tf:
    sequences = pickle.load(tf)

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [13]:
# Stacked-LSTM classifier over keypoint sequences.
# Input: (sequence_length, 1662) — 1662 is the length of the vector built by
# extract_keypoints (132 + 468*3 + 21*3 + 21*3).
# Output: one softmax probability per entry of `actions`.
model = Sequential([
    LSTM(32, return_sequences=True, activation='sigmoid', input_shape=(sequence_length, 1662)),
    LSTM(64, return_sequences=True, activation='sigmoid'),
    LSTM(32, return_sequences=False, activation='sigmoid'),
    Dense(32, activation='sigmoid'),
    Dense(16, activation='sigmoid'),
    Dense(actions.shape[0], activation='softmax'),
])

model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [14]:
# Restore the weights stored in 'sixth_trial.h5' into `model`; the layer stack
# defined for `model` has to match the one the checkpoint was saved from.
model.load_weights('sixth_trial.h5')

# 테스트 (Test)

- you(80)
- worry(c:2 s:97)
- why(0.02)
- we(96)
- tired(c:0.1 s:97)
- thank you(1)
- sorry(95)
- sad(0.00001)
- no(0.03)
- me(0.1)
- Im full(43)
- hungry(47)
- hot(h:95 h:0.000001)
- Hello(0.002)
- happy(0.08)
- good(0.1)
- dog(97)
- cold(95)
- cat(h:60 h:70)
- angry(0.006)

# GOOD(11) - WIN!!!
### dog, cold, you, we, sorry, cat, worry, hot, tired, hungry, Im full

# BAD(9)
### why, Thank you, sad, no, me, Hello, happy, good, angry

In [15]:
# Live webcam check of a single sign: each frame goes through MediaPipe
# Holistic, the most recent `sequence_length` keypoint vectors are fed to the
# model, and the probability the model assigns to `action` is drawn on the
# frame. Press 'q' in the video window to stop.

# 1. New detection variables
sequence = []                 # rolling window of per-frame keypoint vectors
res = np.zeros(len(actions))  # latest class probabilities; all zeros until the window fills
threshold = 0.7               # not used by this cell
action = "dog"                # label whose probability is shown in the overlay

# Camera 0 through the DirectShow backend. Device index and backend are passed
# as separate arguments; cv2.CAP_DSHOW on its own would be taken as the index.
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            # Read feed
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of handing None to
                # cv2.cvtColor inside mediapipe_detection.
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction Logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            # Keep only the newest frames so the window matches the model input length.
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                # verbose=0 keeps Keras from printing a progress bar for every frame.
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]

            # Status bar with the probability (in percent) of the chosen action.
            # To show the top prediction instead, use actions[np.argmax(res)]
            # and res[np.argmax(res)].
            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, "{} {}".format(action, res[label_map[action]]*100), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow("OpenCV Feed", image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a step above raised.
    cap.release()
    cv2.destroyAllWindows()

In [22]:
# Inspect the last frame returned by cap.read() in the capture loop.
frame

array([[[ 99, 106, 111],
        [ 97, 104, 109],
        [ 98, 105, 104],
        ...,
        [ 47,  51,  59],
        [ 47,  49,  55],
        [ 46,  48,  54]],

       [[ 96, 102, 110],
        [ 99, 105, 113],
        [ 97, 106, 106],
        ...,
        [ 46,  50,  58],
        [ 46,  48,  49],
        [ 48,  50,  51]],

       [[ 96, 107, 114],
        [ 93, 104, 111],
        [ 99, 105, 108],
        ...,
        [ 45,  48,  60],
        [ 41,  42,  51],
        [ 47,  48,  57]],

       ...,

       [[ 96,  98,  99],
        [103, 105, 106],
        [106, 120, 118],
        ...,
        [117, 119, 119],
        [115, 117, 117],
        [116, 118, 118]],

       [[ 93,  94, 103],
        [ 94,  95, 104],
        [ 97, 106, 111],
        ...,
        [114, 116, 117],
        [110, 112, 112],
        [113, 115, 115]],

       [[ 97,  99, 114],
        [ 92,  94, 109],
        [ 93, 101, 109],
        ...,
        [117, 119, 120],
        [117, 118, 114],
        [118, 119, 115]]