In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import time
import mediapipe as mp
import os

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import multilabel_confusion_matrix, accuracy_score


### Keypoints using MediaPipe Holistic

In [2]:
mp_holistic = mp.solutions.holistic # Holistic model.
mp_desenho = mp.solutions.drawing_utils # Drawing utilities.

In [3]:
def mediapipe_detection(imagem, modelo):
    """Run `modelo` on a BGR frame and return the frame together with the results.

    Args:
        imagem: frame in BGR channel order (as converted by cv2.COLOR_BGR2RGB below).
        modelo: object exposing a `process(image)` method.

    Returns:
        A tuple `(frame, results)`: the frame converted back to BGR and
        whatever `modelo.process` returned.
    """
    rgb = cv2.cvtColor(imagem, cv2.COLOR_BGR2RGB)
    # The buffer is marked read-only only for the duration of the inference call.
    rgb.flags.writeable = False
    deteccoes = modelo.process(rgb)
    rgb.flags.writeable = True
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR), deteccoes

In [4]:
def desenhar_pontos(imagem, resultados):
    """Draw the face, pose and hand landmarks of `resultados` on `imagem`.

    Each landmark group is passed to `mp_desenho.draw_landmarks` with its own
    pair of drawing specs (first spec: points, second spec: connections).

    Args:
        imagem: image handed to `draw_landmarks` for every group.
        resultados: object exposing `face_landmarks`, `pose_landmarks`,
            `left_hand_landmarks` and `right_hand_landmarks`.

    Returns:
        The same `imagem` object that was passed in.
    """
    # Face contours.
    mp_desenho.draw_landmarks(imagem, resultados.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                              mp_desenho.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1),  # point style
                              # Channel values must stay within 0-255 (was 256).
                              mp_desenho.DrawingSpec(color=(80, 255, 121), thickness=1, circle_radius=1)  # connection style
                              )

    # Body pose.
    mp_desenho.draw_landmarks(imagem, resultados.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_desenho.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=4),
                              mp_desenho.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2)
                              )

    # Left hand.
    mp_desenho.draw_landmarks(imagem, resultados.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_desenho.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                              mp_desenho.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)
                              )

    # Right hand.
    mp_desenho.draw_landmarks(imagem, resultados.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_desenho.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
                              mp_desenho.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2)
                              )
    return imagem


In [5]:

# Disabled cell: the webcam preview loop below is wrapped in a string literal,
# so none of it runs; the cell only evaluates to (and displays) that string.
"""cap = cv2.VideoCapture(0) # Acessa a nossa Webcam.
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: # Obre o modelo usando ooperador de contexto.
    while cap.isOpened():

        # Lê o frame:
        ret, frame = cap.read()

        image, results = mediapipe_detection(frame, holistic)
        
        # Desenha os landmarks:
        desenhar_pontos(image, results)
        
        # Mostra a tela:
        cv2.imshow('OpenCV Feed', image)

        # Sai do loop depois de 10 seg ou quando o usuário aperta a letra "q":
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()"""

'cap = cv2.VideoCapture(0) # Acessa a nossa Webcam.\nwith mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: # Obre o modelo usando ooperador de contexto.\n    while cap.isOpened():\n\n        # Lê o frame:\n        ret, frame = cap.read()\n\n        image, results = mediapipe_detection(frame, holistic)\n        \n        # Desenha os landmarks:\n        desenhar_pontos(image, results)\n        \n        # Mostra a tela:\n        cv2.imshow(\'OpenCV Feed\', image)\n\n        # Sai do loop depois de 10 seg ou quando o usuário aperta a letra "q":\n        if cv2.waitKey(10) & 0xFF == ord(\'q\'):\n            break\n\n    cap.release()\n    cv2.destroyAllWindows()'

### Extract Keypoint Values

In [6]:
# Flatten the landmarks detected in one frame into a single feature vector.
def extract_points(results):
    """Concatenate pose, face and hand landmark coordinates into one 1-D array.

    Groups that are missing (falsy) are replaced by zeros so the output length
    stays constant: 33*4 (pose) + 468*3 (face) + 21*3 (left hand) + 21*3 (right hand).

    Args:
        results: object exposing `pose_landmarks`, `face_landmarks`,
            `left_hand_landmarks` and `right_hand_landmarks`; each present
            group has a `.landmark` iterable of points.

    Returns:
        1-D numpy array ordered as pose, face, left hand, right hand.
    """
    def _coords(group, fields, n_points):
        # Zero-fill when the group was not detected in this frame.
        if not group:
            return np.zeros(n_points * len(fields))
        rows = [[getattr(ponto, campo) for campo in fields] for ponto in group.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    partes = [
        _coords(results.pose_landmarks, xyz + ('visibility',), 33),
        _coords(results.face_landmarks, xyz, 468),
        _coords(results.left_hand_landmarks, xyz, 21),
        _coords(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(partes)

### Setup Folders for Collection

In [6]:
# Root folder under which the per-action / per-sequence keypoint files are stored.
DATA_PATH = os.path.join('MP_DATA')

# Classes/actions we want to detect (this list can be extended).
actions = np.array(['hello', 'thanks', 'iloveyou'])
 
number_of_videos = 30 # Number of sequences (videos) collected per action (can be changed).
sequence_length = 30 # Number of frames collected for each video (can be changed).

In [11]:
# Create one folder per (action, sequence) pair under DATA_PATH.
# exist_ok=True makes the cell safe to re-run: folders that already exist are
# left untouched instead of printing one message per existing folder.
for action in actions:
    for sequence in range(number_of_videos):
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

### Collecting Keypoint Values for Training and Testing

In [13]:
# Disabled cell: the data-collection loop below is wrapped in a string literal,
# so none of it runs. Notes for whoever re-enables it:
#   - the inline comment next to cv2.waitKey(5000) says 2000 ms, but the call is given 5000 ms;
#   - the `break` on the "q" key only leaves the innermost (per-frame) loop;
#   - the return flag `ret` from cap.read() is never checked.
"""cap = cv2.VideoCapture(0) # Acessa a nossa Webcam.

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: # Obre o modelo usando ooperador de contexto.

    for action in actions:
        for sequence in range(number_of_videos):
            for numero_frame in range(sequence_length):
                
                # Lê o frame:
                ret, frame = cap.read()

                image, results = mediapipe_detection(frame, holistic)
                    
                # Desenha os landmarks:
                desenhar_pontos(image, results)
                
                # Lógica para organizar a coleta dos dados:
                if numero_frame == 0:
                    cv2.putText(image, 'STARTING COLLECTION', (120, 200),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Coletando os frames para {} - video numero {}'.format(action, sequence), (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    cv2.waitKey(5000) # Aguarda 2000ms = 2s
                else:
                    cv2.putText(image, 'Coletando os frames para {} - video numero {}'.format(action, sequence), (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)


                results_extraction = extract_points(results)
                numpy_file_path = os.path.join(DATA_PATH, action, str(sequence), str(numero_frame))
                np.save(numpy_file_path, results_extraction)

                # Mostra a tela:
                cv2.imshow('OpenCV Feed', image)
                
                # Sai do loop depois de 10 seg ou quando o usuário aperta a letra "q":
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break

    cap.release()
    cv2.destroyAllWindows()"""

### Preprocess Data and Create Labels and Features

In [7]:
# Map each action name to its integer class index (position in `actions`).
label_map = dict(zip(actions, range(len(actions))))

In [47]:
# Read every saved frame back from disk: one list of frames per video, plus
# the integer label of the action that video belongs to.
sequences, labels = [], []

for action in actions:
    for sequence in range(number_of_videos):
        pasta_video = os.path.join(DATA_PATH, action, str(sequence))
        sequences.append([
            np.load(os.path.join(pasta_video, "{}.npy".format(numero_frame)))
            for numero_frame in range(sequence_length)
        ])
        labels.append(label_map[action])

In [48]:
# Features: the nested list of per-frame arrays stacked into a single array.
x = np.asarray(sequences)
# Targets: integer labels expanded to one-hot rows.
y = to_categorical(labels).astype(int)

In [50]:
# Hold out 5% of the sequences for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.05,
)

In [51]:
# Display the shape of the held-out feature array.
X_test.shape

(5, 30, 1662)

In [52]:
# Display the shape of the held-out one-hot targets.
y_test.shape

(5, 3)

In [53]:
# Display the shape of the training one-hot targets.
y_train.shape

(85, 3)

### Training an LSTM Neural Network

In [76]:
# Training callbacks: TensorBoard logging under ./Logs and an early-stopping
# rule (patience of 10 epochs, best weights restored when it triggers).
log_dir = os.path.join('Logs')
earlystopping = EarlyStopping(restore_best_weights=True, patience=10)
tb_callback = TensorBoard(log_dir=log_dir)

In [77]:
# Stacked-LSTM classifier over keypoint sequences.
# The input shape and the number of output units are derived from the data
# (X_train / actions) instead of being hard-coded, so adding an action or
# changing the sequence length does not silently break the network.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=X_train.shape[1:]),
    LSTM(128, return_sequences=True, activation='relu'),
    # return_sequences=False: only the last step's output feeds the dense head.
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    # One softmax unit per action class.
    Dense(actions.shape[0], activation='softmax'),
])

In [78]:
# Adam optimiser with categorical cross-entropy, matching the one-hot targets.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['categorical_accuracy'],
)

In [80]:
# Train the network. The `earlystopping` callback was created but never handed
# to fit(), so training always ran the full 1700 epochs. It monitors `val_loss`
# by default, which only exists when validation data is supplied, hence the
# validation_split carved out of the training set (the test split stays unseen).
model.fit(
    X_train,
    y_train,
    epochs=1700,
    validation_split=0.1,
    callbacks=[tb_callback, earlystopping],
)

Epoch 1/1700
Epoch 2/1700
Epoch 3/1700
Epoch 4/1700
Epoch 5/1700
Epoch 6/1700
Epoch 7/1700
Epoch 8/1700
Epoch 9/1700
Epoch 10/1700
Epoch 11/1700
Epoch 12/1700
Epoch 13/1700
Epoch 14/1700
Epoch 15/1700
Epoch 16/1700
Epoch 17/1700
Epoch 18/1700
Epoch 19/1700
Epoch 20/1700
Epoch 21/1700
Epoch 22/1700
Epoch 23/1700
Epoch 24/1700
Epoch 25/1700
Epoch 26/1700
Epoch 27/1700
Epoch 28/1700
Epoch 29/1700
Epoch 30/1700
Epoch 31/1700
Epoch 32/1700
Epoch 33/1700
Epoch 34/1700
Epoch 35/1700
Epoch 36/1700
Epoch 37/1700
Epoch 38/1700
Epoch 39/1700
Epoch 40/1700
Epoch 41/1700
Epoch 42/1700
Epoch 43/1700
Epoch 44/1700
Epoch 45/1700
Epoch 46/1700
Epoch 47/1700
Epoch 48/1700
Epoch 49/1700
Epoch 50/1700
Epoch 51/1700
Epoch 52/1700
Epoch 53/1700
Epoch 54/1700
Epoch 55/1700
Epoch 56/1700
Epoch 57/1700
Epoch 58/1700
Epoch 59/1700
Epoch 60/1700
Epoch 61/1700
Epoch 62/1700
Epoch 63/1700
Epoch 64/1700
Epoch 65/1700
Epoch 66/1700
Epoch 67/1700
Epoch 68/1700
Epoch 69/1700
Epoch 70/1700
Epoch 71/1700
Epoch 72/1700
E

<keras.src.callbacks.History at 0x18940516610>

In [82]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_7 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_8 (LSTM)               (None, 64)                49408     
                                                                 
 dense_6 (Dense)             (None, 64)                4160      
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 3)                 99        
                                                                 
Total params: 596675 (2.28 MB)
Trainable params: 59667

### Make predictions

In [83]:
# Class probabilities for every held-out sequence.
res = model.predict(x=X_test)



### Save Weights

In [84]:
# Persist the trained model in HDF5 format.
# NOTE(review): the path uses a Windows-style separator; the loading cells use the same literal.
model.save(filepath=r'modelos\action.h5')

  saving_api.save_model(


In [86]:
# Reload the model from disk to confirm the saved file is usable.
model = load_model(filepath=r'modelos\action.h5')

### Evaluation using Confusion Matrix and Accuracy

In [90]:
# Predicted class probabilities for the held-out sequences.
yhat = model.predict(x=X_test)



In [92]:
# Convert probability / one-hot rows into class indices.
# Predictions are recomputed here rather than read back from `yhat`, so
# re-running this cell cannot push already-converted labels through argmax
# again (which previously turned every prediction into class 0).
yhat = np.argmax(model.predict(X_test), axis=1).tolist()
ytrue = np.argmax(y_test, axis=1).tolist()

# Per-class confusion matrices and overall accuracy on the held-out split.
multilabel_confusion_matrix(ytrue, yhat), accuracy_score(ytrue, yhat)

### Test in Real Time

In [7]:
# Real-time test: classify the most recent 30 webcam frames and overlay the
# recognised actions on the video feed.
sequence = []    # rolling window of per-frame keypoint vectors
sentence = []    # most recent distinct predictions shown on screen
threshold = 0.5  # minimum predicted probability required to accept an action

actions = np.array(['hello', 'thanks', 'iloveyou'])
modelo_dele = load_model(r'modelos\action.h5')

cap = cv2.VideoCapture(0)  # Open the default webcam.
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            # Read the next frame; stop if the camera delivered nothing, since
            # a None frame would crash the colour conversion downstream.
            ret, frame = cap.read()
            if not ret:
                break

            image, results = mediapipe_detection(frame, holistic)

            # Draw the landmarks.
            desenhar_pontos(image, results)

            # Prediction logic: keep only the last 30 frames of features.
            keypoints = extract_points(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            if len(sequence) == 30:
                res = modelo_dele.predict(np.expand_dims(sequence, axis=0))[0]
                melhor = np.argmax(res)
                print('Actions:', melhor)

                # Display logic: append a confident prediction only when it
                # differs from the last word shown.
                if res[melhor] > threshold:
                    if not sentence or actions[melhor] != sentence[-1]:
                        sentence.append(actions[melhor])

                if len(sentence) > 5:
                    sentence = sentence[-5:]

            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show the frame.
            cv2.imshow('OpenCV Feed', image)

            # Poll the keyboard for up to 10 ms; leave the loop when "q" is pressed.
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if an error occurred.
    cap.release()
    cv2.destroyAllWindows()

Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 0
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 2
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1
Actions: 1