In [4]:
!pip install tensorflow



In [5]:
!pip install opencv-python



In [6]:

!pip uninstall -y mediapipe numpy
!pip cache purge
!pip install numpy mediapipe --no-cache-dir --force-reinstall



[0mFound existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[0mFiles removed: 0
Collecting numpy
  Downloading numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting absl-py (from mediapipe)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting attrs>=19.1.0 (from mediapipe)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting flatbuffers>=2.0 (from mediapipe)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting jax (from mediapipe)
  Downloading jax-0.6.2-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from mediapipe)
  Downloading jaxlib-0.6.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.3 kB)
Colle

# 1. Importing Libraries

In [7]:

import os
import time
import numpy as np
import scipy.stats
import random
import pandas as pd


import matplotlib.pyplot as plt
from IPython.display import display, Image

import cv2
import mediapipe as mp
from google.colab.patches import cv2_imshow

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model



In [95]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [96]:

base_dir = "/content/drive/MyDrive/sign_language_detection"

test_videos_path = os.path.join(base_dir, 'test_videos')
if not os.path.exists(test_videos_path):
    os.makedirs(test_videos_path)

DATA_PATH = os.path.join(base_dir, 'MP_Data')

# 2. Detecting Keypoints using MP Holistic

In [12]:

mp_holistic = mp.solutions.holistic


mp_drawing = mp.solutions.drawing_utils


def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results


def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

def draw_styled_landmarks(image, results):

    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )

    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )

    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

Code Block 2B

In [18]:

video_name = "IMAGE_VEDIO1.mp4"
video_path = os.path.join(test_videos_path, video_name)


cap = cv2.VideoCapture(video_path)


if not cap.isOpened():
    raise FileNotFoundError(f"Error: Cannot open video file '{video_path}'. Please check if the file exists and the path is correct.")
mp_holistic = mp.solutions.holistic
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        frame = cv2.flip(frame, 1)

        if not ret:
            break


        image, results = mediapipe_detection(frame, holistic)
        draw_styled_landmarks(image, results)


        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)


        plt.figure(figsize=(10, 10))
        plt.imshow(image_rgb)
        plt.axis('off')
        display(plt.gcf())
        plt.close()

    # cv2.waitKey(1)
    cap.release()

Output hidden; open in https://colab.research.google.com to view.

# 3. Extracting Keypoint Values

Code Block 3A

In [19]:

pose = []


for res in results.pose_landmarks.landmark:

    test = np.array([res.x, res.y, res.z, res.visibility])


    pose.append(test)


pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)


face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)
lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)

def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

# 4. Setting up Folders for Collection

Code Block 4A

In [20]:
actions = np.array(['hello', 'thank_you', 'see_you_later'])

DATA_PATH = os.path.join(base_dir, 'MP_Data')
VIDEO_PATH = os.path.join(base_dir, 'videos')

if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

no_sequences = 20
sequence_length = 20
start_folder = 20

for action in actions:

    action_path = os.path.join(DATA_PATH, action)
    video_path = os.path.join(VIDEO_PATH, action)


    if not os.path.exists(action_path):
        os.makedirs(action_path)


    if not os.path.exists(video_path):
        os.makedirs(video_path)


    existing_dirs = [d for d in os.listdir(action_path) if d.isdigit()]
    if existing_dirs:
        dirmax = np.max(np.array(existing_dirs).astype(int))
    else:
        dirmax = 0


    for sequence in range(0, no_sequences):
        seq_path = os.path.join(action_path, str(dirmax + sequence))
        if not os.path.exists(seq_path):
            os.makedirs(seq_path)

# 5. Collecting Keypoint Values for Training and Testing

In [22]:
!git config --global user.email "draashish2003malik@gmail.com"
!git config --global user.name "Aashish"


In [21]:

base_video_path = os.path.join(base_dir, 'videos')

if not os.path.exists(base_video_path):
    os.makedirs(base_video_path)

for action in actions:

    action_video_path = os.path.join(base_video_path, action)
    action_data_path = os.path.join(DATA_PATH, action)
    os.makedirs(action_data_path, exist_ok=True)


    for i in range(no_sequences):
        sequence_folder = os.path.join(action_data_path, str(i))
        os.makedirs(sequence_folder, exist_ok=True)

    video_counter = 0
    for video_file in os.listdir(action_video_path):
        print(f"\n▶ Processing {video_file}")

        full_video_path = os.path.join(action_video_path, video_file)


        cap = cv2.VideoCapture(full_video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"Total frames: {total_frames}")

        target_frame_count = 30
        #20
        frame_indices = np.linspace(0, total_frames - 1, target_frame_count, dtype=int)

        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            for i, frame_index in enumerate(frame_indices):
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
                ret, frame = cap.read()
                if not ret:
                    print(f"⚠ Could not read frame {frame_index}")
                    continue


                image, results = mediapipe_detection(frame, holistic)
                draw_styled_landmarks(image, results)


                keypoints = extract_keypoints(results)


                npy_path = os.path.join(action_data_path, str(i), f"{video_counter}.npy")
                np.save(npy_path, keypoints)
                print(f"✅ Saved to {npy_path}")


        cap.release()
        video_counter += 1


▶ Processing WIN_20250720_22_21_03_Pro.mp4
Total frames: 34
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/0/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/1/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/2/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/3/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/4/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/5/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/6/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/7/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/8/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/9/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/MP_Data/hello/10/0.npy
✅ Saved to /content/drive/MyDrive/sign_language_detection/

# 6. Preprocessing Data and Creating Labels and Features

In [68]:
label_map = {label: num for num, label in enumerate(actions)}

label_map

{np.str_('hello'): 0, np.str_('thank_you'): 1, np.str_('see_you_later'): 2}

In [69]:
sequences, labels = [], []


for action in actions:

    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        window = []


        for frame_num in range(sequence_length):

            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)


        sequences.append(window)


        labels.append(label_map[action])

X = np.array(sequences)
y = to_categorical(labels).astype(int)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)


print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (85, 20, 1662)
X_test shape: (5, 20, 1662)
y_train shape: (85, 3)
y_test shape: (5, 3)


# 7. Building and Training an LSTM Neural Network

In [71]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

model = Sequential()

model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))

model.add(LSTM(128, return_sequences=True, activation='relu'))

model.add(LSTM(64, return_sequences=False, activation='relu'))

model.add(Dense(64, activation='relu'))

model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

  super().__init__(**kwargs)


In [72]:
model.compile(optimizer='Adam',

              loss='categorical_crossentropy',

              metrics=['categorical_accuracy'])

In [73]:
model.fit(X_train, y_train, epochs=200, callbacks=[tb_callback])

Epoch 1/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 81ms/step - categorical_accuracy: 0.3542 - loss: 1.2150
Epoch 2/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - categorical_accuracy: 0.3620 - loss: 5.4247
Epoch 3/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - categorical_accuracy: 0.3249 - loss: 2.5190
Epoch 4/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - categorical_accuracy: 0.4697 - loss: 2.5302
Epoch 5/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - categorical_accuracy: 0.3366 - loss: 2.2852
Epoch 6/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - categorical_accuracy: 0.2856 - loss: 1.4125
Epoch 7/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - categorical_accuracy: 0.3718 - loss: 2.3644
Epoch 8/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - cate

<keras.src.callbacks.history.History at 0x7b1c74251990>

# 8. Making Predictions

In [74]:
y_pred = model.predict(X_test)

y_pred.shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 453ms/step


(5, 3)

In [75]:
actions[np.argmax(y_pred[3])]

np.str_('see_you_later')

In [76]:
actions[np.argmax(y_test[3])]

np.str_('see_you_later')

# 9. Saving Weights

In [77]:
model_path = os.path.join(base_dir, 'my_model.keras')
model.save(model_path)

In [78]:
model_path = os.path.join(base_dir, 'my_model.keras')
model = load_model(model_path)

In [79]:
y_pred = model.predict(X_test)
y_true = np.argmax(y_test, axis=1).tolist()
y_pred = np.argmax(y_pred, axis=1).tolist()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467ms/step


In [80]:
conf_matrix = multilabel_confusion_matrix(y_true, y_pred)

for idx, matrix in enumerate(conf_matrix):
    print(f"\nConfusion Matrix for class '{actions[idx]}':")
    df = pd.DataFrame(matrix,
                      index=["Actual Negative", "Actual Positive"],
                      columns=["Predicted Negative", "Predicted Positive"])
    print(df)


Confusion Matrix for class 'hello':
                 Predicted Negative  Predicted Positive
Actual Negative                   3                   0
Actual Positive                   0                   2

Confusion Matrix for class 'thank_you':
                 Predicted Negative  Predicted Positive
Actual Negative                   3                   0
Actual Positive                   0                   2

Confusion Matrix for class 'see_you_later':
                 Predicted Negative  Predicted Positive
Actual Negative                   4                   0
Actual Positive                   0                   1


In [81]:
accuracy_score(y_true, y_pred)

1.0

# 11. Testing with Videos

In [100]:

test_video = "IMAGE_VEDIO2.mp4" # TODO: Change to the name of your video
base_test_path = "/content/drive/MyDrive/sign_language_detection/test_videos"
input_video_path = os.path.join(base_test_path, test_video)
output_video_path = os.path.join(base_test_path, "output_video2.mp4")
os.makedirs(base_test_path, exist_ok=True)

colors = [tuple(random.randint(0, 255) for _ in range(3)) for _ in actions]

def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()

    for num, prob in enumerate(res):
        color = colors[num]
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), color, -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame



sequence = []
sentence = []
predictions = []
threshold = 0.5


cap = cv2.VideoCapture(input_video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))


mp_holistic = mp.solutions.holistic
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break


        frame = cv2.flip(frame, 1)


        image, results = mediapipe_detection(frame, holistic)
        draw_styled_landmarks(image, results)


        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]

        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))

            if np.unique(predictions[-10:])[0] == np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    if len(sentence) == 0 or actions[np.argmax(res)] != sentence[-1]:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5:
                sentence = sentence[-5:]

            image = prob_viz(res, actions, image, colors)

        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)


        out.write(image)

    cap.release()
    out.release()
    # cv2.destroyAllWindows()