# WLASL Dataset Sign Language Model

In [None]:
import os
import json
import numpy as np

In [None]:
# Read the complete WLASL annotation file
with open('/kaggle/input/wlasl2000-resized/wlasl-complete/WLASL_v0.3.json', 'r') as json_file:
    data = json.load(json_file)

# Glosses (sign labels) to keep
words_to_filter = [
    "where", "hello", "thank you", "go", "stop",
    "here", "traffic", "good", "bad", "today",
]

# Keep only the entries whose "gloss" is one of the selected words
filtered_data = list(filter(lambda entry: entry.get("gloss") in words_to_filter, data))

# Truncate every kept entry to its first 5 "instances"
for item in filtered_data:
    item.update(instances=item["instances"][:5])

# Persist the filtered, truncated entries as JSON
with open('filtered_data_with_5_bbox.json', 'w') as output_file:
    json.dump(filtered_data, output_file, indent=4)

In [None]:
# Reload annotations from disk; this rebinds `data`, which previously held the
# full annotation list.
# NOTE(review): presumably this is the file written by the filtering cell, which
# used a relative path -- that only holds if the working directory is
# /kaggle/working. Confirm.
with open('/kaggle/working/filtered_data_with_5_bbox.json', 'r') as f:
    data = json.load(f)

In [None]:
# Sanity check: how many instances the first reloaded entry carries
len(data[0]['instances'])

In [None]:
# Parallel lists: index k in each list describes the same video clip.
gloss_list = []        # label (gloss) of the clip
video_name_list = []   # full path of the clip's .mp4 file
start_frame = []       # "frame_start" value from the annotation
end_frame = []         # "frame_end" value from the annotation
vid_dir = '/kaggle/input/wlasl2000-resized/wlasl-complete/videos'
progress = 0

# A gloss may have fewer than five instances, so count what is really there
# instead of assuming five per word.
total_clips = sum(len(word['instances'][:5]) for word in filtered_data)

for word in filtered_data:
    label = word['gloss']
    # Slicing (rather than indexing 0..4) avoids an IndexError on short lists.
    for instance in word['instances'][:5]:
        vid_name = os.path.join(vid_dir, f'{instance["video_id"]}.mp4')
        start = instance['frame_start']
        end = instance['frame_end']

        video_name_list.append(vid_name)
        gloss_list.append(label)
        start_frame.append(start)
        end_frame.append(end)

        progress += 1
        print(f'Progress: {progress} / {total_clips}')

In [None]:
# Both lists receive one entry per collected clip, so the two numbers printed
# here are expected to match.
print(f'number of words = {len(gloss_list)}\nnumber of videos = {len(video_name_list)}')

In [None]:
! pip -q install mediapipe
import mediapipe as mp
import cv2

In [None]:
# Shorthand handles for the MediaPipe solution modules used by the helpers below.
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
mp_face = mp.solutions.face_mesh


def detect_landmarks(frame, pose=None, hands=None, face_mesh=None):
    """Run pose, hand and face-mesh detection on a single frame.

    Args:
        frame: Image array; it is converted with ``cv2.COLOR_BGR2RGB`` before
            being handed to the detectors.
        pose: Optional already-open ``mp_pose.Pose`` object to reuse.
        hands: Optional already-open ``mp_hands.Hands`` object to reuse.
        face_mesh: Optional already-open ``mp_face.FaceMesh`` object to reuse.

    Any detector left as ``None`` is created for this call (detection and
    tracking confidence 0.5) and closed before returning. Creating the three
    models is expensive, so callers that process many frames should create
    them once and pass them in. Detectors that are passed in are left open.

    Returns:
        Tuple ``(pose_results, hand_results, face_results)`` holding the raw
        result object of each detector's ``process`` call.
    """
    from contextlib import ExitStack  # local import: only needed here

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # ExitStack closes only the detectors created here, never the caller's.
    with ExitStack() as stack:
        if pose is None:
            pose = stack.enter_context(
                mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5))
        if hands is None:
            hands = stack.enter_context(
                mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5))
        if face_mesh is None:
            face_mesh = stack.enter_context(
                mp_face.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5))

        pose_results = pose.process(frame_rgb)
        hand_results = hands.process(frame_rgb)
        face_results = face_mesh.process(frame_rgb)

    return pose_results, hand_results, face_results

In [None]:
def extract_landmarks_from_video(video_path, start_frame=1, end_frame=-1):
    """Detect pose, hand and face landmarks for the frames of a video.

    Args:
        video_path: Path of the video file to open with OpenCV.
        start_frame: Number given to the first frame that is read; values
            below 1 are clamped to 1.
        end_frame: Last frame number to produce; a negative value is replaced
            by the frame count reported by the capture.

    Returns:
        A list with one dict per processed frame, with the keys
        ``"frame_number"``, ``"body_pose_landmarks"``, ``"hand_landmarks"``
        and ``"face_mesh_landmarks"``. The landmark values are taken from the
        detectors' results as-is.

    The three detectors are created once and reused for every frame, instead
    of being rebuilt per frame.

    NOTE(review): frames are always read sequentially from the beginning of
    the file. ``start_frame`` only sets the numbering and, together with
    ``end_frame``, how many frames are read -- it does not seek. Confirm this
    matches how the clips were cut.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if end_frame < 0:
            end_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if start_frame < 1:
            start_frame = 1

        landmarks_data = []

        # Same confidence settings as detect_landmarks uses.
        with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose, \
             mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands, \
             mp_face.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5) as face_mesh:
            while cap.isOpened() and start_frame <= end_frame:
                ret, frame = cap.read()
                if not ret:
                    # End of stream or read failure.
                    break
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                landmarks_data.append({
                    "frame_number": start_frame,
                    "body_pose_landmarks": pose.process(frame_rgb).pose_landmarks,
                    "hand_landmarks": hands.process(frame_rgb).multi_hand_landmarks,
                    "face_mesh_landmarks": face_mesh.process(frame_rgb).multi_face_landmarks,
                })
                start_frame += 1
    finally:
        # Release the capture even if detection raised.
        cap.release()

    return landmarks_data

In [None]:
def draw_landmarks_on_video(video_path, output_path, start_frame, end_frame, landmarks_data):
    """Write a copy of a video segment with landmarks drawn on every frame.

    Args:
        video_path: Path of the input video.
        output_path: Path of the video file to write (``mp4v`` codec).
        start_frame: First frame number (counting from 1) to annotate and
            write; values below 1 are clamped to 1. Earlier frames are read
            but not written.
        end_frame: Last frame number to read; a negative value is replaced by
            the frame count reported by the capture.
        landmarks_data: Sequence of per-frame landmark dicts; element 0 is
            applied to frame ``start_frame``, element 1 to the next, and so on.

    Writing stops early when ``landmarks_data`` runs out of entries. The
    capture and the writer are always released, including on errors, so a
    partially written file is still closed.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        if end_frame < 0:
            end_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if start_frame < 1:
            start_frame = 1

        fps = int(cap.get(cv2.CAP_PROP_FPS))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        frame_count = 1
        while cap.isOpened() and frame_count <= end_frame:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count >= start_frame:
                index = frame_count - start_frame
                if index >= len(landmarks_data):
                    # No landmarks left for the remaining frames.
                    break
                out.write(draw_landmarks(frame, landmarks_data[index]))

            frame_count += 1
    finally:
        cap.release()
        if out is not None:
            out.release()

In [None]:
def draw_landmarks(frame, landmarks):
    """Return a copy of ``frame`` with every available landmark drawn as a dot.

    Args:
        frame: Image array; it is copied, never modified.
        landmarks: Dict with the keys ``"body_pose_landmarks"``,
            ``"hand_landmarks"`` and ``"face_mesh_landmarks"``. A falsy value
            means nothing is drawn for that group.

    Pose and face points use colour ``(0, 0, 255)``, hand points
    ``(0, 255, 0)``. Each dot is a filled circle of radius 2.
    """
    annotated = frame.copy()

    def _dot(point, colour):
        # Landmark x/y are scaled by the frame width/height to get pixels.
        centre = (int(point.x * frame.shape[1]), int(point.y * frame.shape[0]))
        cv2.circle(annotated, centre, 2, colour, -1)

    body = landmarks["body_pose_landmarks"]
    if body:
        for point in body.landmark:
            _dot(point, (0, 0, 255))

    # Hands and faces arrive as collections of landmark lists.
    grouped = (
        ("hand_landmarks", (0, 255, 0)),
        ("face_mesh_landmarks", (0, 0, 255)),
    )
    for key, colour in grouped:
        for group in landmarks[key] or ():
            for point in group.landmark:
                _dot(point, colour)

    return annotated


In [None]:
# Output folder for annotated videos.
# NOTE: this rebinds `vid_dir`, which earlier held the input video directory;
# re-running the sample-collection cell after this one would build clip paths
# under this output folder instead.
vid_dir = '/kaggle/working/new_videos'
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(vid_dir, exist_ok=True)

In [None]:
# Output folder for the per-clip landmark arrays (.npy files).
numpy_dir = '/kaggle/working/Numpy_files'
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(numpy_dir, exist_ok=True)

In [None]:
# Scratch run on one sample video: start_frame=0 is clamped to 1 and
# end_frame=-1 is replaced by the reported frame count inside the function.
ahh = extract_landmarks_from_video('/kaggle/input/wlasl-processed/videos/00426.mp4', 0, -1)

In [None]:
# Input clip (the same file the scratch extraction used) and the destination
# of the annotated copy.
input_path = '/kaggle/input/wlasl-processed/videos/00426.mp4'
output_path = '/kaggle/working/ahh.mp4'

In [None]:
# Render the scratch landmarks onto the sample clip, using the same frame
# range arguments (0, -1) as the extraction call.
draw_landmarks_on_video(input_path,output_path, 0, -1, ahh)

In [None]:
# Inspect what kind of object is stored under 'hand_landmarks' for the second frame
type(ahh[1]['hand_landmarks']) 

In [None]:
# Check whether no hand landmarks were stored for the first frame.
# Identity comparison is the reliable way to test for None.
ahh[0]['hand_landmarks'] is None

In [None]:
def _landmark_block(landmark_list, count):
    """Return a (1, count, 3) x/y/z array for one landmark list, or zeros if None."""
    if landmark_list is None:
        return np.zeros([1, count, 3])
    return np.array([[[point.x, point.y, point.z] for point in landmark_list.landmark]])


def extract_data(frame):
    """Convert one frame's landmark dict into a single coordinate array.

    Args:
        frame: Dict with the keys ``'body_pose_landmarks'`` (an object with a
            ``.landmark`` sequence, or None), ``'hand_landmarks'`` and
            ``'face_mesh_landmarks'`` (sequences of such objects, or None).
            Every landmark must expose ``x``, ``y`` and ``z``.

    Returns:
        Array of shape ``(1, 522, 3)`` when the groups have their expected
        sizes: pose (33 points), first hand (21) and first face (468), joined
        along axis 1. A group that is missing is filled with zeros.

    Only the first detected hand and the first detected face are used. An
    empty detection list is treated like None (zeros) rather than raising
    IndexError.
    """
    pose = _landmark_block(frame['body_pose_landmarks'], 33)

    detected_hands = frame['hand_landmarks']
    hands = _landmark_block(detected_hands[0] if detected_hands else None, 21)

    detected_faces = frame['face_mesh_landmarks']
    face = _landmark_block(detected_faces[0] if detected_faces else None, 468)

    # For 3-D inputs hstack joins along axis 1, i.e. the landmark axis.
    return np.hstack((pose, hands, face))

In [None]:
def process_video(landmarks,frame_count):
    """Stack the landmark arrays of the first ``frame_count`` frames.

    Args:
        landmarks: Indexable collection of per-frame landmark dicts.
        frame_count: Number of leading frames to convert.

    Returns:
        The per-frame arrays produced by ``extract_data``, joined with
        ``np.vstack`` so that the first axis indexes frames.
    """
    per_frame = []
    for index in range(frame_count):
        per_frame.append(extract_data(landmarks[index]))
    return np.vstack(per_frame)

In [None]:
# Extract landmarks for every collected clip and save one .npy file per clip.
# Iterating over the collected lists (instead of a hard-coded 50) keeps this
# cell correct when fewer clips were gathered.
for i, (input_path, start, end) in enumerate(zip(video_name_list, start_frame, end_frame)):
    npy_path = f'/kaggle/working/Numpy_files/{i}.npy'
    temp = extract_landmarks_from_video(input_path, start, end)
    # One stacked array per clip; the first axis indexes frames.
    npyArray = process_video(temp, len(temp))
    np.save(npy_path, npyArray)

In [None]:
# Notebook shell command: bundle the saved landmark arrays into numpy.zip
!zip -r numpy.zip /kaggle/working/Numpy_files

In [None]:
import tensorflow as tf

In [None]:
# Load the saved landmark array of every clip together with its label.
npy_inputs = []
labels = []
# File k.npy belongs to gloss_list[k]; iterating the labels avoids a
# hard-coded clip count.
for i, gloss in enumerate(gloss_list):
    # Named `landmark_array` so the builtin `input` is not shadowed.
    landmark_array = np.load(f'/kaggle/working/Numpy_files/{i}.npy')
    npy_inputs.append(tf.convert_to_tensor(landmark_array))
    labels.append(gloss)

In [None]:
# Longest clip length (size of the first axis) over all loaded arrays;
# 0 when nothing was loaded.
max_sequence = max((clip.shape[0] for clip in npy_inputs), default=0)

In [None]:
# Show the longest clip length found above
max_sequence

In [None]:
# Pad every clip along the frame axis to max_sequence frames so they can be
# stacked into one array. The padding value is the constant 10.
new_inputs = []
for clip in npy_inputs:
    pad_length = max_sequence - clip.shape[0]
    padded_frame = np.pad(clip, ((0, pad_length), (0, 0), (0, 0)), mode='constant', constant_values=10)
    new_inputs.append(padded_frame)

new_inputs = np.array(new_inputs)
# Merge the landmark and coordinate axes: (clips, frames, 522 * 3).
reshaped_inputs = new_inputs.reshape(-1, max_sequence, 522 * 3)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the collected gloss labels
label_encoder = LabelEncoder()
label_encoder.fit(labels)

In [None]:
# Model inputs: the padded, reshaped landmark sequences
X = reshaped_inputs
# Targets: the gloss labels transformed by the fitted encoder
y = label_encoder.transform(labels)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing. Features and labels are split in a
# single call so every sample stays paired with its label by construction,
# instead of relying on two separate calls sharing the same seed.
train_data, test_data, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=42)
# Carve a validation set (20% of the remaining training samples) out of the
# training split.
train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Fully connected classifier: each padded clip is flattened into one vector.
# The output layer has 10 units, one per gloss in words_to_filter.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(max_sequence, 522*3)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# accuracy reported as the metric
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 25 epochs, evaluating on the validation split after each epoch
model.fit(train_data, train_labels, validation_data=(val_data, val_labels), epochs=25)

In [None]:
# Evaluate on the held-out test split and print the accuracy scaled by 100
test_loss, test_accuracy = model.evaluate(test_data, test_labels)
print(f"Test Accuracy: {test_accuracy*100}")

In [None]:
# Reset Keras' global state
tf.keras.backend.clear_session()