<a href="https://colab.research.google.com/github/arya232004/Sign-Language-Motion-Recognition-and-Translation-for-Videos/blob/main/new_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import csv

# Set the path to your videos folder
videos_folder = '/content/drive/MyDrive/ISL/New Approach /dataset'

# Dedicated, seeded RNG so the 80/20 split is identical on every
# "Restart Kernel -> Run All" and does not depend on global random state.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Create lists to store the training and testing data.
# Each entry is a (tag, video_name) tuple, where tag is the sub-folder name.
train_data = []
test_data = []

# Iterate through subfolders in the videos folder.  os.listdir() returns
# entries in arbitrary order, so sort them to keep the split reproducible.
for subfolder in sorted(os.listdir(videos_folder)):
    subfolder_path = os.path.join(videos_folder, subfolder)

    # Only directories are treated as classes; stray files are skipped.
    if not os.path.isdir(subfolder_path):
        continue

    # Only regular files count as videos (nested directories are ignored).
    videos = sorted(
        name
        for name in os.listdir(subfolder_path)
        if os.path.isfile(os.path.join(subfolder_path, name))
    )

    # Shuffle the list of videos with the seeded RNG
    split_rng.shuffle(videos)

    # Calculate the split index for 80% train and 20% test
    split_index = int(0.8 * len(videos))

    # Split the videos into train and test
    train_videos = videos[:split_index]
    test_videos = videos[split_index:]

    # Record (tag, video_name) pairs for each split
    train_data.extend((subfolder, video) for video in train_videos)
    test_data.extend((subfolder, video) for video in test_videos)

# Define the paths for the train and test CSV files
train_csv_path = '/content/drive/MyDrive/ISL/train.csv'
test_csv_path = '/content/drive/MyDrive/ISL/test.csv'

# Write data to CSV files
def write_to_csv(csv_path, data):
    """Write ``(tag, video_name)`` pairs to ``csv_path`` as a two-column CSV.

    Args:
        csv_path: Destination file; it is opened in ``'w'`` mode, so any
            existing content is replaced.
        data: Iterable of indexable entries; element 0 is written to the
            ``tag`` column and element 1 to the ``video_name`` column.
    """
    header = ('tag', 'video_name')
    with open(csv_path, 'w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows((item[0], item[1]) for item in data)

# Persist both splits.  write_to_csv opens its target in 'w' mode, so any
# existing train.csv / test.csv at these paths is overwritten.

# Write train data to train CSV file
write_to_csv(train_csv_path, train_data)

# Write test data to test CSV file
write_to_csv(test_csv_path, test_data)


In [None]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

In [None]:
# If at least one GPU is visible, configure the first one as a single virtual
# device with memory_limit=5120 (megabytes, per the TensorFlow documentation).
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
  except RuntimeError as e:
    # NOTE(review): presumably raised when the GPU has already been initialised
    # before this cell runs - confirm.  The error is only printed, so execution
    # continues without the memory cap.
    print(e)

In [None]:
# Load the train/test manifests; the code below reads their `tag` (class label)
# and `video_name` columns.
# NOTE(review): the recorded output shows an extra "Unnamed: 0" column, which
# the CSV writer in the first cell does not produce - confirm which files are
# actually being read here.
train_df = pd.read_csv("/content/drive/MyDrive/ISL/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/ISL/test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")


# Last expression of the cell: displays 10 randomly sampled rows (unseeded, so
# the preview differs between runs).
train_df.sample(10)

Total videos for training: 712
Total videos for testing: 187


Unnamed: 0,tag,video_name
193,grow,grow_17.mp4
162,eat,eat_17.mp4
180,grow,grow_10.mp4
409,clock,clock_6.mp4
478,bad,bad_29.mp4
79,christmas,christmas_97.mp4
578,address,address_16.mp4
54,christmas,christmas_3.mp4
670,apple,apple_40.mp4
425,dog,dog_40.mp4


In [None]:
# Video-loading helpers adapted from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
# Side length (pixels) of the square frames; used as load_video's default
# resize target and as the feature extractor's input size.
IMG_SIZE = 224


def crop_center_square(frame):
    """Return the largest centred square region of ``frame``.

    Args:
        frame: Array whose first two axes are (height, width); any trailing
            axes are kept untouched.

    Returns:
        A slice of ``frame`` whose height and width both equal
        ``min(height, width)``.
    """
    height, width = frame.shape[:2]
    side = min(height, width)
    top = (height // 2) - (side // 2)
    left = (width // 2) - (side // 2)
    rows = slice(top, top + side)
    cols = slice(left, left + side)
    return frame[rows, cols]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video into an array of centre-cropped, resized frames.

    Args:
        path: Location of the video, handed to ``cv2.VideoCapture``.
        max_frames: Stop once this many frames have been collected.  The
            default of 0 never matches the frame count, so every frame is read.
        resize: Target size passed to ``cv2.resize``.

    Returns:
        ``np.array`` of the collected frames, each with its first three
        channels reordered as ``[2, 1, 0]``.  If no frame could be read the
        array is empty.
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        ok, image = capture.read()
        while ok:
            square = crop_center_square(image)
            scaled = cv2.resize(square, resize)
            # Swap the first and third channels (OpenCV's BGR order -> RGB).
            collected.append(scaled[:, :, [2, 1, 0]])

            if len(collected) == max_frames:
                break
            ok, image = capture.read()
    finally:
        # Always free the decoder handle, even if a frame fails to process.
        capture.release()
    return np.array(collected)

In [None]:
from tensorflow.keras import layers

def build_feature_extractor():
    """Build the frozen per-frame feature extractor.

    The model takes raw frames of shape ``(IMG_SIZE, IMG_SIZE, 3)`` with pixel
    values in 0-255, applies InceptionV3's own input preprocessing, runs the
    ImageNet-pretrained InceptionV3 backbone (without its classification
    head), global-average-pools the feature map, and projects it to 512
    features with a ReLU dense layer.

    Returns:
        A ``keras.Model`` named ``"feature_extractor"`` whose output has 512
        features per input frame.
    """
    # Load InceptionV3 without top layers
    base_model = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )

    # Freeze the weights of the pre-trained layers
    base_model.trainable = False

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))

    # The ImageNet weights expect inputs scaled by InceptionV3's own
    # preprocess_input; feeding raw 0-255 pixels degrades the features.
    preprocessed = keras.applications.inception_v3.preprocess_input(inputs)
    feature_maps = base_model(preprocessed)

    # Collapse the spatial dimensions into one vector per frame
    pooled = layers.GlobalAveragePooling2D()(feature_maps)

    # Project to 512 features.  This layer is never trained in the visible
    # code (the model is only used through `.predict`), so it acts as a random
    # projection.  Seeding the initializer keeps that projection identical
    # across rebuilds and kernel restarts, so features stay comparable.
    dense_layer = layers.Dense(
        512,
        activation='relu',
        kernel_initializer=keras.initializers.GlorotUniform(seed=0),
    )(pooled)

    # Build the model
    model = keras.Model(inputs=inputs, outputs=dense_layer, name="feature_extractor")

    return model

# Instantiate the extractor once for use by later cells.
# NOTE(review): the next cell calls build_feature_extractor() again and rebinds
# this name, so this instance is discarded when the cells run in order.
feature_extractor = build_feature_extractor()


In [None]:
# Build the feature extractor model (rebinds `feature_extractor`; this is the
# instance prepare_all_videos uses when the cells run in order).
feature_extractor = build_feature_extractor()

# Get the output shape of the feature extractor's final layer
num_features = feature_extractor.output_shape[-1]

# NOTE(review): the recorded output says 2048, but the final layer defined in
# build_feature_extractor is Dense(512) and NUM_FEATURES below is 512 - the
# saved output looks stale; re-run to confirm.
print("Number of features:", num_features)


Number of features: 2048


In [None]:
# Map tag strings to integer class ids.  The vocabulary is the sorted set of
# unique tags in the training frame; num_oov_indices=0 reserves no slot for
# unknown tags.
# NOTE(review): a tag that appears only in the test CSV would therefore not be
# representable - confirm both CSVs share the same tag set.
label_processor = keras.layers.StringLookup(num_oov_indices=0, vocabulary=np.unique(train_df["tag"]))
print(label_processor.get_vocabulary())

# Encode the training tags; the added trailing axis gives one id per row.
# prepare_all_videos recomputes its own labels, and `labels` is reassigned in a
# later cell, so this value is not read by the visible code.
labels = train_df["tag"].values
labels = label_processor(labels[..., None]).numpy()

['I dont know', 'address', 'apple', 'bad', 'bathroom', 'beautiful', 'bowl', 'christmas', 'clock', 'die', 'dog', 'door', 'eat', 'expensive', 'family', 'grow']


In [None]:
#Define hyperparameters

IMG_SIZE = 224  # frame side length in pixels (same value as set in the video-loading cell)
BATCH_SIZE = 64  # not referenced by any visible cell
EPOCHS = 100  # overridden by `EPOCHS = 30` in the training cell

MAX_SEQ_LENGTH = 200  # maximum number of frames kept per video
NUM_FEATURES = 512  # must equal the feature extractor's output width (Dense(512))

In [None]:
def prepare_all_videos(df, root_dir):
    """Turn every video listed in ``df`` into padded feature sequences.

    Args:
        df: DataFrame with a ``video_name`` column (file names relative to
            ``root_dir``) and a ``tag`` column (class labels).
        root_dir: Directory that is joined with each ``video_name``.

    Returns:
        ``((frame_features, frame_masks), labels)`` where

        * ``frame_features`` is float32 of shape
          ``(len(df), MAX_SEQ_LENGTH, NUM_FEATURES)``, zero-padded after the
          last real frame,
        * ``frame_masks`` is bool of shape ``(len(df), MAX_SEQ_LENGTH)``;
          ``True`` marks real frames and ``False`` marks padding,
        * ``labels`` is the ``label_processor`` encoding of the tags, with
          shape ``(len(df), 1)``.
    """
    import warnings

    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()

    # Encode the class labels; the added trailing axis gives one id per row.
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence
    # model.  `frame_masks` records which timesteps hold real frames.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    for idx, path in enumerate(video_paths):
        full_path = os.path.join(root_dir, path)

        # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding
        # there instead of loading the whole video into memory.
        frames = load_video(full_path, max_frames=MAX_SEQ_LENGTH)
        length = min(MAX_SEQ_LENGTH, len(frames))

        if length == 0:
            # An unreadable or missing video would otherwise silently become
            # an all-zero, fully-masked sample.
            warnings.warn(f"No frames could be read from {full_path!r}; "
                          "its features are left as zeros.")
            continue

        # One batched forward pass per video instead of one predict() call per
        # frame: same features, far less per-call overhead.
        frame_features[idx, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_masks[idx, :length] = True  # True = real frame, False = padding

    return (frame_features, frame_masks), labels


# Extract features for both splits.  This rebinds `train_data` / `test_data`,
# which the first cell used for lists of (tag, video_name) pairs.
# NOTE(review): the root_dir arguments are joined with the bare file names from
# the CSVs, so the videos are expected under ./train and ./test relative to the
# working directory - confirm they exist there.
train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

print(f"train_labels in train set: {train_labels.shape}")

# These are the test labels; the message previously said "in train set".
print(f"test_labels in test set: {test_labels.shape}")

# The sequence and feature dimensions above follow MAX_SEQ_LENGTH and
# NUM_FEATURES from the hyperparameter cell.

Frame features in train set: (712, 200, 512)
Frame masks in train set: (712, 200)
train_labels in train set: (712, 1)
test_labels in train set: (187, 1)


In [None]:
# Utility for our sequence model.
def get_sequence_model():
    """Create and compile the GRU classifier that consumes frame features.

    The model takes two inputs - frame features of shape
    ``(MAX_SEQ_LENGTH, NUM_FEATURES)`` and a boolean mask of shape
    ``(MAX_SEQ_LENGTH,)`` - and outputs a softmax over the classes in
    ``label_processor``'s vocabulary.

    Returns:
        A compiled ``keras.Model`` (Adam, sparse categorical cross-entropy,
        accuracy metric).
    """
    features_in = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_in = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # The mask is passed explicitly to the first GRU.  See:
    # https://keras.io/api/layers/recurrent_layers/gru/
    hidden = keras.layers.GRU(16, return_sequences=True)(features_in, mask=mask_in)
    hidden = keras.layers.GRU(8)(hidden)
    hidden = keras.layers.Dropout(0.4)(hidden)
    hidden = keras.layers.Dense(8, activation="relu")(hidden)

    # One output unit per known class.
    num_classes = len(label_processor.get_vocabulary())
    probabilities = keras.layers.Dense(num_classes, activation="softmax")(hidden)

    classifier = keras.Model(inputs=[features_in, mask_in], outputs=probabilities)
    classifier.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

# Overrides the EPOCHS = 100 set in the hyperparameter cell; run_experiment
# reads this global when it calls fit().
EPOCHS = 30
# Utility for running experiments.
def run_experiment():
    """Train the sequence model, reload its best weights, and score the test set.

    Reads the module-level ``train_data``, ``train_labels``, ``test_data``,
    ``test_labels`` and ``EPOCHS``.  The best weights (lowest validation loss)
    are checkpointed to ``./tmp/video_classifier`` and restored before the
    evaluation.

    Returns:
        ``(history, seq_model)``: the Keras training history and the trained
        model with the best checkpoint loaded.
    """
    filepath = "./tmp/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    # Keras takes `validation_split` from the END of the arrays, before any
    # shuffling.  The CSV written by the first cell lists videos class by
    # class, so an unshuffled tail would contain classes the model never sees
    # during training.  Shuffle once with a fixed seed so both subsets cover
    # every class and the split is reproducible.
    order = np.random.default_rng(42).permutation(len(train_labels))
    shuffled_features = train_data[0][order]
    shuffled_masks = train_data[1][order]
    shuffled_labels = train_labels[order]

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [shuffled_features, shuffled_masks],
        shuffled_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    # Evaluate with the best checkpoint rather than the last epoch's weights.
    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


# Train and evaluate; the training history is discarded and only the trained
# model is kept.  (The recorded run below was interrupted during epoch 6.)
_, sequence_model = run_experiment()

Epoch 1/30
Epoch 1: val_loss improved from inf to 2.78820, saving model to ./tmp/video_classifier
Epoch 2/30
Epoch 2: val_loss did not improve from 2.78820
Epoch 3/30
Epoch 3: val_loss did not improve from 2.78820
Epoch 4/30
Epoch 4: val_loss did not improve from 2.78820
Epoch 5/30
Epoch 5: val_loss did not improve from 2.78820
Epoch 6/30

KeyboardInterrupt: 

In [None]:
import cv2
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Specify the path to the folder containing video subfolders
data_path = '/content/drive/MyDrive/ISL/videos'

# List all subfolders (assuming each subfolder is a word).
# NOTE(review): os.listdir() order is arbitrary, so the word -> class-index
# mapping below is whatever order the filesystem returns; any hard-coded
# index -> word table must match it.
word_folders = [folder for folder in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, folder))]

# Initialize lists to store data and labels
data = []
labels = []
# Fixed number of frames kept per video.
# NOTE(review): the recorded output and the model cell use 50 frames - confirm
# which value is intended.
max_frames = 200

# Loop through each word
for word_index, word_folder in enumerate(word_folders):
    word_path = os.path.join(data_path, word_folder)

    # Loop through each video file in the word folder
    for video_file in os.listdir(word_path):
        video_path = os.path.join(word_path, video_file)

        # Read video frames using OpenCV
        cap = cv2.VideoCapture(video_path)
        frames = []

        try:
            # Stop as soon as max_frames frames have been collected.
            while len(frames) < max_frames:
                ret, frame = cap.read()
                if not ret:
                    break

                # Resize to 64x64 and scale pixel values to [0, 1].  Converting
                # to float32 first avoids float64 copies, which would double
                # the memory held in `data` (the padded array is float32).
                frame = cv2.resize(frame, (64, 64))
                frame = frame.astype('float32') / 255.0

                frames.append(frame)
        finally:
            # Release the decoder handle for every video, even on error.
            cap.release()

        # Organize frames into a 4D tensor (num_frames, height, width, channels)
        video_data = np.array(frames)

        # Append data and label
        data.append(video_data)
        labels.append(word_index)

# Pad sequences with zeros so all videos have exactly max_frames frames
data_padded = pad_sequences(data, maxlen=max_frames, padding='post', truncating='post', dtype='float32')

# Convert data and labels to numpy arrays
data = np.array(data_padded)
labels = np.array(labels)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Convert labels to one-hot encoding
num_classes = len(word_folders)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Print shapes of train and test sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (544, 50, 64, 64, 3)
y_train shape: (544, 12)
X_test shape: (137, 50, 64, 64, 3)
y_test shape: (137, 12)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, LSTM, Dense, Flatten, BatchNormalization, Dropout, TimeDistributed

# Define the model
model = Sequential()

# Take the per-sample input shape from the prepared data so the network always
# matches the frame count and frame size chosen in the data-loading cell.
# (Previously hard-coded to (50, 64, 64, 3).)
video_input_shape = X_train.shape[1:]

# Convolutional layers: three Conv3D -> MaxPooling3D -> BatchNorm -> Dropout stages
model.add(Conv3D(16, kernel_size=(3, 3, 3), activation='relu', input_shape=video_input_shape))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
model.add(BatchNormalization())
# NOTE(review): a rate of 0.8 drops 80% of the activations - confirm this is
# intentional.
model.add(Dropout(0.8))

model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
model.add(MaxPooling3D(pool_size=(2, 2, 2)))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Flatten each remaining time step into a vector so the LSTM receives a
# (time, features) sequence
model.add(TimeDistributed(Flatten()))

# LSTM layer
model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.5))

# Flatten the per-step LSTM outputs into a single vector
model.add(Flatten())

# Dense layer for classification: one unit per word folder
model.add(Dense(num_classes, activation='softmax'))

# categorical_crossentropy matches the one-hot labels produced by to_categorical
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d_10 (Conv3D)          (None, 48, 62, 62, 16)    1312      
                                                                 
 max_pooling3d_10 (MaxPooli  (None, 24, 31, 31, 16)    0         
 ng3D)                                                           
                                                                 
 batch_normalization_13 (Ba  (None, 24, 31, 31, 16)    64        
 tchNormalization)                                               
                                                                 
 dropout_18 (Dropout)        (None, 24, 31, 31, 16)    0         
                                                                 
 conv3d_11 (Conv3D)          (None, 22, 29, 29, 32)    13856     
                                                                 
 max_pooling3d_11 (MaxPooli  (None, 11, 14, 14, 32)   

In [None]:
# Train the Conv3D/LSTM model for 10 epochs, holding out 20% of X_train for
# validation.  The returned History object is the cell's displayed output.
model.fit(X_train, y_train, epochs=10, batch_size=32,  validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e6c2b99dd20>

In [None]:
# Assuming you have already trained the model and have it stored in the 'model' variable

# Evaluate the model on the held-out test set produced by train_test_split.
# The model is compiled with a single metric (accuracy), so evaluate() returns
# the loss followed by the accuracy.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


5/5 - 0s - loss: 4.0098 - accuracy: 0.0949 - 446ms/epoch - 89ms/step
Test Loss: 4.0098
Test Accuracy: 9.49%


In [None]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model

# Load a previously saved model from Drive.  This rebinds `model`, replacing
# the network trained earlier in the notebook.
model = load_model('/content/drive/MyDrive/ISL/eight_seven.h5')

def preprocess_video_for_prediction(video_path, target_frames=150, frame_shape=(64, 64)):
    """Load a video and shape it into a single-sample batch for ``model.predict``.

    Each frame is resized to ``frame_shape`` and scaled to [0, 1].  Videos
    longer than ``target_frames`` are truncated; shorter ones are padded by
    repeating their last frame.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        target_frames: Number of frames in the returned sequence.
        frame_shape: Size passed to ``cv2.resize``.

    Returns:
        Array of shape ``(1, target_frames, H, W, C)``.

    Raises:
        ValueError: If padding is needed but no frame could be read (missing
            or unreadable video).  This previously surfaced as an opaque
            ``IndexError`` from ``frames[-1]``.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)

    try:
        # Frames beyond target_frames would be discarded, so stop decoding early.
        while len(frames) < target_frames:
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.resize(frame, frame_shape)
            frames.append(frame / 255.0)
    finally:
        # Release the decoder handle even if a frame fails to process.
        cap.release()

    if len(frames) < target_frames:
        if not frames:
            raise ValueError(f"No frames could be read from {video_path!r}")
        # Pad short clips by repeating the final frame.
        frames.extend([frames[-1]] * (target_frames - len(frames)))

    # Add the leading batch axis expected by model.predict.
    return np.expand_dims(np.array(frames), axis=0)

# Classify a single held-out clip with the model loaded above.
video_path_to_predict = '/content/drive/MyDrive/ISL/Testing/soham_christmas.mp4'

# Shape the clip into one 150-frame sample.
input_video_data = preprocess_video_for_prediction(video_path_to_predict, target_frames=150)

predictions = model.predict(input_video_data)

# argmax without an axis works on the flattened predictions; with a single
# sample in the batch this is the index of the highest-scoring class.
predicted_class_index = np.argmax(predictions)
# Hard-coded class index -> word table.
# NOTE(review): this must match the label order used when the loaded model was
# trained - confirm, since that order is not derived from anything in this cell.
class_index_to_word = {
    0: 'expensive',
    1: 'I dont know',
    2: 'grow',
    3: 'clock',
    4: 'address',
    5: 'apple',
    6: 'dog',
    7: 'beautiful',
    8: 'bathroom',
    9: 'eat',
    10: 'christmas',
    11: 'door',
    12: 'family',
    13: 'bad',
    14: 'die',
    15: 'bowl'
}
# NOTE(review): the recorded output predicts 'address' for a file named
# soham_christmas.mp4.
print("Predicted Class Index:", predicted_class_index)
print("predicted word is : ", class_index_to_word[predicted_class_index])


Predicted Class Index: 4
predicted word is :  address


In [None]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model

# Load a different saved model from Drive; rebinds `model` again.
model = load_model('/content/drive/MyDrive/ISL/eight_eight.h5')

def preprocess_video_for_prediction(video_path, target_frames=150, frame_shape=(64, 64)):
    """Load a video and shape it into a single-sample batch for ``model.predict``.

    Each frame is resized to ``frame_shape`` and scaled to [0, 1].  Videos
    longer than ``target_frames`` are truncated; shorter ones are padded with
    all-zero frames.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        target_frames: Number of frames in the returned sequence.
        frame_shape: Size passed to ``cv2.resize``.

    Returns:
        Array of shape ``(1, target_frames, H, W, C)``.

    Raises:
        ValueError: If padding is needed but no frame could be read (missing
            or unreadable video).  This previously surfaced as an opaque
            ``IndexError`` from ``frames[0]``.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)

    try:
        # Frames beyond target_frames would be discarded, so stop decoding early.
        while len(frames) < target_frames:
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.resize(frame, frame_shape)
            frames.append(frame / 255.0)
    finally:
        # Release the decoder handle even if a frame fails to process.
        cap.release()

    if len(frames) < target_frames:
        if not frames:
            raise ValueError(f"No frames could be read from {video_path!r}")
        # Pad short clips with zero frames shaped like the first real frame.
        frames.extend([np.zeros_like(frames[0])] * (target_frames - len(frames)))

    # Add the leading batch axis expected by model.predict.
    return np.expand_dims(np.array(frames), axis=0)


# Classify a single held-out clip with the model loaded above.
video_path_to_predict = '/content/drive/MyDrive/ISL/Testing/soham_grow.mp4'

# Shape the clip into one 150-frame sample.
input_video_data = preprocess_video_for_prediction(video_path_to_predict, target_frames=150)

predictions = model.predict(input_video_data)

# argmax without an axis works on the flattened predictions; with a single
# sample in the batch this is the index of the highest-scoring class.
predicted_class_index = np.argmax(predictions)
# Hard-coded class index -> word table (a copy of the table in the previous
# prediction cell).
# NOTE(review): this must match the label order used when the loaded model was
# trained - confirm, since that order is not derived from anything in this cell.
class_index_to_word = {
    0: 'expensive',
    1: 'I dont know',
    2: 'grow',
    3: 'clock',
    4: 'address',
    5: 'apple',
    6: 'dog',
    7: 'beautiful',
    8: 'bathroom',
    9: 'eat',
    10: 'christmas',
    11: 'door',
    12: 'family',
    13: 'bad',
    14: 'die',
    15: 'bowl'
}
# NOTE(review): the recorded output predicts 'die' for a file named
# soham_grow.mp4.
print("Predicted Class Index:", predicted_class_index)
print("predicted word is : ", class_index_to_word[predicted_class_index])


Predicted Class Index: 14
predicted word is :  die
