In [None]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import os
import io

from jiwer import wer

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing import image  
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Embedding, Dense, Dropout, Reshape, Layer, Concatenate, MultiHeadAttention, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [1]:
# Image Data Generators
train_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

train_dataset = train_datagen.flow_from_directory(
    '/kaggle/input/sign-lang/data/train',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

test_dataset = test_datagen.flow_from_directory(
    '/kaggle/input/sign-lang/data/test',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)





Found 42720 images belonging to 10 classes.
Found 5280 images belonging to 10 classes.


In [2]:


tokenizer = Tokenizer()

with open('/kaggle/input/sign-lang/data/groundTruth.txt', 'r', encoding='utf-8') as text_file:
    text_data = text_file.read().splitlines()

init_token, final_token = 'start', 'end'

processed_texts = [f"{init_token} {line} {final_token}" for line in text_data]
tokenizer.fit_on_texts(processed_texts)

if init_token not in tokenizer.word_index:
    tokenizer.word_index[init_token] = len(tokenizer.word_index) + 1
if final_token not in tokenizer.word_index:
    tokenizer.word_index[final_token] = len(tokenizer.word_index) + 1

vocab_size = len(tokenizer.word_index) + 1
text_sequences = tokenizer.texts_to_sequences(processed_texts)
sequence_padding = pad_sequences(text_sequences, maxlen=31)

for processed_text in processed_texts:
    print(processed_text)


start اسم الله end
start الحمد الله end
start جميع الصم العرب  السامع end
start السلام عليكم رحمة الله بركة end
start اليوم اقدم انتم برنامج اخر end
start موضوع دراسة لغة الاشارة العربية end
start كلمات اليوم متفرقة في الدين end
start ايضا كلمات عادية end
start لا شرك الله end
start الله اكبر end


In [3]:
# Feature Extraction
mobile = MobileNetV2(include_top=False, weights='imagenet', pooling='avg', input_shape=(224, 224, 3))
feature_extractor = Model(inputs=mobile.input, outputs=mobile.output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5


In [4]:
def extract_features_from_frames(directory, batch=32):
    image_files = [os.path.join(directory, file) for file in sorted(os.listdir(directory)) if file.endswith('.jpg')]
    total_frames = len(image_files)
    extracted_features = []

    for index in range(0, total_frames, batch):
        end = min(index + batch, total_frames)
        batch_files = image_files[index:end]
        
        loaded_images = [image.img_to_array(image.load_img(img_path, target_size=(224, 224))) for img_path in batch_files]
        loaded_images = np.array(loaded_images) / 255.0

        current_features = feature_extractor.predict(loaded_images, verbose=0)
        extracted_features.extend(current_features)

    final_features = np.array(extracted_features)
#     print("Processed directory:", directory, "Features shape:", final_features.shape)
    return final_features


In [5]:
def get_fasttext_word_vectors(vector_file):
    word_vectors = {}
    with io.open(vector_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            word_key = parts[0]
            vector_values = np.array(parts[1:], dtype='float32')
            word_vectors[word_key] = vector_values
    return word_vectors

fasttext_vectors = get_fasttext_word_vectors('/kaggle/input/fasttext-arabic-embeddings/cc.ar.300.vec')

In [6]:
!pip install jiwer

Collecting jiwer
  Obtaining dependency information for jiwer from https://files.pythonhosted.org/packages/0d/4f/ee537ab20144811dd99321735ff92ef2b3a3230b77ed7454bed4c44d21fc/jiwer-3.0.3-py3-none-any.whl.metadata
  Downloading jiwer-3.0.3-py3-none-any.whl.metadata (2.6 kB)
Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Installing collected packages: jiwer
Successfully installed jiwer-3.0.3


In [7]:
def calculate_wer(ground_truth, hypothesis):
    return wer(ground_truth, hypothesis)

In [8]:
def extract_features_for_signer(signer_path):
    """Extract features for a given signer's folder."""
    features = extract_features_from_frames(signer_path)
    if features.shape != (80, 1280):
        print(f"Alert: Wrong feature shape in {os.path.basename(signer_path)}: {features.shape}")
        return None
    return features

def prepare_sequence_for_sentence(sentence_index, truth_texts, tokenizer, max_len):
    """Prepare padded sequence for a given sentence."""
    ground_truth_text = truth_texts[sentence_index - 1]
    sequence = tokenizer.texts_to_sequences([ground_truth_text])
    return pad_sequences(sequence, maxlen=max_len)[0]

def process_folder_and_prepare_data(root_path, tokenizer, truth_texts, max_len=31):
    features_collection, sequences_collection = [], []

    for sentence_dir in sorted(os.listdir(root_path)):
        sentence_path = os.path.join(root_path, sentence_dir)
        if os.path.isdir(sentence_path):
            signer_dirs = [os.path.join(sentence_path, sd) for sd in sorted(os.listdir(sentence_path)) if os.path.isdir(os.path.join(sentence_path, sd))]
            valid_features = [extract_features_for_signer(sd) for sd in signer_dirs]
            features_collection.extend([f for f in valid_features if f is not None])

            if valid_features:
                padded_sequence = prepare_sequence_for_sentence(int(sentence_dir), truth_texts, tokenizer, max_len)
                sequences_collection.extend([padded_sequence] * len(signer_dirs))

    return np.array(features_collection), np.array(sequences_collection)


In [9]:
embedding_dim = 300  
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = fasttext_vectors.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [10]:
training_features, training_seq = process_folder_and_prepare_data(
    '/kaggle/input/sign-lang/data/train', 
    tokenizer, 
    processed_texts
)
print("Dimensions of training_features:", training_features.shape)

training_features = np.flip(training_features, axis=1)

training_targets = np.zeros_like(training_seq)
training_targets[:, :-1] = training_seq[:, 1:]

one_hot_targets = tf.keras.utils.to_categorical(training_targets, num_classes=vocab_size)


Dimensions of training_features: (534, 80, 1280)


# Encoder Decoder Model

In [11]:
# Encoder-Decoder Model Definition
encoder_inputs = Input(shape=(80, 1280))  
# encoder_gru = GRU(units=256, return_state=True)
encoder_lstm = LSTM(units=256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
# encoder_outputs, state_h = encoder_gru(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                              weights=[embedding_matrix], 
                              trainable=False)

decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
# decoder_gru = GRU(units=256, return_sequences=True, return_state=True)
decoder_dense = Dense(vocab_size, activation='softmax')

x = decoder_embedding(decoder_inputs)
x = Dropout(0.25)(x)
# x, _ = decoder_gru(x, initial_state=state_h)
x, _, _ = decoder_lstm(x, initial_state=encoder_states)
x = Dropout(0.25)(x)
decoder_outputs = decoder_dense(x)

no_attention = Model([encoder_inputs, decoder_inputs], decoder_outputs)

no_attention.compile(optimizer='adam', loss='categorical_crossentropy')

# Training the model
no_attention.fit([training_features, training_seq], one_hot_targets, batch_size=32, epochs=30) 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7b23d11e0100>

In [12]:
def interpret_sequence(sequence, tokenizer):
    # Interpreting a sequence into a caption, where 'sequence' is a 2D array (time steps, vocabulary size)
    caption = []
    
    for token_probs in sequence:
        # Choosing the index with the highest probability
        selected_index = np.argmax(token_probs)
        interpreted_word = tokenizer.index_word.get(selected_index, '')
        caption.append(interpreted_word)

    return ' '.join(caption)


def process_and_evaluate_folder(folder_path, model, tokenizer, captions_list):
    wer_metrics = []

    for signer_directory in sorted(os.listdir(folder_path)):
        path_to_signer = os.path.join(folder_path, signer_directory)
        if os.path.isdir(path_to_signer):
            video_features = extract_features_from_frames(path_to_signer)
            video_features = np.expand_dims(video_features, axis=0)  # Reshaping for model prediction

            # Retrieve the actual caption for the current directory
            directory_num = int(os.path.basename(folder_path))
            actual_caption = captions_list[directory_num - 1]
            
            # Preparing the input sequence for the model
            input_seq = tokenizer.texts_to_sequences([actual_caption])
            model_input_seq = pad_sequences(input_seq, maxlen=31)

            # Generating the predicted caption sequence
            predicted_sequence = no_attention.predict([video_features, model_input_seq])
            generated_caption = interpret_sequence(predicted_sequence[0], tokenizer)
            
            print('Predicted: ', generated_caption)
            print('Actual: ', actual_caption)
            
            # Calculating Word Error Rate (WER)
            wer_score = calculate_wer(actual_caption, generated_caption)
            wer_metrics.append(wer_score)

    return np.mean(wer_metrics)


# Testing the model
data_path = '/kaggle/input/sign-lang/data/test'
wer_scores_list = []

for sentence_directory in sorted(os.listdir(data_path)):
    directory_path = os.path.join(data_path, sentence_directory)
    if os.path.isdir(directory_path):
        average_wer = process_and_evaluate_folder(directory_path, no_attention, tokenizer, processed_texts)
        wer_scores_list.append(average_wer)

overall_wer = np.mean(wer_scores_list)
print("Overall Word Error Rate (WER):", overall_wer)


Predicted:                           start start لا الله end 
Actual:  start اسم الله end
Predicted:                           start start ايضا الله end 
Actual:  start اسم الله end
Predicted:                           start start ايضا الله end 
Actual:  start اسم الله end
Predicted:                           start start ايضا الله end 
Actual:  start اسم الله end
Predicted:                            start اسم الله end 
Actual:  start اسم الله end
Predicted:                            start اسم الله end 
Actual:  start اسم الله end
Predicted:                          start start start لا الله end 
Actual:  start الحمد الله end
Predicted:                          start start start لا الله end 
Actual:  start الحمد الله end
Predicted:                          start start start لا الله end 
Actual:  start الحمد الله end
Predicted:                            start اسم الله end 
Actual:  start الحمد الله end
Predicted:                            start ايضا الله end 
Actual:  start الحمد الل

# Encoder Decoder Model with Attention

In [13]:

class Attention(Layer):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W = Dense(units, use_bias=False)

    def call(self, query, values):
        # query shape == (batch_size, seq_len, hidden_size)
        # values shape == (batch_size, hidden_size, seq_len)
        values_transposed = tf.transpose(values, perm=[0, 2, 1])
        
        # Calculate the attention scores
        score = tf.matmul(query, values_transposed)
        
        # attention_weights shape == (batch_size, seq_len, seq_len)
        attention_weights = tf.nn.softmax(score, axis=-1)
        
        # context_vector shape after sum == (batch_size, seq_len, hidden_size)
        context_vector = tf.matmul(attention_weights, values)
        
        return context_vector, attention_weights


In [14]:
# Encoder
encoder_inputs = Input(shape=(80, 1280))  
encoder_gru = GRU(units=256, return_sequences=True, return_state=True)
encoder_outputs, state_h = encoder_gru(encoder_inputs)

# Attention Layer
attention_layer = Attention(256)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)
decoder_gru = GRU(units=256, return_sequences=True)
decoder_dense = Dense(vocab_size, activation='softmax')

x = decoder_embedding(decoder_inputs)
decoder_outputs = decoder_gru(x, initial_state=state_h)

# Applying attention
attention_layer = Attention(256)
context_vector, attention_weights = attention_layer(decoder_outputs, encoder_outputs)

# Concatenate context vector with decoder outputs
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])

# Output layer
output = decoder_dense(decoder_combined_context)

# Define the model
attention = Model([encoder_inputs, decoder_inputs], output)


attention.compile(optimizer='adam', loss='categorical_crossentropy')
attention.fit([training_features, training_seq], one_hot_targets, batch_size=32, epochs=30) 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7b23cbc84e20>

In [15]:
def interpret_sequence(sequence, tokenizer):
    # Interpreting a sequence into a caption, where 'sequence' is a 2D array (time steps, vocabulary size)
    caption = []
    
    for token_probs in sequence:
        # Choosing the index with the highest probability
        selected_index = np.argmax(token_probs)
        interpreted_word = tokenizer.index_word.get(selected_index, '')
        caption.append(interpreted_word)

    return ' '.join(caption)


def process_and_evaluate_folder(folder_path, model, tokenizer, captions_list):
    wer_metrics = []

    for signer_directory in sorted(os.listdir(folder_path)):
        path_to_signer = os.path.join(folder_path, signer_directory)
        if os.path.isdir(path_to_signer):
            video_features = extract_features_from_frames(path_to_signer)
            video_features = np.expand_dims(video_features, axis=0)  # Reshaping for model prediction

            # Retrieve the actual caption for the current directory
            directory_num = int(os.path.basename(folder_path))
            actual_caption = captions_list[directory_num - 1]
            
            # Preparing the input sequence for the model
            input_seq = tokenizer.texts_to_sequences([actual_caption])
            model_input_seq = pad_sequences(input_seq, maxlen=31)

            # Generating the predicted caption sequence
            predicted_sequence = attention.predict([video_features, model_input_seq])
            generated_caption = interpret_sequence(predicted_sequence[0], tokenizer)
            
            print('Predicted: ', generated_caption)
            print('Actual: ', actual_caption)
            # Calculating Word Error Rate (WER)
            wer_score = calculate_wer(actual_caption, generated_caption)
            wer_metrics.append(wer_score)

    return np.mean(wer_metrics)


# Testing the model
data_path = '/kaggle/input/sign-lang/data/test'
wer_scores_list = []

for sentence_directory in sorted(os.listdir(data_path)):
    directory_path = os.path.join(data_path, sentence_directory)
    if os.path.isdir(directory_path):
        average_wer = process_and_evaluate_folder(directory_path, attention, tokenizer, processed_texts)
        wer_scores_list.append(average_wer)

overall_wer = np.mean(wer_scores_list)
print("Overall Word Error Rate (WER):", overall_wer)


Predicted:                           start start اسم الله end 
Actual:  start اسم الله end
Predicted:                           start start اسم الله end 
Actual:  start اسم الله end
Predicted:                           start start اسم الله end 
Actual:  start اسم الله end
Predicted:                          start start start ايضا الله end 
Actual:  start اسم الله end
Predicted:                           start start ايضا الله end 
Actual:  start اسم الله end
Predicted:                          start start start ايضا الله end 
Actual:  start اسم الله end
Predicted:                          start start start الله الله end 
Actual:  start الحمد الله end
Predicted:                         start start start start الله الله end 
Actual:  start الحمد الله end
Predicted:                           start start الله الله end 
Actual:  start الحمد الله end
Predicted:                          start start start اسم الله end 
Actual:  start الحمد الله end
Predicted:                          start star

# Transformer Model

In [16]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dense(inputs.shape[-1])(x)
    x = Dropout(dropout)(x)
    return x + res

def transformer_decoder(inputs, enc_outputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = Dropout(dropout)(x)
    res = x + inputs

    # Encoder-Decoder Attention
    x = LayerNormalization(epsilon=1e-6)(res)
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, enc_outputs)
    x = Dropout(dropout)(x)
    res = x + res

    # Feed Forward Part
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dense(inputs.shape[-1])(x)
    x = Dropout(dropout)(x)
    return x + res

# Hyperparameters
feature_dim = 1280  
num_heads = 8
dropout_rate = 0.1
head_size = 64  
ff_dim = 512 

# Encoder
encoder_inputs = Input(shape=(80, feature_dim))  # Adjusted to match your data shape
enc_out = transformer_encoder(encoder_inputs, head_size, num_heads, ff_dim, dropout_rate)

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
dec_out = transformer_decoder(dec_emb, enc_out, head_size, num_heads, ff_dim, dropout_rate)

# Output layer
outputs = Dense(vocab_size, activation='softmax')(dec_out)

# Build the model
transformer = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
transformer.compile(optimizer='adam', loss='categorical_crossentropy')

# Model summary
transformer.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 80, 1280)]           0         []                            
                                                                                                  
 layer_normalization (Layer  (None, 80, 1280)             2560      ['input_6[0][0]']             
 Normalization)                                                                                   
                                                                                                  
 multi_head_attention (Mult  (None, 80, 1280)             2624256   ['layer_normalization[0][0]', 
 iHeadAttention)                                                     'layer_normalization[0][0]'] 
                                                                                            

In [17]:
transformer.fit([training_features, training_seq], one_hot_targets, batch_size=32, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7b23cb8d3c70>

In [18]:
def interpret_sequence(sequence, tokenizer):
    # Interpreting a sequence into a caption, where 'sequence' is a 2D array (time steps, vocabulary size)
    caption = []
    
    for token_probs in sequence:
        # Choosing the index with the highest probability
        selected_index = np.argmax(token_probs)
        interpreted_word = tokenizer.index_word.get(selected_index, '')
        caption.append(interpreted_word)

    return ' '.join(caption)


def process_and_evaluate_folder(folder_path, model, tokenizer, captions_list):
    wer_metrics = []

    for signer_directory in sorted(os.listdir(folder_path)):
        path_to_signer = os.path.join(folder_path, signer_directory)
        if os.path.isdir(path_to_signer):
            video_features = extract_features_from_frames(path_to_signer)
            video_features = np.expand_dims(video_features, axis=0)  # Reshaping for model prediction

            # Retrieve the actual caption for the current directory
            directory_num = int(os.path.basename(folder_path))
            actual_caption = captions_list[directory_num - 1]
            
            # Preparing the input sequence for the model
            input_seq = tokenizer.texts_to_sequences([actual_caption])
            model_input_seq = pad_sequences(input_seq, maxlen=31)

            # Generating the predicted caption sequence
            predicted_sequence = transformer.predict([video_features, model_input_seq])
            generated_caption = interpret_sequence(predicted_sequence[0], tokenizer)
            
            print('Predicted: ', generated_caption)
            print('Actual: ', actual_caption)
            # Calculating Word Error Rate (WER)
            wer_score = calculate_wer(actual_caption, generated_caption)
            wer_metrics.append(wer_score)

    return np.mean(wer_metrics)


# Testing the model
data_path = '/kaggle/input/sign-lang/data/test'
wer_scores_list = []

for sentence_directory in sorted(os.listdir(data_path)):
    directory_path = os.path.join(data_path, sentence_directory)
    if os.path.isdir(directory_path):
        average_wer = process_and_evaluate_folder(directory_path, transformer, tokenizer, processed_texts)
        wer_scores_list.append(average_wer)

overall_wer = np.mean(wer_scores_list)
print("Overall Word Error Rate (WER):", overall_wer)


Predicted:                             اسم الله end 
Actual:  start اسم الله end
Predicted:                             اسم الله end 
Actual:  start اسم الله end
Predicted:                             اسم الله end 
Actual:  start اسم الله end
Predicted:                             اسم الله end 
Actual:  start اسم الله end
Predicted:                             اسم الله end 
Actual:  start اسم الله end
Predicted:                             اسم الله end 
Actual:  start اسم الله end
Predicted:                             الحمد الله end 
Actual:  start الحمد الله end
Predicted:                             الحمد الله end 
Actual:  start الحمد الله end
Predicted:                             الحمد الله end 
Actual:  start الحمد الله end
Predicted:                             الحمد الله end 
Actual:  start الحمد الله end
Predicted:                             الحمد الله end 
Actual:  start الحمد الله end
Predicted:                             الحمد الله end 
Actual:  start الحمد الله end
Pred