## Overview
This notebook contains a Deep Learning model that classifies the hand signs associated with each letter of the English alphabet. Both the training and the testing sets are provided on the [Kaggle Platform](https://www.kaggle.com/datasets/datamunge/sign-language-mnist).

In [None]:
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt


In [None]:
# Root directory for the data.
main_directory = os.path.join(
    "utility_files",
    "Sign_language_MNIST",
)


### Setting the directories 
The dataset is provided on Kaggle as a zip file, so the data sets must first be extracted. The few cells below perform this step.

In [None]:
import zipfile

# Name of the downloaded archive; it is looked up inside main_directory.
initial_archive_name = "archive.zip"
archive_zip = os.path.join(main_directory, initial_archive_name)

# Extraction is skipped when the archive is absent.
# The context manager closes the archive even if extractall raises, which the
# previous explicit open()/close() pair did not guarantee.
if os.path.exists(archive_zip):
    with zipfile.ZipFile(archive_zip, 'r') as zip_ref:
        zip_ref.extractall(main_directory)

In [None]:
# Names of the training and testing sub-directories inside main_directory.
TRAINING_DIR_NAME, TESTING_DIR_NAME = "sign_mnist_train", "sign_mnist_test"

In [None]:
# Disabled clean-up: remove every file / directory that is neither the training
# nor the testing set.
# NOTE(review): os.rmdir only removes empty directories - confirm before re-enabling.
# from genericpath import isdir

# for file in os.listdir(main_directory):
#     if file not in [TRAINING_DIR_NAME, TESTING_DIR_NAME]:
#         file_name = os.path.join(main_directory, file)
#         if os.path.isfile(file_name):
#             os.remove(file_name)
#         elif os.path.isdir(file_name):
#             os.rmdir(file_name)

# Show what the root data directory currently contains.
directory_entries = os.listdir(main_directory)
print(directory_entries)

### Working with the training files
After removing the unnecessary directories, it is time to understand the data provided and perform the necessary data preprocessing before creating any model.

In [None]:
# Paths of the training and testing directories, both located under main_directory.
TRAIN_DIR, TEST_DIR = (
    os.path.join(main_directory, dir_name)
    for dir_name in (TRAINING_DIR_NAME, TESTING_DIR_NAME)
)

In [None]:
# Print the content of the training directory, then of the testing directory.
for data_dir in (TRAIN_DIR, TEST_DIR):
    print(os.listdir(data_dir))

In [None]:
## The data is stored in CSV files, which calls for the pandas library.
import pandas as pd


def _find_csv(directory):
    """Return the path of the first ``.csv`` file (in sorted order) in ``directory``.

    ``os.listdir`` returns entries in arbitrary order, so blindly taking element
    ``[0]`` may pick up an unrelated entry; filtering by extension and sorting
    makes the choice deterministic.

    Raises:
        FileNotFoundError: if ``directory`` contains no ``.csv`` file.
    """
    csv_names = sorted(
        name for name in os.listdir(directory) if name.lower().endswith(".csv")
    )
    if not csv_names:
        raise FileNotFoundError("no .csv file found in {}".format(directory))
    return os.path.join(directory, csv_names[0])


train_file = _find_csv(TRAIN_DIR)
train_df_org = pd.read_csv(train_file)

test_file = _find_csv(TEST_DIR)
test_df_org = pd.read_csv(test_file)


In [None]:
print(train_df_org.shape)    # 785 columns and 27455 training samples
print(train_df_org.columns)  # the first column is the label, the rest are the individual pixel values

# Column mapping: "label" -> "y" and "pixelN" -> "N" for N = 1 .. 784.
new_cols_name = {"label": "y"}
for i in range(1, 785):
    new_cols_name[f"pixel{i}"] = f"{i}"

# Apply the same renaming to both frames; the original frames are left untouched.
train_df, test_df = (
    frame.rename(columns=new_cols_name) for frame in (train_df_org, test_df_org)
)

In [None]:
# Separate the labels from the pixels. DataFrame.pop returns the "y" column and
# removes it from the frame in place, so only pixel columns remain afterwards.
y_train = train_df.pop("y")
y_test = test_df.pop("y")

In [None]:
# Shape every flattened sample is reshaped to (28 x 28 with one channel).
img_size = 28, 28, 1

In [None]:
# According to the data documentation, the images are meant to be 28 * 28 grayscale images.
# pixel_range is the divisor used to scale the raw pixel values.
pixel_range = 255.0


def transform_row_to_pic(df, index):
    """Return row ``index`` of ``df`` as a scaled image array.

    Args:
        df: DataFrame in which every row holds the flattened pixel values of one image.
        index: positional (``iloc``) number of the row to extract.

    Returns:
        numpy array of shape ``img_size`` whose values are the row's pixel values
        divided by ``pixel_range``.
    """
    # Index with the ``index`` argument: the previous body read an unrelated
    # global ``i``, so it returned the wrong row (or raised NameError).
    return df.iloc[index, :].values.reshape(img_size) / pixel_range

In [None]:
import string
import random

## Visualization
sample = 20  # number of randomly chosen images to display

plt.figure(figsize=(10, 10))
for i in range(sample):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # randrange excludes its upper bound; randint(0, len(train_df)) could return
    # len(train_df), which is one past the last valid row.
    index = random.randrange(len(train_df))
    plt.imshow(transform_row_to_pic(train_df, index), cmap=plt.cm.binary)
    # the numerical label associated with the hand sign is the letter's order in the alphabet
    plt.xlabel(string.ascii_letters[y_train[index]])

plt.show()

In [None]:
# Same visualisation for the testing set, with the figure size used for the training samples.
plt.figure(figsize=(10, 10))
for i in range(sample):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # randrange excludes its upper bound; randint(0, len(test_df)) could return
    # len(test_df), which is one past the last valid row.
    index = random.randrange(len(test_df))
    plt.imshow(transform_row_to_pic(test_df, index), cmap=plt.cm.binary)
    # the numerical label associated with the hand sign is the letter's order in the alphabet
    plt.xlabel(string.ascii_letters[y_test[index]])

plt.show()

In [None]:
def df_to_X(df):
    """Convert a DataFrame of raw pixel values into a scaled feature matrix.

    Args:
        df: DataFrame with one flattened image per row.

    Returns:
        2-D numpy array with one row per DataFrame row, every value divided by
        ``pixel_range``.
    """
    # A single vectorised division replaces the per-row ``iloc`` loop.
    return df.to_numpy() / pixel_range

In [None]:
# Scaled feature matrices and raw label arrays for the training and testing sets.
X_train, Y_train = df_to_X(train_df), y_train.values
print(X_train[:3])

X_test, Y_test = df_to_X(test_df), y_test.values
print(X_test[:3])


In [None]:
# Hold out part of the training data as a validation set.
import sklearn
from sklearn.model_selection import train_test_split

val_size = 0.2      # fraction of the samples reserved for validation
random_state = 11   # seed passed to the splitter
# NOTE: the labels passed here are the pandas Series y_train, so Y_train and
# Y_val are Series after this cell; X_train is overwritten by its training part.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    y_train,
    test_size=val_size,
    random_state=random_state,
)

In [None]:
# Give every flattened training row its image layout again.
# img_size is reused instead of repeating the literal (28, 28, 1), and a single
# reshape replaces the per-row Python loop (the result may share memory with X_train).
X_train_img = np.reshape(X_train, (-1,) + tuple(img_size))

In [None]:
# DL models imports
import tensorflow as tf
import tensorflow.keras.layers as tfl

### Data Augmentation
We need to experiment with data augmentation: mainly cropping and reflecting.

In [None]:
from tensorflow.image import flip_left_right
from tensorflow.image import central_crop
from tensorflow.image import resize
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

def augmented_image(image, t=0.5, central_frac=0.8, res=True):
    """Return a randomly cropped or mirrored version of ``image``.

    One number is drawn with ``random.random()``: if it is greater than ``t`` the
    image is centrally cropped, otherwise it is flipped left-to-right.

    Args:
        image: image passed on to ``central_crop`` / ``flip_left_right``; its first
            two shape entries are used as the target size when ``res`` is true.
        t: threshold compared against the random draw.
        central_frac: NOTE(review): currently unused - the crop always uses a
            central fraction of 0.8. Callers that rely on the resulting crop size
            must be checked before this argument is wired through.
        res: when true, the result is resized (nearest neighbour) back to the
            height and width of ``image``; when false it is returned as is.

    Returns:
        The transformed image, resized to the input size if ``res`` is true.
    """
    if random.random() > t:
        transformed = central_crop(image, central_fraction=0.8)
    else:
        transformed = flip_left_right(image)

    if not res:
        return transformed
    target_size = [image.shape[0], image.shape[1]]
    return resize(transformed, target_size, method="nearest")


In [None]:
# Augment every training image and flatten the result back into a feature row.
X_train_aug = np.array(
    [augmented_image(picture).reshape((-1,)) for picture in X_train_img]
)

In [None]:
# Compare the augmented set with the original training set: lengths first, then shapes.
for batch in (X_train_aug, X_train):
    print(len(batch))
for batch in (X_train_aug, X_train):
    print(batch.shape)

In [None]:
# Stack the original samples and their augmented versions; the labels are
# repeated in the same order so that they stay aligned with the samples.
X_train_final = np.concatenate((X_train, X_train_aug), axis=0)
Y_train_final = np.concatenate((Y_train, Y_train), axis=0)

In [None]:
# Number of samples and number of labels in the final training set, one per line.
print(len(X_train_final), len(Y_train_final), sep="\n")

## Models
In this part of the notebook, I will try to consider different models, optimize each of them starting from Plain Neural networks to plain CNN, CNN with Residual blocks, and finally a model based on transfer learning.

### Plain Neural Networks 
So the first approach to solve this problem is to use plain neural networks that use all of the bytes in each picture. The main goal is to achieve as high a performance as possible with each available model.

In [None]:
# Build the model: four stages of two relu Dense layers, each stage followed by
# batch normalisation, then a 26-unit softmax output layer.
# input_shape = (1, 784)
first_model = tf.keras.Sequential()
for first_units, second_units in ((1024, 1024), (512, 512), (256, 128), (64, 32)):
    first_model.add(tfl.Dense(first_units, activation='relu'))
    first_model.add(tfl.Dense(second_units, activation='relu'))
    first_model.add(tfl.BatchNormalization())
first_model.add(tfl.Dense(26, activation='softmax'))  # output layer

# Number of training epochs used when fitting the models.
EPOCHS = 15



In [None]:
# The output layer already applies softmax, hence from_logits=False.
first_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
# Training and evaluation of this first model are currently disabled:
# history_aug = first_model.fit(X_train_final, Y_train_final, epochs=EPOCHS, validation_data=(X_val, Y_val))
# print(first_model.evaluate(X_test, y_test))


In [None]:
# acc = history_aug.history['accuracy']
# val_acc = history_aug.history['val_accuracy']
# loss = history_aug.history['loss']
# val_loss = history_aug.history['val_loss']

# epochs = range(len(acc))

# plt.plot(epochs, acc, 'r', label='Training accuracy')
# plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
# plt.title('Training and validation accuracy')
# plt.legend(loc=0)
# plt.figure()

# plt.show()

In [None]:
def dnn_with_dropout():
    """Build a fully connected classifier regularised with dropout.

    Six relu Dense layers are each followed by a Dropout layer (every second one
    also by batch normalisation before the dropout); a 64-unit relu layer and a
    26-unit softmax output layer complete the network.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    # (units, add batch normalisation after the dense layer, dropout rate)
    hidden_spec = (
        (1024, False, 0.4),
        (1024, True, 0.4),
        (512, False, 0.2),
        (512, True, 0.2),
        (256, False, 0.1),
        (128, True, 0.05),
    )

    model = tf.keras.Sequential()
    for units, normalise, rate in hidden_spec:
        model.add(tfl.Dense(units, activation='relu'))
        if normalise:
            model.add(tfl.BatchNormalization(axis=-1))
        model.add(tfl.Dropout(rate))

    model.add(tfl.Dense(64, activation='relu'))
    model.add(tfl.Dense(26, activation='softmax'))  # output layer
    return model

In [None]:
# Train the dropout network on the augmented training set, monitoring the
# validation set, then report its loss / accuracy on the test set.
second_model = dnn_with_dropout()
second_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
history_aug = second_model.fit(
    X_train_final,
    Y_train_final,
    epochs=EPOCHS,
    validation_data=(X_val, Y_val),
)
print(second_model.evaluate(X_test, Y_test))


In [None]:
def dnn_with_l2(input_shape=(784, )):
    """Build a fully connected classifier whose hidden layers use L2 kernel regularisation.

    Args:
        input_shape: shape of one input sample (without the batch dimension).

    Returns:
        An uncompiled functional ``tf.keras.Model`` ending in a 26-unit softmax layer.
    """
    # (units, add batch normalisation after the dense layer)
    hidden_spec = (
        (1024, False),
        (1024, True),
        (512, False),
        (512, True),
        (256, False),
        (128, True),
        (64, False),
    )

    inputs = tfl.Input(shape=input_shape)
    hidden = inputs
    for units, normalise in hidden_spec:
        hidden = tfl.Dense(units, activation='relu', kernel_regularizer='l2')(hidden)
        if normalise:
            hidden = tfl.BatchNormalization(axis=-1)(hidden)

    # output layer (not regularised)
    outputs = tfl.Dense(26, activation='softmax')(hidden)
    return tf.keras.Model(inputs, outputs)

In [None]:
# Train the L2-regularised network the same way as the dropout network above.
third_model = dnn_with_l2()
third_model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])
# fit() previously ran with its default single epoch and without validation data;
# EPOCHS and the validation set are now used, as for second_model.
third_model.fit(X_train_final, Y_train_final, epochs=EPOCHS, validation_data=(X_val, Y_val))
print(third_model.evaluate(X_test, Y_test))

## TRAIN / TEST similarity
The models created so far perform remarkably well on the validation set while achieving only modest results on the test set. This raises the possibility that the test set has inherent characteristics / features that are simply absent from the training dataset. It might be worthwhile to explore this possibility.

In [None]:
## First let's check out manually the optimal central portion of the image, as 784
## features might represent a challenge to any model. To do so, the first training
## row carrying each label is located.

sample = 20

u_lab = np.unique(y_train.values)
# occ[label] is the positional index of the first training row with that label,
# or -1 when the label never occurs. One slot per label value 0 .. max(label):
# the previous layout (max(label) slots indexed by label - 1) made label 0 wrap
# around to the last slot and collide with the highest label.
occ = np.full(np.max(u_lab) + 1, -1, dtype=int)
print(occ)
print(u_lab)

u_lab_test = np.unique(y_test.values)
print(u_lab_test)

count = 0
# Iterating over the labels bounds the search by the data length; the previous
# while-loop could run past the end when a slot was never filled.
for i, label in enumerate(y_train.values):
    if occ[label] == -1:
        # store the row index itself (the previous "+= i" stored i - 1)
        occ[label] = i
        count += 1
        if count == len(u_lab):
            break  # every label present in the data has been located

print(occ)
  


In [None]:
# Draw the samples referenced by the first five entries of occ twice: first
# passed through augmented_image (t=0.0, res=False), then unmodified.
renderers = (
    lambda row: augmented_image(
        transform_row_to_pic(train_df, row) * 255.0, t=0.0, central_frac=0.9, res=False
    ),
    lambda row: transform_row_to_pic(train_df, row) * 255.0,
)

for render in renderers:
    plt.figure(figsize=(30, 30))
    for i in range(min(5, len(occ))):
        index = occ[i]
        if index == -1:
            # no sample recorded for this slot
            continue
        plt.subplot(5, 5, i + 1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(render(index), cmap=plt.cm.binary)
        # the numerical label associated with the hand sign is the letter's order in the alphabet
        plt.xlabel(string.ascii_letters[y_train[index]])
    plt.show()


In [None]:
# As we can see, a central crop of 90% is good enough.
# NOTE(review): augmented_image hard-codes central_fraction=0.8 in its body, so the
# crop taken below keeps that fraction even though central_frac=0.9 is passed.

# Build a "training vs. testing" data set: label 1 for every training image and 0
# for every testing image. The labels are built separately instead of being written
# into train_df / test_df, so this cell no longer mutates those frames.
all_data_y = pd.Series(
    np.concatenate([
        np.ones(len(train_df), dtype=np.int64),
        np.zeros(len(test_df), dtype=np.int64),
    ])
)

# Drop a stale "y" column if one is present, so only pixel columns remain.
all_pixels = pd.concat([
    train_df.drop(columns="y", errors="ignore"),
    test_df.drop(columns="y", errors="ignore"),
]).to_numpy()

# Each row is reshaped and scaled here directly, so the right row is used regardless
# of any global loop variable. The single channel axis is dropped by indexing rather
# than by a hard-coded reshape(24, 24), so the code does not depend on the crop size.
all_data = np.array([
    np.asarray(
        augmented_image(row.reshape(img_size) / pixel_range, t=0.0, central_frac=0.9, res=False)
    )[..., 0]
    for row in all_pixels
])
print(all_data[:5])

In [None]:
# Shape of the stacked cropped images: (number of images, crop height, crop width).
all_data.shape

In [None]:
# 80/20 split of the combined images, stratified on the train/test indicator so
# both parts keep the same proportion of training and testing images.
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    all_data,
    all_data_y.values,
    test_size=0.2,
    random_state=random_state,
    stratify=all_data_y.values,
)


In [None]:
## Let's consider a powerful model: the same dense stack as the first model, fed
## by a Flatten layer and ending in a single sigmoid unit (binary output).

similarity_model = tf.keras.Sequential()
similarity_model.add(tfl.Flatten())
for first_units, second_units in ((1024, 1024), (512, 512), (256, 128), (64, 32)):
    similarity_model.add(tfl.Dense(first_units, activation='relu'))
    similarity_model.add(tfl.Dense(second_units, activation='relu'))
    similarity_model.add(tfl.BatchNormalization())
similarity_model.add(tfl.Dense(1, activation='sigmoid'))  # output layer


In [None]:
# Train the train-vs-test classifier and report loss, accuracy and AUC on the held-out part.
similarity_model.compile(optimizer='adam', metrics=['acc', tf.keras.metrics.AUC()], loss=tf.keras.losses.BinaryCrossentropy())
similarity_model.fit(data_X_train, data_y_train, epochs=10)
# "evalutate" was a typo that raised AttributeError; the Keras method is evaluate().
print(similarity_model.evaluate(data_X_test, data_y_test))

### Convolutional Neural Networks
For image recognition problems, Convolutional Neural Networks represent more powerful solutions, as they extract features from pictures and use them to classify images properly.