In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import pprint

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sn

# Preprocessing data
from sklearn.preprocessing import StandardScaler

# Model selection and evaluation
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# Neural Networks
import keras
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training and test splits of the Sign Language MNIST dataset.
train_csv_path = "/kaggle/input/sign-language-mnist/sign_mnist_train.csv"
test_csv_path = "/kaggle/input/sign-language-mnist/sign_mnist_test.csv"

data_train = pd.read_csv(train_csv_path)
data_test = pd.read_csv(test_csv_path)

In [None]:
# Separate the target column ('label') from the remaining feature columns,
# for the training split and for the test split.
y_train = data_train['label']
X_train = data_train.drop(labels=['label'], axis=1)

y_test = data_test['label']
X_test = data_test.drop(labels=['label'], axis=1)

In [None]:
# Print the (rows, columns) shape of the training features, then display
# the first rows as the cell output.
print(X_train.shape)
X_train.head()

In [None]:
def number_to_letter(number):
    """Map a zero-based class index to its uppercase letter (0 -> 'A', 1 -> 'B', ...)."""
    code_point_of_a = ord('A')
    return chr(code_point_of_a + number)

In [None]:
# Pick one random example image per class for the preview grid.
#
# The whole frame is shuffled and the first row seen for each label is kept,
# which draws uniformly within every class. This avoids
# `groupby(...).apply(...)`, which only keeps the 'label' column in its result
# while pandas still passes grouping columns to the callback (deprecated in
# recent pandas). The fixed seed makes the preview identical on every run.
sample_images = (
    data_train
    .sample(frac=1, random_state=0)
    .drop_duplicates(subset='label')
    .sort_values('label')
)
labels = sample_images['label']
images = sample_images.drop(columns=['label'])

In [None]:
rows = 4
columns = 6
fig, axs = plt.subplots(rows, columns, figsize=(12, 12))

# Fill the grid in row-major order: panel k shows the k-th sampled image,
# titled with the letter its label maps to.
for position, ax in enumerate(axs.ravel()):
    pixels = images.iloc[position].values.reshape(28, 28)
    ax.imshow(pixels, cmap='gray')
    ax.set_title(number_to_letter(labels.iloc[position]))
    ax.axis('off')

# **Preprocess data**

In [None]:
# Standardizer for the pixel columns; it is fitted on the training features
# before being applied to the training and test splits.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Turn every flat row of pixels into a single-channel image
# (height=28px, width=28px, channels=1); -1 keeps the number of samples.
image_shape = (-1, 28, 28, 1)
X_train_reshaped = X_train_scaled.reshape(image_shape)
X_test_reshaped = X_test_scaled.reshape(image_shape)

In [None]:
# One-hot encode the targets of both splits.
# WARNING: only 24 classes actually occur because class 9 is missing, but the
# one-hot width has to cover the largest label index, so 25 columns are used.
num_classes = 25
y_train_categorical, y_test_categorical = (
    to_categorical(targets, num_classes) for targets in (y_train, y_test)
)

# **Machine Learning Model - Convolutional Neural Network**

In [None]:
def cnn_algorithm_test(my_epochs=10):
    """Train the baseline CNN on the notebook's training set and score it on the test set.

    Reads the notebook-level variables ``X_train_reshaped``,
    ``y_train_categorical``, ``X_test_reshaped``, ``y_test_categorical`` and
    ``num_classes``.

    Args:
        my_epochs: number of training epochs.

    Returns:
        Tuple ``(history, test_loss, test_accuracy)`` where ``history`` is the
        object returned by ``model.fit`` and the two scores come from
        ``model.evaluate`` on the test set.
    """
    batch_size = 1000
    my_dropout = 0.25

    model = Sequential()
    # Two convolution -> max-pooling -> dropout stages.
    model.add(Conv2D(32, kernel_size=(3, 3),
                     activation='relu',
                     input_shape=(28, 28, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(my_dropout))

    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(my_dropout))

    # Classifier head: one hidden dense layer, then a softmax over the classes.
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adadelta(),
                  metrics=['accuracy'])

    history = model.fit(X_train_reshaped, y_train_categorical,
                        batch_size=batch_size,
                        epochs=my_epochs,
                        verbose=1)

    # evaluate() yields [loss, accuracy] because 'accuracy' is the only metric.
    # The former model.predict() call on the test set was dropped: its result
    # was never used, so it only cost an extra pass over the test data.
    score = model.evaluate(X_test_reshaped, y_test_categorical, verbose=0)
    print('Test loss:{}, accuracy:{}'.format(score[0], score[1]))
    return history, score[0], score[1]

In [None]:
def cnn_algorithm(X,y,dropout,max_pooling,conv2d,batch_size=1000,epochs=20):
    """Standardize ``X``, train a two-stage CNN on it and return the fitted pieces.

    The function depends only on its arguments. It no longer runs
    ``predict``/``evaluate`` on the notebook-level test arrays, whose results
    were discarded anyway.

    Args:
        X: 2-D array of flattened images; every row is reshaped to 28x28x1.
        y: integer class labels, one-hot encoded to 25 columns.
        dropout: dropout rate used after both pooling layers and after the
            hidden dense layer.
        max_pooling: side length of the square max-pooling window.
        conv2d: number of filters in each of the two convolution layers.
        batch_size: mini-batch size passed to ``model.fit``.
        epochs: number of training epochs.

    Returns:
        Tuple ``(history, model, scaler)``: the object returned by
        ``model.fit``, the trained model, and the ``StandardScaler`` fitted on
        ``X``. Callers can apply that scaler to other data before evaluating
        the model.
    """
    # Normalization: the scaler is fitted on the data given to this call only.
    scaler = StandardScaler()
    scaled = scaler.fit_transform(X)

    # Reshape image (h=28px, w=28px, channels=1)
    images = scaled.reshape(-1, 28, 28, 1)

    # One-hot encode the targets.
    num_classes = 25
    targets = to_categorical(y, num_classes)

    model = Sequential()
    model.add(Conv2D(conv2d, kernel_size=(3, 3),
                     activation='relu',
                     input_shape=(28, 28, 1)))
    model.add(MaxPooling2D(pool_size=(max_pooling, max_pooling)))
    model.add(Dropout(dropout))
    # The second convolution uses a 6x6 kernel, unlike the 3x3 of the first.
    model.add(Conv2D(filters=conv2d,
                     kernel_size=6,
                     activation='relu'))
    model.add(MaxPooling2D(pool_size=(max_pooling, max_pooling)))
    model.add(Dropout(dropout))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adadelta(),
                  metrics=['accuracy'])

    history = model.fit(images, targets,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1)

    return history, model, scaler

In [None]:
def _train_and_score(dropout=0.3, max_pooling=3, conv2d=312, batch_size=1000, epochs=20):
    """Train with ``cnn_algorithm`` on the full training set; return test ``(loss, accuracy)``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The hyper-parameter defaults are the combination used by
    ``test_several_epochs``; ``batch_size`` and ``epochs`` mirror the defaults
    of ``cnn_algorithm``.
    """
    _, model, fitted_scaler = cnn_algorithm(
        X=X_train.values, y=y_train.values,
        dropout=dropout, max_pooling=max_pooling, conv2d=conv2d,
        batch_size=batch_size, epochs=epochs)
    # The test set has to go through the scaler fitted during this training run.
    test_images = fitted_scaler.transform(X_test.values).reshape(-1, 28, 28, 1)
    test_targets = to_categorical(y_test.values, 25)
    loss, accuracy = model.evaluate(test_images, test_targets, verbose=0)
    return loss, accuracy


def test_several_dropout():
    """Train once per dropout rate, then print and plot the test accuracy of each run."""
    DROPOUTS = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4 , 0.45, 0.5]
    results = []
    for d in DROPOUTS:
        _, accuracy = _train_and_score(dropout=d)
        results.append(accuracy)
    for rate, accuracy in zip(DROPOUTS, results):
        print("Acc: {}  Drop: {}".format(accuracy, rate))
    plt.plot(DROPOUTS, results)
    plt.show()


def test_several_epochs():
    """Plot the mean test accuracy over COUNT runs for each number of epochs."""
    EPOCHS = [10, 20, 30,40,50,60, 80]
    # (dropout, conv2d filters, max-pooling size) used for every run.
    dropout, conv2d, max_pooling = 0.3, 312, 3
    COUNT = 5

    mean_acc = []
    for e in EPOCHS:
        # Repeat each setting COUNT times and average the accuracies.
        run_acc = [
            _train_and_score(dropout=dropout, max_pooling=max_pooling,
                             conv2d=conv2d, epochs=e)[1]
            for _ in range(COUNT)
        ]
        mean_acc.append(np.mean(run_acc))
    plt.plot(EPOCHS,mean_acc, label="accuracy")
    plt.title("Score across different epochs")
    # Only accuracy is drawn, so the legend is taken from the line's own label.
    plt.legend()
    plt.show()


def test_several_batchsize():
    """Train once per batch size and plot test accuracy and test loss."""
    BATCHSIZE = [100, 1000, 3000,5000,10000]
    my_acc = []
    my_loss = []
    for b in BATCHSIZE:
        loss, accuracy = _train_and_score(batch_size=b)
        my_acc.append(accuracy)
        my_loss.append(loss)
    plt.plot(BATCHSIZE,my_acc, label="accuracy")
    plt.plot(BATCHSIZE,my_loss, label="loss")
    plt.title("Score across different batch sizes")
    plt.legend(["accuracy", "loss"])
    plt.show()
        
        

In [None]:
def generateKFolds(X, y, realTestX, realTestY, splits, examples=[300], combinations=[]):
    """Score each hyper-parameter combination with K-fold cross-validation and plot the result.

    For every entry of ``examples`` and every combination, a model is trained
    on each fold with ``cnn_algorithm`` and its last-epoch training accuracy
    and held-out fold accuracy are recorded. One more model is then trained on
    all of ``X`` and scored on the real test set.

    Args:
        X: 2-D array of flattened images (unscaled).
        y: integer class labels aligned with ``X``.
        realTestX: test images passed directly to ``model.evaluate``, so they
            must already be scaled and reshaped the way the model expects.
        realTestY: one-hot encoded test targets.
        splits: number of folds.
        examples: the folds are built on the first ``n`` rows of ``X``, for
            each ``n`` in this list. The default list is never mutated.
        combinations: ``(dropout, conv2d, max_pooling)`` tuples to evaluate.
            The default list is never mutated.

    Returns:
        Tuple ``(testScores, trainScores, validationScores)``: dictionaries
        mapping every combination to the list of accuracies collected for it.
    """
    # Initialize data
    validationScores = {combo: [] for combo in combinations}
    trainScores = {combo: [] for combo in combinations}
    testScores = {combo: [] for combo in combinations}

    for training_samples in examples:
        kfold = KFold(n_splits=splits)
        for dropout, conv2d, max_pooling in combinations:
            key = (dropout, conv2d, max_pooling)
            for train_index, test_index in kfold.split(X[:training_samples]):
                print(f"Train index {train_index}")
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                history, model, scaler = cnn_algorithm(
                    X=X_train, y=y_train, dropout=dropout,
                    max_pooling=max_pooling, conv2d=conv2d)

                # The held-out fold goes through the scaler fitted on this fold's training part.
                X_test_scaled = scaler.transform(X_test)
                X_test_reshaped = X_test_scaled.reshape(-1, 28, 28, 1)
                # One-hot width is 25 because the label range has unused classes.
                y_test_categorical = to_categorical(y_test, 25)

                trainScores[key].append(history.history["accuracy"][-1])
                validationScores[key].append(
                    model.evaluate(X_test_reshaped, y_test_categorical)[1])

            # The final model is trained on all of X (not only the first
            # `training_samples` rows) and scored on the real test set.
            history, model, scaler = cnn_algorithm(
                X=X, y=y, dropout=dropout,
                max_pooling=max_pooling, conv2d=conv2d)
            testScores[key].append(model.evaluate(realTestX, realTestY)[1])

    # One mean per combination for every curve, so the three lines are comparable.
    mean_per_val_sample = [np.mean(validationScores[combo]) for combo in validationScores]
    mean_per_train_sample = [np.mean(trainScores[combo]) for combo in trainScores]
    mean_per_test_sample = [np.mean(testScores[combo]) for combo in testScores]

    x_values = [str(combo[:]) for combo in combinations]
    plt.figure(num=1, figsize=(8, 10), dpi=1200)
    plt.rcParams.update({'font.size': 12})
    plt.plot(x_values, mean_per_val_sample, label="validation accuracy")
    plt.plot(x_values, mean_per_train_sample, label="train accuracy", linestyle="-.")
    plt.plot(x_values, mean_per_test_sample, label="test accuracy", linestyle="--")
    # The tick tuples are ordered (dropout, conv2d, max_pooling); the label follows that order.
    plt.xlabel("Training examples (Dropout,Conv2d,Max Pooling) ")
    plt.xticks(rotation=90)
    plt.ylabel("Accuracy")
    plt.ylim([0, 1.05])
    plt.legend(loc='lower right')
    plt.title(f"K-Fold with {splits} splits")
    plt.show()
    return testScores, trainScores, validationScores

In [None]:
# Grid explored by the K-fold search: every (dropout, conv2d filters,
# max-pooling size) triple built from the three lists below is evaluated.
# The single-parameter sweeps (test_several_dropout, test_several_batchsize,
# test_several_epochs) are not run here.
dropout = [0, 0.2, 0.3, 0.4, 0.5]
conv2d = [32, 124, 312]
maxpooling = [2, 3]

combinations = []
for d in dropout:
    for c in conv2d:
        for m in maxpooling:
            combinations.append((d, c, m))

generateKFolds(
    X=X_train.values,
    y=y_train.values,
    realTestX=X_test_reshaped,
    realTestY=y_test_categorical,
    splits=5,
    examples=[27455],
    combinations=combinations,
)

In [None]:
# Train the final model (dropout=0.4, 3x3 max pooling, 312 filters) on the
# full training set and predict class probabilities for the test set.
history, model, scaler = cnn_algorithm(X=X_train.values, y=y_train.values, dropout=0.4, max_pooling=3, conv2d=312)
predictions = model.predict(X_test_reshaped)

# Letters shown in the matrix ('J' and 'Z' are not listed) and the class index
# each of them corresponds to.
letters = ['A','B','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y']
class_ids = [ord(letter) - ord('A') for letter in letters]

true_values = y_test.values
# `Sequential.predict_classes` is not available in recent Keras releases; the
# arg-max over the softmax output gives the same class indices and reuses the
# predictions computed above instead of running the network a second time.
pred_values = predictions.argmax(axis=1)

# Passing `labels` pins the matrix to exactly one row/column per listed
# letter, so it always matches the 24x24 DataFrame built below.
confusion = confusion_matrix(true_values, pred_values, labels=class_ids)

# Number of test samples per true class, as a column vector, so that every row
# is expressed as a fraction of that class. Rows of classes without any test
# sample stay at zero instead of dividing by zero.
class_counts = np.bincount(true_values, minlength=max(class_ids) + 1)[class_ids].reshape(-1, 1)
confusion2 = np.divide(confusion, class_counts,
                       out=np.zeros(confusion.shape, dtype=float),
                       where=class_counts > 0)

df_cm = pd.DataFrame(confusion2, letters, letters)
plt.figure(figsize=(20,20))
sn.set(font_scale=1.4) # for label size
sn.heatmap(df_cm,annot=True, annot_kws={"size": 10},fmt=".3g") # font size
plt.xlabel('Predicted')
plt.ylabel('Real Value')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Training loss per epoch, read from the `history` object of the last training run.
plt.plot(history.history["loss"], label="loss")
plt.xlabel("Epochs")
# Raw string: "\T" is not a valid Python escape sequence, and the backslash
# has to reach matplotlib's math renderer unchanged.
plt.ylabel(r"$J(\Theta)$")
plt.title("Cost Function")
plt.show()

In [None]:

# Predicted class index of every test sample, keyed by its position.
preds_labels = pd.Series(predictions.argmax(axis=1))
dict_pred_labels = preds_labels.to_dict()

# Keep only the samples whose prediction differs from the true label.
failed_preds = {
    position: predicted
    for position, predicted in dict_pred_labels.items()
    if predicted != y_test[position]
}
print(len(failed_preds))

# count[true letter][predicted letter] -> number of times that confusion happened.
count = {}
for position, predicted in failed_preds.items():
    true_letter = number_to_letter(y_test[position])
    predicted_letter = number_to_letter(predicted)
    confusions = count.setdefault(true_letter, {})
    confusions[predicted_letter] = confusions.get(predicted_letter, 0) + 1

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(count)

In [None]:
# Predicted class index for every test sample.
preds_labels = pd.Series(predictions.argmax(axis=1))

# Series containing only the mislabeled elements; its index holds the
# positions of those samples in the test set.
is_mislabeled = preds_labels != y_test
failed_preds = preds_labels[is_mislabeled]

In [None]:
rows = 4
columns = 6
fig, axs = plt.subplots(rows, columns, figsize=(12, 12))

# Show the first rows*columns misclassified test images in row-major order,
# each titled with its true letter and the letter the model predicted.
for position, ax in enumerate(axs.ravel()):
    index = failed_preds.index[position]
    pixels = X_test.loc[index].values.reshape(28, 28)
    ax.imshow(pixels, cmap='gray')
    true_letter = number_to_letter(y_test.loc[index])
    predicted_letter = number_to_letter(predictions[index].argmax())
    ax.set_title('Label: {}\n Predicted: {}'.format(true_letter, predicted_letter))
    ax.axis('off')