In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# for handling imbalancing
from imblearn.under_sampling import NearMiss
from keras.utils import np_utils

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report , confusion_matrix

import tensorflow as tf
import cv2
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization
from keras.layers import Dropout

# for learning rate decay
from keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D,MaxPooling2D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import SGD

import warnings
# Silence every warning raised through the `warnings` module for the rest of the session.
warnings.filterwarnings('ignore')

# **Alphabet Recognizer**
# Problem Statement
The training dataset contains 297,960 samples of handwritten letters. Each sample is a 28x28-pixel image, represented by 784 feature columns. The dataset also contains the target letter, encoded as a number from 0 to 25, in a separate column (in addition to the 784 feature columns).

The task is to train a model and use it to predict the handwritten letters in a test dataset of 74,490 sample images.
An output CSV file containing the test-set predictions is generated using the best configured model.



# Solution Steps
1. Read Data set
1. Examine Data set Properties
1. Inspecting the Dataframe
1. Data Preparation
    1. Copy Target Column(Alphabet) values and Remove from Training Data set
	1. Validate Training Sample size per Alphabet 
    1. Under sampling of Training dataset to balance class distribution
    1. Normalize Training dataset
    1. Reshape Training dataset for CNN Model
    1. Encode Target Variable
1. Model based on Dummy Encoded Data set 
    1. Test-Train Split
    1. Feature Scaling and Initial RFE
    1. Feature Selection Using RFE
        1. Model 1 Outcome
		1. Model 2 Outcome
		1. Model 3 Outcome
		1. Model 4 Outcome
		1. Model 5 Outcome
		1. Checking VIFs
		1. Model 6 Outcome
    1. Model Evaluation
		1. Prediction and Lead Score assignment
		1. Metrics based on Confusion Matrix
		1. ROC and AUC Metrics and Cut off selection
		1. Redo Prediction based on Selected Cut off
		1. Redo Metrics based on Confusion Matrix
		1. Metrics based on Precision and Recall
		1. Precision and recall tradeoff
		1. Analysis of Metrics
    1. Making predictions on the test set and Evaluation
    1. Analysis of Metrics
    1. Analysis of Selected Features
        1. Correlation Analysis
        1. Numeric Features
        1. Categorical Dummy Features
        1. View the Co-efficient of the Selected Model
        1. Top Positively influencing Features
        1. Top Negatively influencing Features
1. Model based on Frequency Encoded Data set 
    1. Test-Train Split
    1. Feature Scaling and Initial RFE
		1. Feature Selection Using RFE
        1. Model 7 Outcome
		1. Model 8 Outcome
		1. Checking VIFs
		1. Model 9 Outcome
		1. Model 10 Outcome
    1. Model Evaluation
		1. Prediction and Lead Score assignment
		1. Metrics based on Confusion Matrix
		1. ROC and AUC Metrics and Cut off selection
		1. Redo Prediction based on Selected Cut off
		1. Redo Metrics based on Confusion Matrix, Precision Recall
		1. Precision and recall tradeoff
		1. Analysis of Metrics
    1. Making predictions on the test set and Evaluation
    1. Analysis of Metrics
    1. Analysis of Selected Features
        1. Correlation Analysis
        1. View the Co-efficient of the Selected Model
        1. Top Positively influencing Features
        1. Top Negatively influencing Features

# Read Training Data set

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
df_train = pd.read_csv('/kaggle/input/alphabet/train.csv')

# Examine Training Dataset Properties

In [None]:
# Dimensions of the training frame as (rows, columns).
df_train.shape

# Inspecting the Dataframe


In [None]:
# Preview the first 10 rows.
df_train.head(10)

In [None]:
# Preview the last 10 rows (tail() defaults to 5, so the count is passed explicitly
# to match the head(10) preview above).
df_train.tail(10)

# Data Preparation

  ### Copy Target Column(Alphabet) values and Remove from Training Data set


In [None]:
# Split the label column ('0') off the training frame: pop() returns the column
# as the target Series and removes it from df_train in the same step, so
# df_train is left holding only the feature columns.
y_train = df_train.pop('0')

In [None]:
# Translate the numeric class labels 0..25 into the letters 'A'..'Z'.
class_ids = list(range(26))
class_letters = [chr(ord('A') + class_id) for class_id in class_ids]
alphabet_y_train = y_train.replace(class_ids, class_letters)


In [None]:
# Display the letter-mapped labels to check the mapping.
alphabet_y_train

 ### Validate Training Sample size per Alphabet 

In [None]:
# Look for class imbalance: plot how many samples each letter has.
# displot() is a figure-level function that always creates its own figure, so
# the preceding plt.figure(figsize=(10, 5)) only produced an empty extra figure
# and had no effect on the plot size. The size is requested through
# height/aspect instead (height=5, aspect=2 -> 10 x 5 inches).
sns.displot(alphabet_y_train, height=5, aspect=2)

### Under sampling of Training dataset to balance class distribution

In [None]:
# Undersample the training data with NearMiss (default settings), with the
# intent of giving every letter the same number of samples.
# NOTE(review): the original note quoted ~883 images per letter as the outcome —
# confirm against the shapes printed further down.
nM = NearMiss()
X_train_data, y_train_data = nM.fit_resample(df_train, y_train)

In [None]:
# Visualize the class distribution after balancing.
# displot() is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=(10, 5)) only left an empty figure behind and
# did not size the plot. height/aspect set the intended 10 x 5 inch size.
sns.displot(y_train_data, height=5, aspect=2)

In [None]:
# Shapes of the balanced labels and features, in that order.
y_train_data.shape , X_train_data.shape

###     Normalize Training dataset


In [None]:
# Scale every feature value by 1/255 (maps 0..255 intensities to 0..1 — assumes
# 8-bit pixel data; TODO confirm).
# NOTE: this rebinds X_train_data to its own scaled copy, so re-running this
# cell on its own divides by 255 a second time.
X_train_data = X_train_data / 255
X_train_data

### Reshape Training dataset for CNN Model 

In [None]:
# Convert to a NumPy array and reshape into (samples, 28, 28, 1), the
# single-channel image layout the Conv2D input layers below are declared with.
X_train_data = np.array(X_train_data).reshape(-1, 28, 28, 1)


In [None]:
# Show the first 25 training images in a 5x5 grid (ax.flat walks the grid
# row by row, so image k lands in the same panel as before).
f, ax = plt.subplots(5, 5)
f.set_size_inches(10, 10)
for k, axis in enumerate(ax.flat):
    axis.imshow(X_train_data[k].reshape(28, 28), cmap='gray')
# Lay the grid out once, after every panel is drawn; the call used to sit inside
# the row loop and re-ran the layout after each of the five rows.
plt.tight_layout()

### Encode Target Variable

In [None]:
# One-hot encode the balanced integer labels.
y = np_utils.to_categorical(y_train_data)
# Number of classes = width of the one-hot matrix (26 letters expected).
num_classes = y.shape[1]
num_classes

In [None]:
# Hold out 20% of the balanced data as a validation split (fixed seed for a repeatable split).
# NOTE: this rebinds y_train, which until now held the raw label column, to the
# one-hot training labels.
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y, test_size=0.2 ,random_state=102)


In [None]:
# Shapes of the training/validation features and labels after the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Baseline CNN built with Keras: one 5x5 convolution + 2x2 max-pooling, followed
# by a 128-unit dense layer and a softmax output with one unit per class.
model = Sequential()
model.add(Conv2D(32, (5, 5), input_shape=(28, 28, 1), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
# Choose an optimizer and compile the model.
model.compile(optimizer = Adam(learning_rate = 0.01), loss = 'categorical_crossentropy', metrics = ['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
def model_fit_history_plot(history):
    """Plot training vs. validation curves recorded during a model fit.

    Loss curves are drawn on figure 1 and accuracy curves on figure 2, both
    against the epoch index, and the figures are then shown.

    Args:
        history: object whose ``history`` attribute is a mapping holding the
            'loss', 'val_loss', 'accuracy' and 'val_accuracy' series.

    Returns:
        None
    """
    # (figure number, training key, validation key, title) for each panel
    panels = (
        (1, 'loss', 'val_loss', 'Loss'),
        (2, 'accuracy', 'val_accuracy', 'Accuracy'),
    )
    for figure_number, train_key, val_key, title in panels:
        plt.figure(figure_number)
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.legend(['training','validation'])
        plt.title(title)
        plt.xlabel('epoch')
    plt.show()
    return None

In [None]:
# Train the baseline model for 15 epochs (batch size 128), passing the held-out
# split as validation data; the returned history is plotted later.
history = model.fit(X_train,y_train,epochs=15, batch_size=128, validation_data=(X_test,y_test))



In [None]:
# Score the baseline on the held-out split; scores[1] is the 'accuracy' metric
# declared in compile(), reported here as an error percentage.
scores = model.evaluate(X_test, y_test, verbose=0)
error_pct = 100 - scores[1] * 100
print("CNN Error: %.2f%%" % error_pct)

# Plot the loss/accuracy curves recorded during training.
model_fit_history_plot(history)

In [None]:
# Grid Search to determine the layers and neurons in each layer in the sequential model.
def create_model(layers):
    """Build and compile a CNN whose dense head is described by ``layers``.

    The network is two (3x3 convolution with 32 filters -> 2x2 max-pool) stages,
    a flatten, one ReLU dense layer per entry of ``layers`` and a 26-unit
    softmax output.

    Args:
        layers: iterable of ints; each value is the unit count of one hidden
            dense layer, added in the given order.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    cnn_model = tf.keras.models.Sequential()
    cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu", input_shape=[28, 28, 1]))
    cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
    cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu"))
    cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
    cnn_model.add(tf.keras.layers.Flatten())

    for nodes in layers:
        cnn_model.add(tf.keras.layers.Dense(units=nodes, activation='relu'))

    cnn_model.add(tf.keras.layers.Dense(units=26, activation='softmax'))

    # The output is a softmax over 26 mutually exclusive classes trained on
    # one-hot targets, so the matching loss is categorical cross-entropy.
    # Binary cross-entropy would score each output unit as an independent
    # yes/no problem (and, in some Keras versions, makes 'accuracy' resolve to
    # binary accuracy, which overstates the result).
    cnn_model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return cnn_model

# Wrap the model builder for scikit-learn and cross-validate each candidate
# dense-head layout.
model = KerasClassifier(build_fn=create_model, verbose=1)
# Every candidate is a tuple. The single-layer candidate used to be the list
# [128]; list-valued parameters are prone to failing scikit-learn's clone()
# identity check on KerasClassifier when the best estimator is refit, and the
# tuple form is consistent with the other candidates. create_model only
# iterates over the value, so the built models are unchanged.
layers = [(128,), (256, 128), (200, 150, 120)]
param_grid = dict(layers=layers)
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=1)
grid_results = grid.fit(X_train,y_train, validation_data=(X_test, y_test))

# Report the winning layout, then the mean/std cross-validation score of every candidate.
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))
# Kept for the dropout search and the final model further down.
best_layer_size = grid_results.best_params_['layers']
    

In [None]:
#Grid Search to determine the batch size
def create_model1():
    """Build and compile the fixed-architecture CNN used for the batch-size search.

    Two (3x3 convolution with 32 filters -> 2x2 max-pool) stages, a flatten,
    dense layers of 256 and 128 ReLU units, and a 26-unit softmax output.
    NOTE(review): the dense head is hard-coded to (256, 128) rather than taken
    from the layer grid search — confirm this is intended.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    cnn_model = tf.keras.models.Sequential()
    cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu", input_shape=[28, 28, 1]))
    cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
    cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu"))
    cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
    cnn_model.add(tf.keras.layers.Flatten())
    cnn_model.add(tf.keras.layers.Dense(units=256, activation='relu'))
    cnn_model.add(tf.keras.layers.Dense(units=128, activation='relu'))
    cnn_model.add(tf.keras.layers.Dense(units=26, activation='softmax'))

    # A 26-way softmax trained on one-hot targets needs categorical
    # cross-entropy; binary cross-entropy would treat each output unit as an
    # independent yes/no problem.
    cnn_model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return cnn_model

# Wrap the fixed-architecture builder for scikit-learn and cross-validate each
# candidate mini-batch size.
model = KerasClassifier(build_fn=create_model1, verbose=1)

batch_size = [15, 20, 40, 50]
param_grid = {'batch_size': batch_size}

grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=1)
grid_results = grid.fit(X_train, y_train, validation_data=(X_test, y_test))

# Report the winner, then the mean cross-validation score of every candidate.
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
params = grid_results.cv_results_['params']
for idx, param in enumerate(params):
    print('{0} with: {1}'.format(means[idx], param))
# Kept for the dropout search and the final training run.
best_batch_size = grid_results.best_params_['batch_size']

In [None]:
#Grid Search to determine the dropout rate

def create_model2(dropout):
    """Build and compile the CNN used for the dropout-rate search.

    Two (3x3 convolution with 32 filters -> 2x2 max-pool) stages, a flatten,
    then one ReLU dense layer followed by a dropout layer for every entry of
    the module-level ``best_layer_size``, and a 26-unit softmax output.

    Args:
        dropout: dropout rate applied after each hidden dense layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    cnn_model = tf.keras.models.Sequential()
    cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu", input_shape=[28, 28, 1]))
    cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
    cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu"))
    cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
    cnn_model.add(tf.keras.layers.Flatten())
    # Iterate over the whole winning layout. Indexing best_layer_size[0] and
    # [1] raised IndexError when the one-layer candidate won the layer search
    # and silently dropped the third layer of the three-layer candidate.
    for units in best_layer_size:
        cnn_model.add(tf.keras.layers.Dense(units=units, activation='relu'))
        cnn_model.add(Dropout(dropout))
    cnn_model.add(tf.keras.layers.Dense(units=26, activation='softmax'))

    # A 26-way softmax trained on one-hot targets needs categorical
    # cross-entropy; binary cross-entropy would treat each output unit as an
    # independent yes/no problem.
    cnn_model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return cnn_model

# Wrap the dropout-aware builder for scikit-learn, training with the batch size
# chosen by the previous search, and cross-validate each candidate dropout rate.
model = KerasClassifier(build_fn=create_model2, verbose=1, batch_size=best_batch_size)

dropout = [0.0, 0.1, 0.2]
param_grid = {'dropout': dropout}

grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=1)
grid_results = grid.fit(X_train, y_train, validation_data=(X_test, y_test))

# Report the winner, then the mean cross-validation score of every candidate.
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
params = grid_results.cv_results_['params']
for idx, param in enumerate(params):
    print('{0} with: {1}'.format(means[idx], param))
# Kept for the final model definition.
best_dropout_rate = grid_results.best_params_['dropout']

In [None]:
#Definition of the final CNN model, assembled from the grid-search winners
# (best_layer_size, best_dropout_rate, best_batch_size).

cnn_model = tf.keras.models.Sequential()
cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu", input_shape=[28, 28, 1]))
cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
cnn_model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu"))
cnn_model.add(tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='valid'))
cnn_model.add(tf.keras.layers.Flatten())
# One dense + dropout pair per entry of the winning layout. Indexing
# best_layer_size[0] and [1] raised IndexError when the one-layer candidate won
# and silently dropped the third layer of the three-layer candidate.
for units in best_layer_size:
    cnn_model.add(tf.keras.layers.Dense(units=units, activation='relu'))
    cnn_model.add(Dropout(best_dropout_rate))
cnn_model.add(tf.keras.layers.Dense(units=26, activation='softmax'))

# compile the model
# A 26-way softmax trained on one-hot targets needs categorical cross-entropy;
# binary cross-entropy would treat each output unit as an independent yes/no
# problem.
cnn_model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

# Train for 20 epochs with the selected batch size, validating on the held-out split.
history = cnn_model.fit(X_train, y_train, batch_size=best_batch_size, epochs=20,validation_data=(X_test, y_test))


# Final evaluation of the model; scores[1] is the accuracy metric.
scores = cnn_model.evaluate(X_test,y_test, verbose=0)
print("CNN Error: %.2f%%" % (100-scores[1]*100))

#Plot fit history
model_fit_history_plot(history)

In [None]:
# Load the test CSV from the Kaggle input directory into a DataFrame.
df_test = pd.read_csv('/kaggle/input/alphabet/test.csv')

In [None]:
# Dimensions of the test frame as (rows, columns).
df_test.shape

In [None]:
# Preview the first 10 rows of the test frame.
df_test.head(10)

In [None]:
# List the column labels of the test frame.
df_test.columns

In [None]:
# Pull out the first column ('Unnamed: 0') to check whether it holds the target labels.
# NOTE(review): this rebinds y_test, which until now held the one-hot validation
# labels from train_test_split; any evaluation cell re-run after this point
# would pick up this Series instead.
y_test = df_test['Unnamed: 0']

In [None]:
# Inspect the distinct values of that first column.
# The author's conclusion: the first column does not represent the letter, so the
# test data set has no target variable column and this column can safely be ignored.
# NOTE(review): confirm against the values displayed by this cell.
y_test.unique()

In [None]:
# A result frame carrying the row IDs was sketched here but never enabled:
#df_result = pd.DataFrame(data, columns=['ID', 'Prediction'])
#df_result['ID']= df_test['Unnamed: 0']

# Drop the leading 'Unnamed: 0' column from df_test in place so that only the
# feature columns remain.
df_test.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Scale the test values by 1/255, the same scaling applied to the training data.
# TODO(Adharsh): try alternative normalization techniques.
X_test_data = df_test / 255
X_test_data

In [None]:
# Convert to a NumPy array and reshape into (samples, 28, 28, 1), the layout
# the CNN input layer is declared with.
X_test_data = np.array(X_test_data)
X_test_data = X_test_data.reshape(-1,28,28,1)
# Show the first 25 test images in a 5x5 grid (ax.flat walks the grid row by
# row, so image k lands in the same panel as before).
f, ax = plt.subplots(5,5)
f.set_size_inches(10,10)
for k, axis in enumerate(ax.flat):
    axis.imshow(X_test_data[k].reshape(28,28), cmap='gray')
# Lay the grid out once, after every panel is drawn; the call used to sit inside
# the row loop and re-ran the layout after each of the five rows.
plt.tight_layout()

In [None]:
# Run the final CNN on the prepared test images; the result holds one row of
# class scores per image.
y_pred=cnn_model.predict(X_test_data)

In [None]:
# Shape of the prediction matrix (expected: one row per test image, 26 columns).
y_pred.shape

`y_pred` has 26 columns for every row; each column holds the predicted probability of one letter for that row. The next cell uses `np.argmax` along axis 1 to find, for each row, the index of the column with the highest probability, and stores that index (the predicted class, 0–25) in the `predictions` variable. (Adharsh to validate.)

In [None]:
# For every row, take the index of the column with the largest value; that
# index is the predicted class id.
predictions = y_pred.argmax(axis=1)

In [None]:
# Check the shape of the predictions (one class id per test row is expected).
predictions.shape

In [None]:
# Wrap the predicted class ids in a single-column DataFrame named 'alphabet'.
df_predictions = pd.DataFrame(predictions, columns = ['alphabet'])

In [None]:
# Translate the predicted class ids 0..25 into the letters 'A'..'Z'.
class_ids = list(range(26))
class_letters = [chr(ord('A') + class_id) for class_id in class_ids]
alphabet_y_test = df_predictions.replace(class_ids, class_letters)

In [None]:
# Display the predicted letters.
alphabet_y_test

In [None]:
# Write the predicted letters to a CSV file in the current working directory
# (with pandas defaults, the DataFrame index is written as the first column).
alphabet_y_test.to_csv('TestDataset_Prediction.csv')