In [1]:
# imports for array-handling and plotting and time
import numpy as np
import time

import matplotlib
import matplotlib.pyplot as plt

# keras imports for the dataset and building neural network
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Conv2D, Flatten, MaxPooling2D
from keras.utils import np_utils, to_categorical
from keras.wrappers.scikit_learn import KerasClassifier

# sklearn imports for model selection, data preparation and classifiers
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

images = np.loadtxt("handwritten_digits_images.csv", delimiter=',')
labels = np.loadtxt("handwritten_digits_labels.csv", delimiter=',')

# Making sure that the values are float so that we can get decimal points after division
images = images.astype('float32')

# Normalizing the RGB codes by dividing it to the max RGB value.
images /= 255

X_train, X_test, y_train, y_test = train_test_split(images, labels,test_size=0.3, random_state=32)

#When we are not wrapping our nn into a KerasClassifier we need hot encoded y 
y_train_enc = np_utils.to_categorical(y_train, 10)
y_test_enc = np_utils.to_categorical(y_test, 10)

images1 = images[1::4]
labels1 = labels[1::4]

pca = IncrementalPCA(n_components=40, batch_size=100)
images_pca = pca.fit_transform(images1)

X_train3, X_test3, y_train3, y_test3 = train_test_split(images_pca, labels1,test_size=0.3, random_state=32)

Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [2]:
print()
print("---------------------- 1HiddenLayer NN --------------------------------------")
print()

def make_model1():
    
    model = Sequential()
    model.add(Dense(512, input_shape=(784,)))
    model.add(Activation('relu'))                            
    model.add(Dropout(0.2))
    model.add(Dense(10))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')    
    
    return model


def tuning1(X_train,Y_train,X_test,Y_test):

    batch_size = [50, 80, 100, 128, 135, 150]
    epochs = [15,20,25]
    param_grid = dict(batch_size=batch_size, nb_epoch=epochs)

    k_model = KerasClassifier(build_fn=make_model1, verbose=0)
   
    clf = GridSearchCV(estimator=k_model, param_grid=param_grid, 
                                   cv=3,
                                   scoring="accuracy", verbose=0 ,n_jobs=-1)
    clf.fit(X_train,Y_train)
    
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    
    model=make_model1()
    
    return clf,model
  
tuning1_start = time.time()    
clf1 = tuning1(X_train,y_train,X_test,y_test)
tuning1_end = time.time()
print()
model1 = clf1[1]

fit1_start=time.time()
history1 = model1.fit(X_train,y_train_enc, batch_size=50, epochs=15 ,verbose=2,validation_data=(X_test, y_test_enc))
fit1_end=time.time()

#Evaluating the model
loss1, accuracy1 = model1.evaluate(X_test, y_test_enc, verbose=2)

print()
print("Test Loss : ", loss1)
print("Test Accuracy : ", round(accuracy1*100, 2), "%")
print("Cross Validation Time : ", round(tuning1_end-tuning1_start, 2), "sec" )
print("Training Time : ",  round(fit1_end-fit1_start, 2), "sec"  )


---------------------- 1HiddenLayer NN --------------------------------------

Best parameters set found on development set:

{'batch_size': 50, 'nb_epoch': 20}

Train on 49000 samples, validate on 21000 samples
Epoch 1/15
 - 5s - loss: 0.2508 - acc: 0.9270 - val_loss: 0.1417 - val_acc: 0.9597
Epoch 2/15
 - 5s - loss: 0.1098 - acc: 0.9669 - val_loss: 0.1038 - val_acc: 0.9683
Epoch 3/15
 - 5s - loss: 0.0766 - acc: 0.9766 - val_loss: 0.0828 - val_acc: 0.9750
Epoch 4/15
 - 5s - loss: 0.0585 - acc: 0.9816 - val_loss: 0.0890 - val_acc: 0.9733
Epoch 5/15
 - 5s - loss: 0.0442 - acc: 0.9860 - val_loss: 0.0756 - val_acc: 0.9778
Epoch 6/15
 - 5s - loss: 0.0360 - acc: 0.9883 - val_loss: 0.0812 - val_acc: 0.9755
Epoch 7/15
 - 5s - loss: 0.0311 - acc: 0.9900 - val_loss: 0.0740 - val_acc: 0.9780
Epoch 8/15
 - 5s - loss: 0.0260 - acc: 0.9918 - val_loss: 0.0722 - val_acc: 0.9789
Epoch 9/15
 - 5s - loss: 0.0252 - acc: 0.9914 - val_loss: 0.0745 - val_acc: 0.9805
Epoch 10/15
 - 5s - loss: 0.0192 - acc: 

In [2]:
print()
print("---------------------- Convolutional NN --------------------------------------")
print()

#reshape data to fit model
X_train_conv = X_train.reshape(49000,28,28,1)
X_test_conv = X_test.reshape(21000,28,28,1)

def make_model2():
    
    model = Sequential()

    model.add(Conv2D(10,kernel_size=5, activation='relu', input_shape=(28,28,1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(20, kernel_size=5, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])   
    
    return model



def tuning2(X_train,Y_train,X_test,Y_test):

    
    epochs = list(range(1,5))
    param_grid = dict(nb_epoch=epochs)

    k_model = KerasClassifier(build_fn=make_model2, verbose=0)
   
    clf = GridSearchCV(estimator=k_model, param_grid=param_grid, 
                                   cv=5,
                                   scoring="accuracy", verbose=0 ,n_jobs=-1)
    clf.fit(X_train,Y_train)
    
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    
    model=make_model2()
    
    return clf,model
    
tuning2_start = time.time()    
clf2 = tuning2(X_train_conv,y_train,X_test_conv,y_test)
tuning2_end = time.time()
print()
model2 = clf2[1]

fit2_start=time.time()
history2 = model2.fit(X_train_conv, y_train_enc, validation_data=(X_test_conv, y_test_enc), epochs=3)
fit2_end=time.time()

#Evaluating the model
loss2, accuracy2 = model2.evaluate(X_test_conv, y_test_enc, verbose=2)

print()
print("Test Loss : ", loss2)
print("Test Accuracy : ", round(accuracy2*100, 2), "%")
print("Cross Validation Time : ", round(tuning2_end-tuning2_start, 2), "sec" )
print("Training Time : ",  round(fit2_end-fit2_start, 2), "sec"  )

predicted_classes = model2.predict_classes(X_test_conv)

# see which we predicted correctly and which not ----- Indices of elements that are non-zero.
correct_indices = np.nonzero(predicted_classes == y_test)[0]
incorrect_indices = np.nonzero(predicted_classes != y_test)[0]

print()
print(len(correct_indices)," classified correctly")
print(len(incorrect_indices)," classified incorrectly")

error_rate = len(incorrect_indices)/(len(correct_indices)+len(incorrect_indices))

print()
print("Error rate : ", round(error_rate*100, 2), "%")


---------------------- Convolutional NN --------------------------------------

Best parameters set found on development set:

{'nb_epoch': 1}

Train on 49000 samples, validate on 21000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

Test Loss :  0.055206523373013454
Test Accuracy :  98.37 %
Cross Validation Time :  224.88 sec
Training Time :  46.01 sec

20657  classified correctly
343  classified incorrectly

Error rate :  1.63 %


In [5]:
print()
print("---------------------- K-NN Classifier --------------------------------------")
print()

KnnClassifier=KNeighborsClassifier()

grid_params = {"n_neighbors": range(1, 10)}

tuning3_start = time.time()    
grid_search = GridSearchCV(KnnClassifier, grid_params, verbose=0, n_jobs=-1)
tuning3_end = time.time()

fit3_start=time.time()
grid_search.fit(X_train3, y_train3)
fit3_end=time.time()

predicted = grid_search.predict(X_test3)
acc = accuracy_score(y_test3, predicted)

print()
print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)

print()
print("Test Accuracy : ", round(acc*100, 2), "%")
print("Cross Validation Time : ", round(tuning3_end-tuning3_start, 2), "sec" )
print("Training Time : ",  round(fit3_end-fit3_start, 2), "sec"  )



---------------------- K-NN Classifier --------------------------------------


Best parameters set found on development set:

{'n_neighbors': 3}

Test Accuracy :  96.34 %
Cross Validation Time :  0.0 sec
Training Time :  42.37 sec


In [6]:
print()
print("---------------------- RandomForest Classifier --------------------------------------")
print()

rfc = RandomForestClassifier(n_jobs=-1)

grid_params1 = {"n_estimators": range(1, 20)}

tuning4_start = time.time()    
grid_search1 = GridSearchCV(rfc, grid_params1, verbose=0, n_jobs=-1)
tuning4_end = time.time()

fit4_start=time.time()
grid_search1.fit(X_train, y_train)
fit4_end=time.time()

predicted1 = grid_search1.predict(X_test)
acc1 = accuracy_score(y_test, predicted1)

print()
print("Best parameters set found on development set:")
print()
print(grid_search1.best_params_)

print()
print("Test Accuracy : ", round(acc1*100, 2), "%")
print("Cross Validation Time : ", round(tuning4_end-tuning4_start, 2), "sec" )
print("Training Time : ",  round(fit4_end-fit4_start, 2), "sec"  )


---------------------- RandomForest Classifier --------------------------------------


Best parameters set found on development set:

{'n_estimators': 19}

Test Accuracy :  95.57 %
Cross Validation Time :  0.0 sec
Training Time :  32.08 sec
