In [1]:
import numpy as np
import pandas as pd

In [2]:
def read_dataset(feature_file, label_file):
    """Load a feature CSV and a label CSV and return them as NumPy arrays.

    Both files are parsed with ``pd.read_csv`` using its default options and
    the resulting frames are converted with ``.values``.

    Args:
        feature_file: Path (or file-like object) of the CSV holding features.
        label_file: Path (or file-like object) of the CSV holding labels.

    Returns:
        tuple: ``(X, y)`` -- the array form of the feature frame and of the
        label frame, in that order.
    """
    # The generator is consumed left to right, so features are read first.
    features, labels = (
        pd.read_csv(source).values for source in (feature_file, label_file)
    )
    return features, labels

def normalize_features(X_train, X_test):
    """Standardize both splits using statistics fitted on the training split.

    A scikit-learn ``StandardScaler`` is fitted on ``X_train`` only and then
    applied to both ``X_train`` and ``X_test``, so the test split never
    influences the scaling parameters.

    Args:
        X_train: Training feature matrix used to fit the scaler.
        X_test: Test feature matrix transformed with the fitted scaler.

    Returns:
        tuple: ``(X_train_norm, X_test_norm)``.
    """
    from sklearn.preprocessing import StandardScaler

    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

def one_hot_encoder(y_train, y_test):
    """Binarize the labels of both splits with one shared encoder.

    A scikit-learn ``LabelBinarizer`` is fitted on ``y_train`` and then used
    to transform both ``y_train`` and ``y_test``, so both outputs share the
    same class-to-column mapping.

    Args:
        y_train: Training labels used to fit the encoder.
        y_test: Test labels encoded with the fitted encoder.

    Returns:
        tuple: ``(y_train_ohe, y_test_ohe)``.
    """
    from sklearn.preprocessing import LabelBinarizer

    binarizer = LabelBinarizer()
    binarizer.fit(y_train)
    encoded_train = binarizer.transform(y_train)
    encoded_test = binarizer.transform(y_test)
    return encoded_train, encoded_test

def sigmoid(z):
    """Element-wise logistic function ``1 / (1 + exp(-z))``, overflow-free.

    The textbook form evaluates ``exp(-z)``, which overflows to ``inf`` (and
    emits a RuntimeWarning) once ``z`` is a large negative number.  Here the
    value is computed as ``exp(-log(1 + exp(-z)))`` with ``np.logaddexp``
    supplying ``log(exp(0) + exp(-z))`` without ever forming the huge
    intermediate, so the result stays finite for any finite input.

    Args:
        z: Scalar or NumPy array of pre-activations.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``z``.
    """
    return np.exp(-np.logaddexp(0.0, -z))

def softmax(z):
    """Row-wise softmax of a 2-D score matrix.

    The maximum of each row is subtracted before exponentiating, which keeps
    the exponentials bounded by 1 without changing the result.

    Args:
        z: 2-D array of scores, one row per sample.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    numerators = np.exp(z - row_max)
    denominators = np.sum(numerators, axis=1, keepdims=True)
    return numerators / denominators

def accuracy(ypred, yexact):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays and flattened before they are
    compared.  This avoids two silent failure modes of a bare ``==``:
    plain Python lists compare as a single bool, and a ``(n,)`` vector
    compared with a ``(n, 1)`` column broadcasts to an ``n x n`` matrix.

    Args:
        ypred: Predicted labels (array-like, any shape).
        yexact: Ground-truth labels (array-like) with the same number of
            elements as ``ypred``.

    Returns:
        Fraction of matching positions in ``[0, 1]``; ``nan`` when both
        inputs are empty.

    Raises:
        ValueError: If the two inputs do not hold the same number of labels.
    """
    predicted = np.asarray(ypred).ravel()
    expected = np.asarray(yexact).ravel()
    if predicted.size != expected.size:
        raise ValueError(
            'ypred and yexact must contain the same number of labels '
            '(got %d and %d)' % (predicted.size, expected.size)
        )
    if expected.size == 0:
        # Nothing to score; make the undefined ratio explicit.
        return float('nan')
    return np.sum(predicted == expected) / float(expected.size)

In [3]:
class NN:
    """Fully connected classifier: two sigmoid hidden layers and a softmax output.

    The training batch ``X`` / ``y`` is stored on the instance, and
    ``feed_forward`` / ``back_propagation`` always operate on that stored
    batch (full-batch gradient descent).

    Note: gradients and the loss are summed over the samples, not averaged,
    so the size of each update grows with the number of training rows.
    """

    def __init__(self, X, y, layer1=100, layer2=100, lr=0.01, seed=None):
        """Store the training data and initialise weights and biases.

        Args:
            X: Training features, 2-D; ``X.shape[1]`` is the input width.
            y: Training targets, 2-D; ``y.shape[1]`` is the output width.
            layer1: Number of units in the first hidden layer.
            layer2: Number of units in the second hidden layer.
            lr: Step size used by ``back_propagation``.
            seed: Optional seed for the weight initialisation.  When ``None``
                (default) NumPy's global random state is used.
        """
        self.X = X
        self.y = y
        self.layer1 = layer1
        self.layer2 = layer2
        self.lr = lr
        self.nn = X.shape[1]
        self.output = y.shape[1]

        # A private generator is used only when a seed is supplied; otherwise
        # the draws come from the global state, in the order W1, W2, W3.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Each weight matrix is scaled by 1/sqrt(fan_in); biases start at zero.
        self.W1 = rng.randn(self.nn, self.layer1) / np.sqrt(self.nn)
        self.b1 = np.zeros((1, self.layer1))
        self.W2 = rng.randn(self.layer1, self.layer2) / np.sqrt(self.layer1)
        self.b2 = np.zeros((1, self.layer2))
        self.W3 = rng.randn(self.layer2, self.output) / np.sqrt(self.layer2)
        self.b3 = np.zeros((1, self.output))

    def _forward(self, X):
        """Run the network on ``X`` and return ``(z1, f1, z2, f2, z3, y_hat)``."""
        z1 = np.dot(X, self.W1) + self.b1
        f1 = sigmoid(z1)
        z2 = np.dot(f1, self.W2) + self.b2
        f2 = sigmoid(z2)
        z3 = np.dot(f2, self.W3) + self.b3
        return z1, f1, z2, f2, z3, softmax(z3)

    def feed_forward(self):
        """Forward pass on the stored training batch.

        Caches ``z1, f1, z2, f2, z3`` and ``y_hat`` on the instance for use
        by ``back_propagation`` and ``cross_entropy_loss``.
        """
        (self.z1, self.f1,
         self.z2, self.f2,
         self.z3, self.y_hat) = self._forward(self.X)

    def back_propagation(self):
        """Apply one gradient-descent update using the cached activations.

        ``feed_forward`` must have been called first.  All gradients are
        computed from the current weights before any weight is modified.
        """
        # Output-layer error for the softmax / cross-entropy pairing.
        d3 = self.y_hat - self.y
        dW3 = np.dot(self.f2.T, d3)
        db3 = np.sum(d3, axis=0, keepdims=True)

        # f * (1 - f) is the derivative of the sigmoid activation.
        d2 = np.dot(d3, self.W3.T) * self.f2 * (1 - self.f2)
        dW2 = np.dot(self.f1.T, d2)
        db2 = np.sum(d2, axis=0, keepdims=True)

        d1 = np.dot(d2, self.W2.T) * self.f1 * (1 - self.f1)
        dW1 = np.dot((self.X).T, d1)
        db1 = np.sum(d1, axis=0, keepdims=True)

        self.W1 = self.W1 - self.lr * dW1
        self.b1 = self.b1 - self.lr * db1
        self.W2 = self.W2 - self.lr * dW2
        self.b2 = self.b2 - self.lr * db2
        self.W3 = self.W3 - self.lr * dW3
        self.b3 = self.b3 - self.lr * db3

    def cross_entropy_loss(self):
        """Recompute the forward pass and the summed cross-entropy loss.

        The loss is stored in ``self.loss`` and also returned.  A small
        constant (1e-6) is added inside the log to avoid ``log(0)``.

        Returns:
            The cross-entropy summed over all samples and classes.
        """
        self.feed_forward()
        self.loss = -np.sum(self.y * np.log(self.y_hat + 1e-6))
        return self.loss

    def predict(self, X_test):
        """Predict the class index of every row of ``X_test``.

        Args:
            X_test: Feature matrix with the same number of columns as the
                training data.

        Returns:
            1-D integer array holding, for each row, the index of the output
            column with the highest softmax score.  Works for any number of
            output classes.
        """
        y_hat_test = self._forward(X_test)[-1]
        return np.argmax(y_hat_test, axis=1).astype(int)

In [5]:
# Load the MNIST splits and preprocess them: standardize the features and
# binarize the labels, both fitted on the training split only.
X_train, y_train = read_dataset('MNIST_X_train.csv', 'MNIST_y_train.csv')
X_test, y_test = read_dataset('MNIST_X_test.csv', 'MNIST_y_test.csv')
X_train_norm, X_test_norm = normalize_features(X_train, X_test)
y_train_ohe, y_test_ohe = one_hot_encoder(y_train, y_test)

# Seed NumPy's global generator so the random weight initialisation (and
# therefore the whole training run) is reproducible on Restart & Run All.
SEED = 0
np.random.seed(SEED)
myNN = NN(X_train_norm, y_train_ohe, 100, 100, lr=0.01)

epoch_num = 1000
log_every = 20  # print the training loss once every `log_every` epochs
for i in range(epoch_num):
    myNN.feed_forward()
    myNN.back_propagation()
    if ((i+1) % log_every == 0):
        # cross_entropy_loss() runs its own forward pass, so it is only
        # called when the value is actually reported.
        myNN.cross_entropy_loss()
        print('epoch = %d, current loss = %.5f' % (i+1, myNN.loss))

# Evaluate on the held-out split; ravel() turns the (n, 1) label column
# into the 1-D shape that predict() returns.
y_pred = myNN.predict(X_test_norm)
print('Accuracy of our model ', accuracy(y_pred, y_test.ravel()))



epoch = 20, current loss = 6061.61994
epoch = 40, current loss = 4549.82615
epoch = 60, current loss = 3775.27018
epoch = 80, current loss = 8046.30391
epoch = 100, current loss = 4591.64461
epoch = 120, current loss = 2671.75361
epoch = 140, current loss = 2632.10837
epoch = 160, current loss = 2525.22104
epoch = 180, current loss = 2632.67727
epoch = 200, current loss = 2375.13687
epoch = 220, current loss = 2510.53374
epoch = 240, current loss = 2447.49676
epoch = 260, current loss = 2384.02595
epoch = 280, current loss = 2393.59023
epoch = 300, current loss = 2093.39354
epoch = 320, current loss = 1484.43318
epoch = 340, current loss = 1334.79918
epoch = 360, current loss = 1280.36206
epoch = 380, current loss = 867.40875
epoch = 400, current loss = 328.95605
epoch = 420, current loss = 258.20601
epoch = 440, current loss = 223.79846
epoch = 460, current loss = 208.91001
epoch = 480, current loss = 200.89485
epoch = 500, current loss = 194.97090
epoch = 520, current loss = 185.3889