In [1]:
# Data
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import impl.layer as l

# Dataset preparation and pre-processing
mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)

X_train, y_train = mnist.train.images, mnist.train.labels
X_val, y_val = mnist.validation.images, mnist.validation.labels
X_test, y_test = mnist.test.images, mnist.test.labels
# y_test.shape, y_val.shape, y_train.shape
# X_train.shape, X_train.dtype, X_val.shape, X_val.dtype, X_test.shape, X_test.dtype

Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz


In [2]:
# Pre-processing: normalizing
def normalize(X):
    # max scale for images 255= 2**8= 8 bit grayscale for each channel
    return (X - X.mean(axis=0)) #/ X.std(axis=0)

X_train, X_val, X_test = normalize(X=X_train), normalize(X=X_val), normalize(X=X_test)

In [None]:
# Model
import impl.layer as l # or from impl.layer import *
from impl.loss import * # import all functions from impl.loss file # import impl.loss as loss_func
from sklearn.utils import shuffle as skshuffle

class FFNN:

    def __init__(self, D, C, H, L):
        self.L = L # number of layers or depth
        self.losses = {'train':[], 'valid':[], 'valid_acc':[]}
        
        # The adaptive/learnable/updatable random feedforward
        self.model = []
        self.W_fixed = []
        self.grads = []
        self.ys_prev = []
        low, high = -1, 1
        
        # Input layer: weights/ biases
        m = dict(W=np.random.uniform(size=(D, H), low=low, high=high) / np.sqrt(D / 2.), b=np.zeros((1, H)))
        self.model.append(m)
        # Fixed feedback weight
        m = np.random.uniform(size=(D, H), low=low, high=high) / np.sqrt(D / 2.)
        self.W_fixed.append(m)
        # Input layer: gradients
        self.grads.append({key: np.zeros_like(val) for key, val in self.model[0].items()})
        # Previous output layer
        self.ys_prev.append(0.0)

        # Hidden layers: weights/ biases
        m_L = []
        for _ in range(L):
            m = dict(W=np.random.uniform(size=(H, H), low=low, high=high) / np.sqrt(H / 2.), b=np.zeros((1, H)))
            m_L.append(m)
        self.model.append(m_L)
        # Fixed feedback weight
        m_L = []
        for _ in range(L):
            m = np.random.uniform(size=(H, H), low=low, high=high) / np.sqrt(H / 2.)
            m_L.append(m)
        self.W_fixed.append(m_L)
        # Hidden layer: gradients
        grad_L = []
        for _ in range(L):
            grad_L.append({key: np.zeros_like(val) for key, val in self.model[1][0].items()})
        self.grads.append(grad_L)
        # Previous output layer
        ys_prev_L = []
        for _ in range(L):
            ys_prev_L.append(0.0)
        self.ys_prev.append(ys_prev_L)
        
        # Output layer: weights/ biases
        m = dict(W=np.random.uniform(size=(H, C), low=low, high=high) / np.sqrt(H / 2.), b=np.zeros((1, C)))
        self.model.append(m)
        # Fixed feedback weight
        m = np.random.uniform(size=(H, C), low=low, high=high) / np.sqrt(H / 2.)
        self.W_fixed.append(m)
        # Outout layer: gradients
        self.grads.append({key: np.zeros_like(val) for key, val in self.model[2].items()})
        # Previous output layer
        self.ys_prev.append(0.0)
        
    def fc_forward(self, X, W, b):
        out = (X @ W) + b
        cache = (W, X)
        return out, cache

    def fc_backward(self, dout, cache, W_fixed):
        W, X = cache

        dW = X.T @ dout
        db = np.sum(dout, axis=0).reshape(1, -1) # db_1xn
        
        dX = dout @ W.T # Backprop
#         dX = dout @ W_fixed.T # fb alignment

        return dX, dW, db

    def train_forward(self, X, train):
        caches, ys = [], []
        
        # Input layer
        y, fc_cache = self.fc_forward(X=X, W=self.model[0]['W'], b=self.model[0]['b']) # X_1xD, y_1xc
        y, nl_cache = l.tanh_forward(X=y)
        if train:
            caches.append((fc_cache, nl_cache))
        ys.append(y) # ys[0]
        X = y.copy() # pass to the next layer
        
        # Hidden layers
        fc_caches, nl_caches, ys_L = [], [], []
        for layer in range(self.L):
            y, fc_cache = self.fc_forward(X=X, W=self.model[1][layer]['W'], b=self.model[1][layer]['b'])
            y, nl_cache = l.tanh_forward(X=y)
            ys_L.append(y) # ys[1][layer]
            X = y.copy() # pass to next layer
            if train:
                fc_caches.append(fc_cache)
                nl_caches.append(nl_cache)
        if train:
            caches.append((fc_caches, nl_caches)) # caches[1]            
        ys.append(ys_L) # ys[1]            
        
        # Output layer
        y, fc_cache = self.fc_forward(X=X, W=self.model[2]['W'], b=self.model[2]['b'])
        if train:
            caches.append(fc_cache)
        ys.append(y) # ys[2]

        return ys, caches # for backpropating the error

    def loss_function(self, y, y_train):
        
        loss = cross_entropy(y, y_train) # softmax is included
        dy = dcross_entropy(y, y_train) # dsoftmax is included
        
        return loss, dy
        
    def train_backward(self, dy, caches, ys):
        grads, ys_prev = self.grads, self.ys_prev # initialized by Zero in every iteration/epoch
        
        # Output layer
        fc_cache = caches[2]
        dX, dW, db = self.fc_backward(dout=dy, cache=fc_cache, W_fixed=self.W_fixed[2])
        dy = dX.copy()
        grads[2]['W'] = dW
        grads[2]['b'] = db

        # Hidden layer
        fc_caches, nl_caches = caches[1]
        for layer in reversed(range(self.L)):
#             dy *= ys[1][layer] - ys_prev[1][layer] # temporal diff instead of differentiable function
            dy = l.tanh_backward(cache=nl_caches[layer], dout=dy) # diffable function
            dX, dW, db = self.fc_backward(dout=dy, cache=fc_caches[layer], W_fixed=self.W_fixed[1][layer])
            dy = dX.copy()
            grads[1][layer]['W'] = dW
            grads[1][layer]['b'] = db
        
        # Input layer
        fc_cache, nl_cache = caches[0]
#         dy *= ys[0] - ys_prev[0] # temporal diff instead of differentiable function
        dy = l.tanh_backward(cache=nl_cache, dout=dy) # diffable function
        dX, dW, db = self.fc_backward(dout=dy, cache=fc_cache, W_fixed=self.W_fixed[0])
        grads[0]['W'] = dW
        grads[0]['b'] = db

        return dX, grads
    
    def test(self, X):
        ys_logit, _ = self.train_forward(X, train=False)
        y_logit = ys_logit[2] # last layer
        
        # if self.mode == 'classification':
        y_prob = l.softmax(y_logit) # for accuracy == acc
        y_pred = np.argmax(y_prob, axis=1) # for loss ==err
        
        return y_pred, y_logit
        
    def get_minibatch(self, X, y, minibatch_size, shuffle):
        minibatches = []

        if shuffle:
            X, y = skshuffle(X, y)

        for i in range(0, X.shape[0], minibatch_size):
            X_mini = X[i:i + minibatch_size]
            y_mini = y[i:i + minibatch_size]
            minibatches.append((X_mini, y_mini))

        return minibatches

    def sgd(self, train_set, val_set, alpha, mb_size, n_iter, print_after):
        X_train, y_train = train_set
        X_val, y_val = val_set

        # Epochs
        for iter in range(1, n_iter + 1):

            # Minibatches
            minibatches = self.get_minibatch(X_train, y_train, mb_size, shuffle=True)
            idx = np.random.randint(0, len(minibatches))
            X_mini, y_mini = minibatches[idx]
            
            # Train the model
            ys, caches = self.train_forward(X_mini, train=True)
            loss, dy = self.loss_function(ys[2], y_mini)
            _, grads = self.train_backward(dy, caches, ys) # ys[0], ys[1] and ys_prev are used for backprop
            self.ys_prev = ys # for next iteration or epoch learning dW and db
            self.losses['train'].append(loss)
            
            # Update the model for input layer
            for key in grads[0].keys():
                self.model[0][key] -= alpha * grads[0][key]

            # Update the model for the hidden layers
            for layer in range(self.L):
                for key in grads[1][layer].keys():
                    self.model[1][layer][key] -= alpha * grads[1][layer][key]

            # Update the model for output layer
            for key in grads[2].keys():
                self.model[2][key] -= alpha * grads[2][key]
                
            # Validate the updated model
            y_pred, y_logit = self.test(X_val)
            valid_loss, _ = self.loss_function(y_logit, y_val) # softmax is included in entropy loss function
            self.losses['valid'].append(valid_loss)
            valid_acc = np.mean(y_pred == y_val) # confusion matrix
            self.losses['valid_acc'].append(valid_acc)
            
            # Print the model info: loss & accuracy or err & acc
            if iter % print_after == 0:
                print('Iter-{} train loss: {:.4f} valid loss: {:.4f}, valid accuracy: {:.4f}'.format(
                    iter, loss, valid_loss, valid_acc))

        # Test the final model
        y_pred, y_logit = nn.test(X_test)
        loss, _ = self.loss_function(y_logit, y_test) # softmax is included in entropy loss function
        acc = np.mean(y_pred == y_test)
        print('Last iteration - Test accuracy mean: {:.4f}, std: {:.4f}, loss: {:.4f}'.format(
            acc.mean(), acc.std(), loss))

In [None]:
# Hyper-parameters
n_iter = 1000000 # number of epochs
alpha = 1e-3 # learning_rate
mb_size = 50 # 2**10==1024 # width, timestep for sequential data or minibatch size
print_after = 100 # n_iter//10 # print loss for train, valid, and test
num_hidden_units = 32 # number of kernels/ filters in each layer
num_input_units = X_train.shape[1] # noise added at the input lavel as input noise we can use dX or for more improvement
num_output_units = y_train.max() + 1 # number of classes in this classification problem
num_layers = 2 # depth 

# Build the model/NN and learn it: running session.
nn = FFNN(C=num_output_units, D=num_input_units, H=num_hidden_units, L=num_layers)

nn.sgd(train_set=(X_train, y_train), val_set=(X_val, y_val), mb_size=mb_size, alpha=alpha, 
           n_iter=n_iter, print_after=print_after)

Iter-100 train loss: 2.2804 valid loss: 2.2786, valid accuracy: 0.1362
Iter-200 train loss: 2.2655 valid loss: 2.2519, valid accuracy: 0.2030
Iter-300 train loss: 2.2429 valid loss: 2.2265, valid accuracy: 0.2796
Iter-400 train loss: 2.2063 valid loss: 2.2012, valid accuracy: 0.3504
Iter-500 train loss: 2.1563 valid loss: 2.1764, valid accuracy: 0.4120
Iter-600 train loss: 2.1500 valid loss: 2.1511, valid accuracy: 0.4618
Iter-700 train loss: 2.1393 valid loss: 2.1257, valid accuracy: 0.4950
Iter-800 train loss: 2.0970 valid loss: 2.1006, valid accuracy: 0.5164
Iter-900 train loss: 2.1108 valid loss: 2.0751, valid accuracy: 0.5380
Iter-1000 train loss: 2.0310 valid loss: 2.0497, valid accuracy: 0.5554
Iter-1100 train loss: 1.9877 valid loss: 2.0241, valid accuracy: 0.5708
Iter-1200 train loss: 2.0185 valid loss: 1.9984, valid accuracy: 0.5800
Iter-1300 train loss: 1.9692 valid loss: 1.9716, valid accuracy: 0.5858
Iter-1400 train loss: 1.9401 valid loss: 1.9451, valid accuracy: 0.5938
I

Iter-11500 train loss: 0.6516 valid loss: 0.6166, valid accuracy: 0.8666
Iter-11600 train loss: 0.4534 valid loss: 0.6127, valid accuracy: 0.8680
Iter-11700 train loss: 0.5392 valid loss: 0.6086, valid accuracy: 0.8688
Iter-11800 train loss: 0.5684 valid loss: 0.6048, valid accuracy: 0.8688
Iter-11900 train loss: 0.8121 valid loss: 0.6010, valid accuracy: 0.8696
Iter-12000 train loss: 0.4991 valid loss: 0.5974, valid accuracy: 0.8704
Iter-12100 train loss: 0.7544 valid loss: 0.5937, valid accuracy: 0.8716
Iter-12200 train loss: 0.4547 valid loss: 0.5900, valid accuracy: 0.8722
Iter-12300 train loss: 0.6180 valid loss: 0.5865, valid accuracy: 0.8728
Iter-12400 train loss: 0.6942 valid loss: 0.5829, valid accuracy: 0.8734
Iter-12500 train loss: 0.6241 valid loss: 0.5795, valid accuracy: 0.8734
Iter-12600 train loss: 0.5048 valid loss: 0.5763, valid accuracy: 0.8738
Iter-12700 train loss: 0.5702 valid loss: 0.5731, valid accuracy: 0.8750
Iter-12800 train loss: 0.6497 valid loss: 0.5700, v

Iter-22800 train loss: 0.3992 valid loss: 0.3953, valid accuracy: 0.9024
Iter-22900 train loss: 0.4634 valid loss: 0.3944, valid accuracy: 0.9024
Iter-23000 train loss: 0.2536 valid loss: 0.3933, valid accuracy: 0.9028
Iter-23100 train loss: 0.5096 valid loss: 0.3924, valid accuracy: 0.9030
Iter-23200 train loss: 0.3212 valid loss: 0.3914, valid accuracy: 0.9034
Iter-23300 train loss: 0.4637 valid loss: 0.3905, valid accuracy: 0.9034
Iter-23400 train loss: 0.4691 valid loss: 0.3896, valid accuracy: 0.9036
Iter-23500 train loss: 0.3599 valid loss: 0.3887, valid accuracy: 0.9038
Iter-23600 train loss: 0.4547 valid loss: 0.3880, valid accuracy: 0.9040
Iter-23700 train loss: 0.4513 valid loss: 0.3871, valid accuracy: 0.9040
Iter-23800 train loss: 0.4671 valid loss: 0.3862, valid accuracy: 0.9042
Iter-23900 train loss: 0.6746 valid loss: 0.3853, valid accuracy: 0.9042
Iter-24000 train loss: 0.4748 valid loss: 0.3845, valid accuracy: 0.9042
Iter-24100 train loss: 0.2986 valid loss: 0.3836, v

Iter-34100 train loss: 0.3561 valid loss: 0.3258, valid accuracy: 0.9150
Iter-34200 train loss: 0.4760 valid loss: 0.3254, valid accuracy: 0.9144
Iter-34300 train loss: 0.3188 valid loss: 0.3251, valid accuracy: 0.9150
Iter-34400 train loss: 0.3531 valid loss: 0.3246, valid accuracy: 0.9150
Iter-34500 train loss: 0.2715 valid loss: 0.3243, valid accuracy: 0.9150
Iter-34600 train loss: 0.2516 valid loss: 0.3238, valid accuracy: 0.9144
Iter-34700 train loss: 0.4583 valid loss: 0.3232, valid accuracy: 0.9150
Iter-34800 train loss: 0.4675 valid loss: 0.3228, valid accuracy: 0.9150
Iter-34900 train loss: 0.4368 valid loss: 0.3223, valid accuracy: 0.9150
Iter-35000 train loss: 0.1867 valid loss: 0.3219, valid accuracy: 0.9150
Iter-35100 train loss: 0.6904 valid loss: 0.3213, valid accuracy: 0.9152
Iter-35200 train loss: 0.2592 valid loss: 0.3210, valid accuracy: 0.9150
Iter-35300 train loss: 0.2343 valid loss: 0.3206, valid accuracy: 0.9148
Iter-35400 train loss: 0.2170 valid loss: 0.3202, v

Iter-45400 train loss: 0.2964 valid loss: 0.2900, valid accuracy: 0.9198
Iter-45500 train loss: 0.4177 valid loss: 0.2897, valid accuracy: 0.9202
Iter-45600 train loss: 0.3628 valid loss: 0.2893, valid accuracy: 0.9206
Iter-45700 train loss: 0.2704 valid loss: 0.2890, valid accuracy: 0.9210
Iter-45800 train loss: 0.1411 valid loss: 0.2886, valid accuracy: 0.9208
Iter-45900 train loss: 0.2418 valid loss: 0.2884, valid accuracy: 0.9206
Iter-46000 train loss: 0.2304 valid loss: 0.2882, valid accuracy: 0.9206
Iter-46100 train loss: 0.2615 valid loss: 0.2879, valid accuracy: 0.9212
Iter-46200 train loss: 0.2283 valid loss: 0.2877, valid accuracy: 0.9210
Iter-46300 train loss: 0.2478 valid loss: 0.2874, valid accuracy: 0.9208
Iter-46400 train loss: 0.1720 valid loss: 0.2872, valid accuracy: 0.9206
Iter-46500 train loss: 0.3317 valid loss: 0.2870, valid accuracy: 0.9212
Iter-46600 train loss: 0.2390 valid loss: 0.2868, valid accuracy: 0.9212
Iter-46700 train loss: 0.2671 valid loss: 0.2866, v

Iter-56700 train loss: 0.1984 valid loss: 0.2673, valid accuracy: 0.9236
Iter-56800 train loss: 0.4255 valid loss: 0.2670, valid accuracy: 0.9234
Iter-56900 train loss: 0.3010 valid loss: 0.2669, valid accuracy: 0.9236
Iter-57000 train loss: 0.3776 valid loss: 0.2665, valid accuracy: 0.9240
Iter-57100 train loss: 0.1984 valid loss: 0.2664, valid accuracy: 0.9240
Iter-57200 train loss: 0.1281 valid loss: 0.2661, valid accuracy: 0.9238
Iter-57300 train loss: 0.3770 valid loss: 0.2658, valid accuracy: 0.9238
Iter-57400 train loss: 0.4852 valid loss: 0.2655, valid accuracy: 0.9240
Iter-57500 train loss: 0.2615 valid loss: 0.2655, valid accuracy: 0.9240
Iter-57600 train loss: 0.4167 valid loss: 0.2653, valid accuracy: 0.9242
Iter-57700 train loss: 0.2713 valid loss: 0.2651, valid accuracy: 0.9242
Iter-57800 train loss: 0.1724 valid loss: 0.2647, valid accuracy: 0.9250
Iter-57900 train loss: 0.1785 valid loss: 0.2646, valid accuracy: 0.9250
Iter-58000 train loss: 0.1778 valid loss: 0.2645, v

Iter-68000 train loss: 0.1916 valid loss: 0.2500, valid accuracy: 0.9286
Iter-68100 train loss: 0.1848 valid loss: 0.2499, valid accuracy: 0.9288
Iter-68200 train loss: 0.3504 valid loss: 0.2497, valid accuracy: 0.9286
Iter-68300 train loss: 0.2249 valid loss: 0.2496, valid accuracy: 0.9284
Iter-68400 train loss: 0.3986 valid loss: 0.2495, valid accuracy: 0.9290
Iter-68500 train loss: 0.2056 valid loss: 0.2493, valid accuracy: 0.9286
Iter-68600 train loss: 0.1622 valid loss: 0.2493, valid accuracy: 0.9290
Iter-68700 train loss: 0.2824 valid loss: 0.2492, valid accuracy: 0.9290
Iter-68800 train loss: 0.2083 valid loss: 0.2491, valid accuracy: 0.9298
Iter-68900 train loss: 0.4529 valid loss: 0.2489, valid accuracy: 0.9294
Iter-69000 train loss: 0.1549 valid loss: 0.2487, valid accuracy: 0.9298
Iter-69100 train loss: 0.1501 valid loss: 0.2485, valid accuracy: 0.9298
Iter-69200 train loss: 0.2148 valid loss: 0.2484, valid accuracy: 0.9302
Iter-69300 train loss: 0.5649 valid loss: 0.2482, v

Iter-79300 train loss: 0.1372 valid loss: 0.2354, valid accuracy: 0.9340
Iter-79400 train loss: 0.3588 valid loss: 0.2352, valid accuracy: 0.9344
Iter-79500 train loss: 0.3088 valid loss: 0.2352, valid accuracy: 0.9338
Iter-79600 train loss: 0.2301 valid loss: 0.2350, valid accuracy: 0.9340
Iter-79700 train loss: 0.2164 valid loss: 0.2349, valid accuracy: 0.9346
Iter-79800 train loss: 0.1472 valid loss: 0.2347, valid accuracy: 0.9340
Iter-79900 train loss: 0.1934 valid loss: 0.2344, valid accuracy: 0.9338
Iter-80000 train loss: 0.1553 valid loss: 0.2343, valid accuracy: 0.9342
Iter-80100 train loss: 0.2015 valid loss: 0.2340, valid accuracy: 0.9338
Iter-80200 train loss: 0.3453 valid loss: 0.2339, valid accuracy: 0.9332
Iter-80300 train loss: 0.1315 valid loss: 0.2338, valid accuracy: 0.9330
Iter-80400 train loss: 0.1477 valid loss: 0.2337, valid accuracy: 0.9332
Iter-80500 train loss: 0.3344 valid loss: 0.2336, valid accuracy: 0.9338
Iter-80600 train loss: 0.0955 valid loss: 0.2336, v

Iter-90600 train loss: 0.2218 valid loss: 0.2227, valid accuracy: 0.9372
Iter-90700 train loss: 0.2285 valid loss: 0.2223, valid accuracy: 0.9372
Iter-90800 train loss: 0.1710 valid loss: 0.2221, valid accuracy: 0.9374
Iter-90900 train loss: 0.4328 valid loss: 0.2219, valid accuracy: 0.9376
Iter-91000 train loss: 0.1658 valid loss: 0.2219, valid accuracy: 0.9368
Iter-91100 train loss: 0.2803 valid loss: 0.2217, valid accuracy: 0.9378
Iter-91200 train loss: 0.4071 valid loss: 0.2217, valid accuracy: 0.9376
Iter-91300 train loss: 0.1912 valid loss: 0.2216, valid accuracy: 0.9376
Iter-91400 train loss: 0.2019 valid loss: 0.2217, valid accuracy: 0.9376
Iter-91500 train loss: 0.2482 valid loss: 0.2216, valid accuracy: 0.9374
Iter-91600 train loss: 0.3542 valid loss: 0.2215, valid accuracy: 0.9376
Iter-91700 train loss: 0.2624 valid loss: 0.2214, valid accuracy: 0.9376
Iter-91800 train loss: 0.2690 valid loss: 0.2211, valid accuracy: 0.9372
Iter-91900 train loss: 0.2570 valid loss: 0.2209, v

Iter-101800 train loss: 0.1938 valid loss: 0.2117, valid accuracy: 0.9396
Iter-101900 train loss: 0.2386 valid loss: 0.2115, valid accuracy: 0.9398
Iter-102000 train loss: 0.1832 valid loss: 0.2113, valid accuracy: 0.9396
Iter-102100 train loss: 0.0824 valid loss: 0.2111, valid accuracy: 0.9396
Iter-102200 train loss: 0.1542 valid loss: 0.2111, valid accuracy: 0.9402
Iter-102300 train loss: 0.1707 valid loss: 0.2110, valid accuracy: 0.9392
Iter-102400 train loss: 0.1801 valid loss: 0.2108, valid accuracy: 0.9392
Iter-102500 train loss: 0.1169 valid loss: 0.2108, valid accuracy: 0.9392
Iter-102600 train loss: 0.1142 valid loss: 0.2108, valid accuracy: 0.9390
Iter-102700 train loss: 0.1601 valid loss: 0.2106, valid accuracy: 0.9396
Iter-102800 train loss: 0.3211 valid loss: 0.2105, valid accuracy: 0.9398
Iter-102900 train loss: 0.3389 valid loss: 0.2104, valid accuracy: 0.9400
Iter-103000 train loss: 0.2731 valid loss: 0.2103, valid accuracy: 0.9402
Iter-103100 train loss: 0.0849 valid l

Iter-112900 train loss: 0.4564 valid loss: 0.2012, valid accuracy: 0.9426
Iter-113000 train loss: 0.4967 valid loss: 0.2010, valid accuracy: 0.9428
Iter-113100 train loss: 0.1377 valid loss: 0.2009, valid accuracy: 0.9440
Iter-113200 train loss: 0.3055 valid loss: 0.2009, valid accuracy: 0.9440
Iter-113300 train loss: 0.1605 valid loss: 0.2009, valid accuracy: 0.9436
Iter-113400 train loss: 0.1673 valid loss: 0.2008, valid accuracy: 0.9436
Iter-113500 train loss: 0.2564 valid loss: 0.2006, valid accuracy: 0.9434
Iter-113600 train loss: 0.2451 valid loss: 0.2005, valid accuracy: 0.9432
Iter-113700 train loss: 0.1611 valid loss: 0.2003, valid accuracy: 0.9432
Iter-113800 train loss: 0.1450 valid loss: 0.2003, valid accuracy: 0.9430
Iter-113900 train loss: 0.0742 valid loss: 0.2002, valid accuracy: 0.9428
Iter-114000 train loss: 0.1258 valid loss: 0.2001, valid accuracy: 0.9430
Iter-114100 train loss: 0.3354 valid loss: 0.2000, valid accuracy: 0.9428
Iter-114200 train loss: 0.1506 valid l

Iter-124000 train loss: 0.1318 valid loss: 0.1922, valid accuracy: 0.9450
Iter-124100 train loss: 0.2299 valid loss: 0.1922, valid accuracy: 0.9450
Iter-124200 train loss: 0.1174 valid loss: 0.1920, valid accuracy: 0.9450
Iter-124300 train loss: 0.1324 valid loss: 0.1917, valid accuracy: 0.9448
Iter-124400 train loss: 0.1726 valid loss: 0.1915, valid accuracy: 0.9450
Iter-124500 train loss: 0.1570 valid loss: 0.1917, valid accuracy: 0.9446
Iter-124600 train loss: 0.2693 valid loss: 0.1917, valid accuracy: 0.9446
Iter-124700 train loss: 0.0881 valid loss: 0.1917, valid accuracy: 0.9454
Iter-124800 train loss: 0.1445 valid loss: 0.1916, valid accuracy: 0.9456
Iter-124900 train loss: 0.1657 valid loss: 0.1915, valid accuracy: 0.9458
Iter-125000 train loss: 0.2291 valid loss: 0.1915, valid accuracy: 0.9456
Iter-125100 train loss: 0.2243 valid loss: 0.1913, valid accuracy: 0.9458
Iter-125200 train loss: 0.2478 valid loss: 0.1910, valid accuracy: 0.9458
Iter-125300 train loss: 0.0778 valid l

Iter-135100 train loss: 0.2098 valid loss: 0.1838, valid accuracy: 0.9470
Iter-135200 train loss: 0.0919 valid loss: 0.1835, valid accuracy: 0.9468
Iter-135300 train loss: 0.1891 valid loss: 0.1833, valid accuracy: 0.9470
Iter-135400 train loss: 0.2701 valid loss: 0.1832, valid accuracy: 0.9468
Iter-135500 train loss: 0.3155 valid loss: 0.1831, valid accuracy: 0.9472
Iter-135600 train loss: 0.0549 valid loss: 0.1830, valid accuracy: 0.9472
Iter-135700 train loss: 0.1721 valid loss: 0.1828, valid accuracy: 0.9476
Iter-135800 train loss: 0.2451 valid loss: 0.1827, valid accuracy: 0.9476
Iter-135900 train loss: 0.2764 valid loss: 0.1825, valid accuracy: 0.9474
Iter-136000 train loss: 0.1421 valid loss: 0.1825, valid accuracy: 0.9474
Iter-136100 train loss: 0.2110 valid loss: 0.1826, valid accuracy: 0.9470
Iter-136200 train loss: 0.1155 valid loss: 0.1826, valid accuracy: 0.9474
Iter-136300 train loss: 0.1140 valid loss: 0.1826, valid accuracy: 0.9478
Iter-136400 train loss: 0.1249 valid l

Iter-146200 train loss: 0.3554 valid loss: 0.1755, valid accuracy: 0.9490
Iter-146300 train loss: 0.2418 valid loss: 0.1756, valid accuracy: 0.9494
Iter-146400 train loss: 0.0669 valid loss: 0.1754, valid accuracy: 0.9492
Iter-146500 train loss: 0.4465 valid loss: 0.1752, valid accuracy: 0.9492
Iter-146600 train loss: 0.2368 valid loss: 0.1754, valid accuracy: 0.9494
Iter-146700 train loss: 0.2596 valid loss: 0.1754, valid accuracy: 0.9496
Iter-146800 train loss: 0.1094 valid loss: 0.1753, valid accuracy: 0.9496
Iter-146900 train loss: 0.3398 valid loss: 0.1753, valid accuracy: 0.9496
Iter-147000 train loss: 0.0995 valid loss: 0.1752, valid accuracy: 0.9496
Iter-147100 train loss: 0.1740 valid loss: 0.1751, valid accuracy: 0.9494
Iter-147200 train loss: 0.1453 valid loss: 0.1751, valid accuracy: 0.9496
Iter-147300 train loss: 0.2324 valid loss: 0.1748, valid accuracy: 0.9496
Iter-147400 train loss: 0.2156 valid loss: 0.1748, valid accuracy: 0.9500
Iter-147500 train loss: 0.1630 valid l

Iter-157300 train loss: 0.1324 valid loss: 0.1688, valid accuracy: 0.9512
Iter-157400 train loss: 0.4279 valid loss: 0.1686, valid accuracy: 0.9516
Iter-157500 train loss: 0.1747 valid loss: 0.1685, valid accuracy: 0.9518
Iter-157600 train loss: 0.1015 valid loss: 0.1684, valid accuracy: 0.9518
Iter-157700 train loss: 0.0802 valid loss: 0.1683, valid accuracy: 0.9516
Iter-157800 train loss: 0.0871 valid loss: 0.1680, valid accuracy: 0.9522
Iter-157900 train loss: 0.1630 valid loss: 0.1679, valid accuracy: 0.9516
Iter-158000 train loss: 0.1292 valid loss: 0.1679, valid accuracy: 0.9526
Iter-158100 train loss: 0.1004 valid loss: 0.1680, valid accuracy: 0.9518
Iter-158200 train loss: 0.1865 valid loss: 0.1679, valid accuracy: 0.9516
Iter-158300 train loss: 0.1554 valid loss: 0.1678, valid accuracy: 0.9518
Iter-158400 train loss: 0.2832 valid loss: 0.1677, valid accuracy: 0.9518
Iter-158500 train loss: 0.1290 valid loss: 0.1675, valid accuracy: 0.9516
Iter-158600 train loss: 0.2720 valid l

Iter-168400 train loss: 0.1993 valid loss: 0.1622, valid accuracy: 0.9532
Iter-168500 train loss: 0.0810 valid loss: 0.1621, valid accuracy: 0.9534
Iter-168600 train loss: 0.0652 valid loss: 0.1619, valid accuracy: 0.9534
Iter-168700 train loss: 0.1902 valid loss: 0.1619, valid accuracy: 0.9526
Iter-168800 train loss: 0.0872 valid loss: 0.1618, valid accuracy: 0.9528
Iter-168900 train loss: 0.1444 valid loss: 0.1618, valid accuracy: 0.9532
Iter-169000 train loss: 0.0451 valid loss: 0.1617, valid accuracy: 0.9532
Iter-169100 train loss: 0.1359 valid loss: 0.1617, valid accuracy: 0.9530
Iter-169200 train loss: 0.0818 valid loss: 0.1617, valid accuracy: 0.9532
Iter-169300 train loss: 0.1081 valid loss: 0.1617, valid accuracy: 0.9534
Iter-169400 train loss: 0.0793 valid loss: 0.1618, valid accuracy: 0.9536
Iter-169500 train loss: 0.0793 valid loss: 0.1617, valid accuracy: 0.9536
Iter-169600 train loss: 0.2264 valid loss: 0.1619, valid accuracy: 0.9534
Iter-169700 train loss: 0.1194 valid l

Iter-179500 train loss: 0.2002 valid loss: 0.1566, valid accuracy: 0.9544
Iter-179600 train loss: 0.1958 valid loss: 0.1566, valid accuracy: 0.9546
Iter-179700 train loss: 0.3421 valid loss: 0.1567, valid accuracy: 0.9546
Iter-179800 train loss: 0.3770 valid loss: 0.1567, valid accuracy: 0.9546
Iter-179900 train loss: 0.1662 valid loss: 0.1565, valid accuracy: 0.9544
Iter-180000 train loss: 0.1318 valid loss: 0.1564, valid accuracy: 0.9540
Iter-180100 train loss: 0.1138 valid loss: 0.1564, valid accuracy: 0.9544
Iter-180200 train loss: 0.2870 valid loss: 0.1565, valid accuracy: 0.9544
Iter-180300 train loss: 0.0521 valid loss: 0.1566, valid accuracy: 0.9546
Iter-180400 train loss: 0.0835 valid loss: 0.1569, valid accuracy: 0.9544
Iter-180500 train loss: 0.1989 valid loss: 0.1568, valid accuracy: 0.9548
Iter-180600 train loss: 0.1230 valid loss: 0.1566, valid accuracy: 0.9548
Iter-180700 train loss: 0.0752 valid loss: 0.1565, valid accuracy: 0.9546
Iter-180800 train loss: 0.0930 valid l

Iter-190600 train loss: 0.2149 valid loss: 0.1520, valid accuracy: 0.9560
Iter-190700 train loss: 0.0797 valid loss: 0.1520, valid accuracy: 0.9560
Iter-190800 train loss: 0.1158 valid loss: 0.1519, valid accuracy: 0.9558
Iter-190900 train loss: 0.2131 valid loss: 0.1519, valid accuracy: 0.9562
Iter-191000 train loss: 0.2392 valid loss: 0.1520, valid accuracy: 0.9560
Iter-191100 train loss: 0.1075 valid loss: 0.1518, valid accuracy: 0.9556
Iter-191200 train loss: 0.2618 valid loss: 0.1518, valid accuracy: 0.9560
Iter-191300 train loss: 0.1613 valid loss: 0.1517, valid accuracy: 0.9560
Iter-191400 train loss: 0.0416 valid loss: 0.1514, valid accuracy: 0.9566
Iter-191500 train loss: 0.1053 valid loss: 0.1514, valid accuracy: 0.9560
Iter-191600 train loss: 0.1890 valid loss: 0.1513, valid accuracy: 0.9570
Iter-191700 train loss: 0.1722 valid loss: 0.1510, valid accuracy: 0.9570
Iter-191800 train loss: 0.1563 valid loss: 0.1509, valid accuracy: 0.9570
Iter-191900 train loss: 0.0560 valid l

Iter-201700 train loss: 0.0834 valid loss: 0.1469, valid accuracy: 0.9580
Iter-201800 train loss: 0.1051 valid loss: 0.1469, valid accuracy: 0.9578
Iter-201900 train loss: 0.2135 valid loss: 0.1469, valid accuracy: 0.9576
Iter-202000 train loss: 0.0890 valid loss: 0.1467, valid accuracy: 0.9578
Iter-202100 train loss: 0.1140 valid loss: 0.1467, valid accuracy: 0.9576
Iter-202200 train loss: 0.2109 valid loss: 0.1466, valid accuracy: 0.9580
Iter-202300 train loss: 0.1851 valid loss: 0.1467, valid accuracy: 0.9578
Iter-202400 train loss: 0.0719 valid loss: 0.1465, valid accuracy: 0.9582
Iter-202500 train loss: 0.3053 valid loss: 0.1463, valid accuracy: 0.9586
Iter-202600 train loss: 0.1916 valid loss: 0.1463, valid accuracy: 0.9582
Iter-202700 train loss: 0.1242 valid loss: 0.1460, valid accuracy: 0.9578
Iter-202800 train loss: 0.1102 valid loss: 0.1462, valid accuracy: 0.9586
Iter-202900 train loss: 0.0899 valid loss: 0.1465, valid accuracy: 0.9584
Iter-203000 train loss: 0.0758 valid l

Iter-212800 train loss: 0.1176 valid loss: 0.1424, valid accuracy: 0.9584
Iter-212900 train loss: 0.1999 valid loss: 0.1423, valid accuracy: 0.9582
Iter-213000 train loss: 0.2411 valid loss: 0.1423, valid accuracy: 0.9584
Iter-213100 train loss: 0.0687 valid loss: 0.1421, valid accuracy: 0.9586
Iter-213200 train loss: 0.1520 valid loss: 0.1422, valid accuracy: 0.9580
Iter-213300 train loss: 0.2029 valid loss: 0.1422, valid accuracy: 0.9580
Iter-213400 train loss: 0.0714 valid loss: 0.1420, valid accuracy: 0.9582
Iter-213500 train loss: 0.1239 valid loss: 0.1420, valid accuracy: 0.9582
Iter-213600 train loss: 0.1086 valid loss: 0.1420, valid accuracy: 0.9584
Iter-213700 train loss: 0.1589 valid loss: 0.1420, valid accuracy: 0.9586
Iter-213800 train loss: 0.0512 valid loss: 0.1421, valid accuracy: 0.9588
Iter-213900 train loss: 0.0288 valid loss: 0.1421, valid accuracy: 0.9588
Iter-214000 train loss: 0.0639 valid loss: 0.1420, valid accuracy: 0.9584
Iter-214100 train loss: 0.3277 valid l

Iter-223900 train loss: 0.2121 valid loss: 0.1390, valid accuracy: 0.9592
Iter-224000 train loss: 0.1363 valid loss: 0.1389, valid accuracy: 0.9596
Iter-224100 train loss: 0.0577 valid loss: 0.1388, valid accuracy: 0.9598
Iter-224200 train loss: 0.0715 valid loss: 0.1389, valid accuracy: 0.9592
Iter-224300 train loss: 0.1439 valid loss: 0.1388, valid accuracy: 0.9596
Iter-224400 train loss: 0.0556 valid loss: 0.1390, valid accuracy: 0.9590
Iter-224500 train loss: 0.1026 valid loss: 0.1390, valid accuracy: 0.9588
Iter-224600 train loss: 0.1219 valid loss: 0.1387, valid accuracy: 0.9590
Iter-224700 train loss: 0.0646 valid loss: 0.1385, valid accuracy: 0.9590
Iter-224800 train loss: 0.1060 valid loss: 0.1385, valid accuracy: 0.9596
Iter-224900 train loss: 0.1236 valid loss: 0.1384, valid accuracy: 0.9594
Iter-225000 train loss: 0.0734 valid loss: 0.1384, valid accuracy: 0.9596
Iter-225100 train loss: 0.2209 valid loss: 0.1384, valid accuracy: 0.9596
Iter-225200 train loss: 0.2818 valid l

Iter-235000 train loss: 0.1203 valid loss: 0.1355, valid accuracy: 0.9598
Iter-235100 train loss: 0.1172 valid loss: 0.1355, valid accuracy: 0.9604
Iter-235200 train loss: 0.0683 valid loss: 0.1355, valid accuracy: 0.9606
Iter-235300 train loss: 0.2451 valid loss: 0.1354, valid accuracy: 0.9612
Iter-235400 train loss: 0.2785 valid loss: 0.1355, valid accuracy: 0.9610
Iter-235500 train loss: 0.1177 valid loss: 0.1354, valid accuracy: 0.9602
Iter-235600 train loss: 0.2023 valid loss: 0.1354, valid accuracy: 0.9610
Iter-235700 train loss: 0.0412 valid loss: 0.1356, valid accuracy: 0.9608
Iter-235800 train loss: 0.0389 valid loss: 0.1354, valid accuracy: 0.9602
Iter-235900 train loss: 0.0568 valid loss: 0.1353, valid accuracy: 0.9602
Iter-236000 train loss: 0.2837 valid loss: 0.1353, valid accuracy: 0.9606
Iter-236100 train loss: 0.0627 valid loss: 0.1354, valid accuracy: 0.9610
Iter-236200 train loss: 0.1550 valid loss: 0.1354, valid accuracy: 0.9608
Iter-236300 train loss: 0.0415 valid l

Iter-246100 train loss: 0.0585 valid loss: 0.1329, valid accuracy: 0.9620
Iter-246200 train loss: 0.0913 valid loss: 0.1329, valid accuracy: 0.9624
Iter-246300 train loss: 0.0739 valid loss: 0.1329, valid accuracy: 0.9618
Iter-246400 train loss: 0.1879 valid loss: 0.1330, valid accuracy: 0.9624
Iter-246500 train loss: 0.0443 valid loss: 0.1330, valid accuracy: 0.9626
Iter-246600 train loss: 0.1097 valid loss: 0.1330, valid accuracy: 0.9622
Iter-246700 train loss: 0.1627 valid loss: 0.1330, valid accuracy: 0.9624
Iter-246800 train loss: 0.0930 valid loss: 0.1330, valid accuracy: 0.9620
Iter-246900 train loss: 0.0456 valid loss: 0.1327, valid accuracy: 0.9620
Iter-247000 train loss: 0.0978 valid loss: 0.1327, valid accuracy: 0.9620
Iter-247100 train loss: 0.0282 valid loss: 0.1327, valid accuracy: 0.9620
Iter-247200 train loss: 0.1739 valid loss: 0.1327, valid accuracy: 0.9628
Iter-247300 train loss: 0.0410 valid loss: 0.1328, valid accuracy: 0.9620
Iter-247400 train loss: 0.0791 valid l

Iter-257200 train loss: 0.2562 valid loss: 0.1305, valid accuracy: 0.9616
Iter-257300 train loss: 0.2716 valid loss: 0.1304, valid accuracy: 0.9614
Iter-257400 train loss: 0.0519 valid loss: 0.1305, valid accuracy: 0.9610
Iter-257500 train loss: 0.3224 valid loss: 0.1303, valid accuracy: 0.9612
Iter-257600 train loss: 0.0627 valid loss: 0.1302, valid accuracy: 0.9612
Iter-257700 train loss: 0.0860 valid loss: 0.1301, valid accuracy: 0.9614
Iter-257800 train loss: 0.1002 valid loss: 0.1301, valid accuracy: 0.9618
Iter-257900 train loss: 0.0991 valid loss: 0.1301, valid accuracy: 0.9622
Iter-258000 train loss: 0.0395 valid loss: 0.1301, valid accuracy: 0.9624
Iter-258100 train loss: 0.0648 valid loss: 0.1302, valid accuracy: 0.9624
Iter-258200 train loss: 0.1581 valid loss: 0.1301, valid accuracy: 0.9624
Iter-258300 train loss: 0.0672 valid loss: 0.1301, valid accuracy: 0.9618
Iter-258400 train loss: 0.1428 valid loss: 0.1300, valid accuracy: 0.9618
Iter-258500 train loss: 0.1536 valid l

Iter-268300 train loss: 0.1231 valid loss: 0.1280, valid accuracy: 0.9634
Iter-268400 train loss: 0.1180 valid loss: 0.1279, valid accuracy: 0.9634
Iter-268500 train loss: 0.1084 valid loss: 0.1279, valid accuracy: 0.9628
Iter-268600 train loss: 0.0976 valid loss: 0.1279, valid accuracy: 0.9630
Iter-268700 train loss: 0.0733 valid loss: 0.1280, valid accuracy: 0.9628
Iter-268800 train loss: 0.0548 valid loss: 0.1280, valid accuracy: 0.9632
Iter-268900 train loss: 0.0363 valid loss: 0.1281, valid accuracy: 0.9636
Iter-269000 train loss: 0.2091 valid loss: 0.1281, valid accuracy: 0.9632
Iter-269100 train loss: 0.1896 valid loss: 0.1282, valid accuracy: 0.9632
Iter-269200 train loss: 0.0358 valid loss: 0.1280, valid accuracy: 0.9632
Iter-269300 train loss: 0.1059 valid loss: 0.1280, valid accuracy: 0.9634
Iter-269400 train loss: 0.1975 valid loss: 0.1280, valid accuracy: 0.9638
Iter-269500 train loss: 0.0769 valid loss: 0.1281, valid accuracy: 0.9632
Iter-269600 train loss: 0.1192 valid l

Iter-279400 train loss: 0.2688 valid loss: 0.1257, valid accuracy: 0.9636
Iter-279500 train loss: 0.1253 valid loss: 0.1256, valid accuracy: 0.9632
Iter-279600 train loss: 0.1654 valid loss: 0.1256, valid accuracy: 0.9636
Iter-279700 train loss: 0.1673 valid loss: 0.1257, valid accuracy: 0.9638
Iter-279800 train loss: 0.0240 valid loss: 0.1257, valid accuracy: 0.9634
Iter-279900 train loss: 0.0464 valid loss: 0.1255, valid accuracy: 0.9632
Iter-280000 train loss: 0.0368 valid loss: 0.1255, valid accuracy: 0.9634
Iter-280100 train loss: 0.0384 valid loss: 0.1254, valid accuracy: 0.9638
Iter-280200 train loss: 0.0598 valid loss: 0.1253, valid accuracy: 0.9640
Iter-280300 train loss: 0.0704 valid loss: 0.1252, valid accuracy: 0.9636
Iter-280400 train loss: 0.1489 valid loss: 0.1249, valid accuracy: 0.9638
Iter-280500 train loss: 0.1965 valid loss: 0.1251, valid accuracy: 0.9636
Iter-280600 train loss: 0.1391 valid loss: 0.1252, valid accuracy: 0.9644
Iter-280700 train loss: 0.0199 valid l

Iter-290500 train loss: 0.1495 valid loss: 0.1236, valid accuracy: 0.9650
Iter-290600 train loss: 0.0665 valid loss: 0.1237, valid accuracy: 0.9646
Iter-290700 train loss: 0.0890 valid loss: 0.1237, valid accuracy: 0.9650
Iter-290800 train loss: 0.0731 valid loss: 0.1236, valid accuracy: 0.9646
Iter-290900 train loss: 0.2029 valid loss: 0.1235, valid accuracy: 0.9646
Iter-291000 train loss: 0.0538 valid loss: 0.1234, valid accuracy: 0.9648
Iter-291100 train loss: 0.2044 valid loss: 0.1235, valid accuracy: 0.9648
Iter-291200 train loss: 0.0436 valid loss: 0.1235, valid accuracy: 0.9648
Iter-291300 train loss: 0.1336 valid loss: 0.1235, valid accuracy: 0.9650
Iter-291400 train loss: 0.0568 valid loss: 0.1236, valid accuracy: 0.9646
Iter-291500 train loss: 0.0343 valid loss: 0.1237, valid accuracy: 0.9650
Iter-291600 train loss: 0.0253 valid loss: 0.1240, valid accuracy: 0.9650
Iter-291700 train loss: 0.0879 valid loss: 0.1241, valid accuracy: 0.9644
Iter-291800 train loss: 0.2351 valid l

In [None]:
# # Display the learning curve and losses for training, validation, and testing
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

plt.plot(nn.losses['train'], label='Train loss')
plt.plot(nn.losses['valid'], label='Valid loss')
plt.legend()
plt.show()

In [None]:
plt.plot(nn.losses['valid_acc'], label='Valid accuracy')
plt.legend()
plt.show()