In [1]:
# Data
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import impl.layer as l

# Dataset preparation: read MNIST from the local data directory. one_hot=False
# is passed so that labels are not requested in one-hot form.
mnist = input_data.read_data_sets('data/MNIST_data/', one_hot=False)

# Pull the (images, labels) pair out of each of the three splits in one go.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (split.images, split.labels)
    for split in (mnist.train, mnist.validation, mnist.test)
)
# Quick sanity checks to run by hand if needed:
# y_test.shape, y_val.shape, y_train.shape
# X_train.shape, X_train.dtype, X_val.shape, X_val.dtype, X_test.shape, X_test.dtype

Extracting data/MNIST_data/train-images-idx3-ubyte.gz
Extracting data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting data/MNIST_data/t10k-labels-idx1-ubyte.gz


In [2]:
# Pre-processing: normalizing
def normalize(X):
    """Mean-center X along axis 0.

    The mean taken over axis 0 is subtracted from X and the result is returned
    as a new array; X itself is not modified. Only centering is performed:
    division by the standard deviation is intentionally left out.

    Args:
        X: array-like supporting ``.mean(axis=0)`` and broadcasting subtraction
           (a 2-D NumPy array of samples x features in this notebook).

    Returns:
        The centered array, same shape as X.
    """
    column_means = X.mean(axis=0)
    centered = X - column_means
    return centered

# Center every split independently (each one uses its own axis-0 mean).
X_train = normalize(X=X_train)
X_val = normalize(X=X_val)
X_test = normalize(X=X_test)

In [3]:
# Model
import impl.layer as l # or from impl.layer import *
from impl.loss import * # import all functions from impl.loss file # import impl.loss as loss_func
from sklearn.utils import shuffle as skshuffle

class FFNN:
    """Fully connected tanh network trained with fixed random feedback weights.

    Architecture: an input layer (D -> H) followed by tanh, ``L`` hidden
    layers (H -> H) each followed by tanh, and a linear output layer (H -> C).

    Two things differ from plain backpropagation:

    * the error is propagated to the previous layer through a fixed random
      matrix ``W_fixed`` instead of ``W.T`` (see ``fc_backward``);
    * the tanh derivative is replaced by the difference between a layer's
      current activation and its activation from the previous training step
      (see ``train_backward``).

    Every per-layer container uses the same indexing:
    index 0 is the input layer, index 1 is a list with one entry per hidden
    layer, index 2 is the output layer.

    Attributes:
        L: number of hidden layers.
        losses: dict with the lists 'train', 'valid' and 'valid_acc', appended
            to on every ``sgd`` iteration.
        model: learnable parameters, each layer a dict with keys 'W' and 'b'.
        W_fixed: fixed feedback matrices, same shapes as the matching 'W'.
        grads: most recent gradients, same layout as ``model``.
        ys_prev: activations of the previous training step (0.0 before the
            first step), same layout as the ``ys`` returned by ``train_forward``.
    """

    @staticmethod
    def _scaled_uniform(n_in, n_out):
        """Draw an (n_in, n_out) matrix from U(-1, 1) divided by sqrt(n_in / 2)."""
        low, high = -1, 1
        return np.random.uniform(size=(n_in, n_out), low=low, high=high) / np.sqrt(n_in / 2.)

    @staticmethod
    def _new_layer(n_in, n_out):
        """Create the parameter dict of one layer: random 'W', zero 'b' of shape (1, n_out)."""
        return dict(W=FFNN._scaled_uniform(n_in, n_out), b=np.zeros((1, n_out)))

    @staticmethod
    def _zero_grads(layer):
        """Create a zero-filled gradient dict with the same keys/shapes as ``layer``."""
        return {key: np.zeros_like(val) for key, val in layer.items()}

    def __init__(self, D, C, H, L):
        """Allocate parameters, feedback matrices, gradient buffers and state.

        Args:
            D: number of input features.
            C: number of output units.
            H: number of units in every hidden layer.
            L: number of H -> H hidden layers.
        """
        self.L = L  # number of hidden layers (depth)
        self.losses = {'train': [], 'valid': [], 'valid_acc': []}

        self.model = []
        self.W_fixed = []
        self.grads = []
        self.ys_prev = []

        # NOTE: the random draws below happen in the order
        # W_in, Wfix_in, W_hidden[0..L-1], Wfix_hidden[0..L-1], W_out, Wfix_out.
        # Keep that order so seeded runs stay reproducible.

        # Input layer
        self.model.append(self._new_layer(D, H))
        self.W_fixed.append(self._scaled_uniform(D, H))
        self.grads.append(self._zero_grads(self.model[0]))
        self.ys_prev.append(0.0)

        # Hidden layers
        self.model.append([self._new_layer(H, H) for _ in range(L)])
        self.W_fixed.append([self._scaled_uniform(H, H) for _ in range(L)])
        self.grads.append([self._zero_grads(layer) for layer in self.model[1]])
        self.ys_prev.append([0.0 for _ in range(L)])

        # Output layer
        self.model.append(self._new_layer(H, C))
        self.W_fixed.append(self._scaled_uniform(H, C))
        self.grads.append(self._zero_grads(self.model[2]))
        self.ys_prev.append(0.0)

    def fc_forward(self, X, W, b):
        """Affine transform ``X @ W + b``.

        Returns:
            (out, cache) where cache is the tuple ``(W, X)`` needed by
            ``fc_backward``.
        """
        out = (X @ W) + b
        cache = (W, X)
        return out, cache

    def fc_backward(self, dout, cache, W_fixed):
        """Backward pass of the affine layer using a fixed feedback matrix.

        Args:
            dout: upstream error signal, one row per sample.
            cache: the ``(W, X)`` tuple produced by ``fc_forward``.
            W_fixed: fixed random matrix with the same shape as ``W``.

        Returns:
            (dX, dW, db). ``dW`` and ``db`` are the usual affine-layer
            gradients; ``dX`` is computed with ``W_fixed.T`` rather than
            ``W.T`` (feedback alignment), so the cached ``W`` is not used.
        """
        _, X = cache

        dW = X.T @ dout
        db = np.sum(dout, axis=0).reshape(1, -1)  # shape (1, n_out)

        # True backprop would be `dout @ W.T`; the fixed matrix is used instead.
        dX = dout @ W_fixed.T

        return dX, dW, db

    def train_forward(self, X, train):
        """Run the network forward.

        Args:
            X: input batch, one row per sample with D columns.
            train: when true, the per-layer caches are collected.

        Returns:
            (ys, caches):
            ys[0] is the input-layer activation, ys[1] the list of hidden
            activations, ys[2] the output logits.
            caches is empty when ``train`` is false; otherwise caches[0] is
            ``(fc_cache, nl_cache)`` of the input layer, caches[1] is
            ``(fc_caches, nl_caches)`` (lists over hidden layers) and
            caches[2] is the output layer's fc_cache.
        """
        caches, ys = [], []

        # Input layer: affine + tanh
        h, fc_cache = self.fc_forward(X=X, W=self.model[0]['W'], b=self.model[0]['b'])
        h, nl_cache = l.tanh_forward(X=h)
        if train:
            caches.append((fc_cache, nl_cache))
        ys.append(h)
        X = h.copy()  # input of the next layer

        # Hidden layers: affine + tanh, repeated L times
        fc_caches, nl_caches, hidden_ys = [], [], []
        for layer in range(self.L):
            params = self.model[1][layer]
            h, fc_cache = self.fc_forward(X=X, W=params['W'], b=params['b'])
            h, nl_cache = l.tanh_forward(X=h)
            hidden_ys.append(h)
            X = h.copy()
            if train:
                fc_caches.append(fc_cache)
                nl_caches.append(nl_cache)
        if train:
            caches.append((fc_caches, nl_caches))
        ys.append(hidden_ys)

        # Output layer: affine only, produces logits
        logits, fc_cache = self.fc_forward(X=X, W=self.model[2]['W'], b=self.model[2]['b'])
        if train:
            caches.append(fc_cache)
        ys.append(logits)

        return ys, caches

    def loss_function(self, y, y_train):
        """Return ``(loss, dy)`` from ``cross_entropy`` / ``dcross_entropy``.

        Both helpers come from ``impl.loss`` (not visible here). According to
        the original author's notes they apply the softmax themselves, so
        ``y`` is expected to be raw logits.
        """
        loss = cross_entropy(y, y_train)
        dy = dcross_entropy(y, y_train)

        return loss, dy

    def train_backward(self, dy, caches, ys):
        """Propagate the output error back and fill ``self.grads``.

        Instead of multiplying by the tanh derivative, each tanh layer's
        error is multiplied element-wise by ``ys - ys_prev``: the change of
        that layer's activation since the previous training step (on the very
        first step ``ys_prev`` is 0.0, so the factor is the activation itself).

        NOTE: this requires the current batch to have the same number of rows
        as the batch of the previous step, otherwise the subtraction fails to
        broadcast.

        Args:
            dy: gradient of the loss w.r.t. the output logits.
            caches: caches returned by ``train_forward(..., train=True)``.
            ys: activations returned by the same forward pass.

        Returns:
            (dX, grads): the error signal at the network input and
            ``self.grads`` (updated in place, not a copy).
        """
        grads, ys_prev = self.grads, self.ys_prev

        # Output layer (linear)
        dX, dW, db = self.fc_backward(dout=dy, cache=caches[2], W_fixed=self.W_fixed[2])
        dy = dX.copy()  # work on a copy so in-place scaling below is safe
        grads[2]['W'] = dW
        grads[2]['b'] = db

        # Hidden layers, last to first. nl_caches would be needed for a true
        # tanh derivative (l.tanh_backward) and are unused here.
        fc_caches, _nl_caches = caches[1]
        for layer in reversed(range(self.L)):
            dy *= ys[1][layer] - ys_prev[1][layer]
            dX, dW, db = self.fc_backward(dout=dy, cache=fc_caches[layer],
                                          W_fixed=self.W_fixed[1][layer])
            dy = dX.copy()
            grads[1][layer]['W'] = dW
            grads[1][layer]['b'] = db

        # Input layer
        fc_cache, _nl_cache = caches[0]
        dy *= ys[0] - ys_prev[0]
        dX, dW, db = self.fc_backward(dout=dy, cache=fc_cache, W_fixed=self.W_fixed[0])
        grads[0]['W'] = dW
        grads[0]['b'] = db

        return dX, grads

    def test(self, X):
        """Predict for ``X`` without collecting caches.

        Returns:
            (y_pred, y_logit): arg-max class index per row, and the raw
            output logits.
        """
        ys_logit, _ = self.train_forward(X, train=False)
        y_logit = ys_logit[2]  # output layer

        y_prob = l.softmax(y_logit)
        y_pred = np.argmax(y_prob, axis=1)

        return y_pred, y_logit

    def get_minibatch(self, X, y, minibatch_size, shuffle):
        """Split ``(X, y)`` into consecutive minibatches.

        Args:
            X, y: data and labels, sliced along their first axis.
            minibatch_size: rows per minibatch; the last one may be smaller.
            shuffle: when true, X and y are jointly shuffled first
                (``sklearn.utils.shuffle``).

        Returns:
            List of ``(X_mini, y_mini)`` tuples.
        """
        if shuffle:
            X, y = skshuffle(X, y)

        return [(X[start:start + minibatch_size], y[start:start + minibatch_size])
                for start in range(0, X.shape[0], minibatch_size)]

    def sgd(self, train_set, val_set, alpha, mb_size, n_iter, print_after, test_set=None):
        """Train with plain SGD, validating after every step.

        Each iteration trains on ONE randomly chosen minibatch (so ``n_iter``
        counts parameter updates, not passes over the data), then evaluates
        the whole validation set and records the metrics in ``self.losses``.
        After the last iteration the test metrics are printed.

        NOTE: every iteration reshuffles and re-slices the full training set
        only to pick a single minibatch. This is kept as-is so the sequence
        of random draws is unchanged.

        Args:
            train_set: ``(X_train, y_train)``.
            val_set: ``(X_val, y_val)``.
            alpha: learning rate.
            mb_size: minibatch size. Should divide the number of training
                rows: ``train_backward`` needs consecutive batches of equal
                size.
            n_iter: number of iterations.
            print_after: print metrics every this many iterations.
            test_set: optional ``(X_test, y_test)`` for the final evaluation.
                When omitted, the module-level ``X_test`` / ``y_test`` are
                used, as the original notebook code did.
        """
        X_train, y_train = train_set
        X_val, y_val = val_set

        for step in range(1, n_iter + 1):

            # Pick one random minibatch
            minibatches = self.get_minibatch(X_train, y_train, mb_size, shuffle=True)
            idx = np.random.randint(0, len(minibatches))
            X_mini, y_mini = minibatches[idx]

            # Forward / loss / backward
            ys, caches = self.train_forward(X_mini, train=True)
            loss, dy = self.loss_function(ys[2], y_mini)
            _, grads = self.train_backward(dy, caches, ys)
            self.ys_prev = ys  # becomes the reference activation of the next step
            self.losses['train'].append(loss)

            # Parameter update: input layer
            for key in grads[0].keys():
                self.model[0][key] -= alpha * grads[0][key]

            # Parameter update: hidden layers
            for layer in range(self.L):
                for key in grads[1][layer].keys():
                    self.model[1][layer][key] -= alpha * grads[1][layer][key]

            # Parameter update: output layer
            for key in grads[2].keys():
                self.model[2][key] -= alpha * grads[2][key]

            # Validate the updated model
            y_pred, y_logit = self.test(X_val)
            valid_loss, _ = self.loss_function(y_logit, y_val)
            self.losses['valid'].append(valid_loss)
            valid_acc = np.mean(y_pred == y_val)
            self.losses['valid_acc'].append(valid_acc)

            if step % print_after == 0:
                print('Iter-{} train loss: {:.4f} valid loss: {:.4f}, valid accuracy: {:.4f}'.format(
                    step, loss, valid_loss, valid_acc))

        # Final evaluation. Uses `self` (the original called the global `nn`,
        # which only worked when the instance happened to be bound to that name).
        if test_set is None:
            X_eval, y_eval = X_test, y_test  # notebook-level globals
        else:
            X_eval, y_eval = test_set
        y_pred, y_logit = self.test(X_eval)
        loss, _ = self.loss_function(y_logit, y_eval)
        acc = np.mean(y_pred == y_eval)
        print('Last iteration - Test accuracy mean: {:.4f}, std: {:.4f}, loss: {:.4f}'.format(
            acc.mean(), acc.std(), loss))

In [None]:
# Hyper-parameters
n_iter = 1000000  # number of SGD iterations; each one trains on a single minibatch
alpha = 1e-3  # learning rate
mb_size = 50  # minibatch size
print_after = 100  # print train/valid metrics every `print_after` iterations
num_hidden_units = 32  # units per layer (H)
num_input_units = X_train.shape[1]  # input dimensionality (D) = columns of X_train
num_output_units = y_train.max() + 1  # number of classes (C): largest label + 1, assuming 0-based integer labels
num_layers = 2  # number of hidden layers (L)

# Build the network, then train it; sgd() also prints the test metrics when it finishes.
nn = FFNN(D=num_input_units, C=num_output_units, H=num_hidden_units, L=num_layers)

nn.sgd(
    train_set=(X_train, y_train),
    val_set=(X_val, y_val),
    alpha=alpha,
    mb_size=mb_size,
    n_iter=n_iter,
    print_after=print_after,
)

Iter-100 train loss: 2.2902 valid loss: 2.2901, valid accuracy: 0.1360
Iter-200 train loss: 2.2858 valid loss: 2.2894, valid accuracy: 0.1370
Iter-300 train loss: 2.2759 valid loss: 2.2886, valid accuracy: 0.1390
Iter-400 train loss: 2.3064 valid loss: 2.2879, valid accuracy: 0.1402
Iter-500 train loss: 2.2864 valid loss: 2.2871, valid accuracy: 0.1422
Iter-600 train loss: 2.2663 valid loss: 2.2864, valid accuracy: 0.1430
Iter-700 train loss: 2.2861 valid loss: 2.2856, valid accuracy: 0.1452
Iter-800 train loss: 2.2894 valid loss: 2.2849, valid accuracy: 0.1464
Iter-900 train loss: 2.2895 valid loss: 2.2842, valid accuracy: 0.1468
Iter-1000 train loss: 2.2675 valid loss: 2.2834, valid accuracy: 0.1488
Iter-1100 train loss: 2.3248 valid loss: 2.2826, valid accuracy: 0.1506
Iter-1200 train loss: 2.2905 valid loss: 2.2819, valid accuracy: 0.1512
Iter-1300 train loss: 2.2785 valid loss: 2.2811, valid accuracy: 0.1526
Iter-1400 train loss: 2.2842 valid loss: 2.2804, valid accuracy: 0.1534
I

Iter-11500 train loss: 2.2216 valid loss: 2.2096, valid accuracy: 0.3096
Iter-11600 train loss: 2.2263 valid loss: 2.2089, valid accuracy: 0.3108
Iter-11700 train loss: 2.2379 valid loss: 2.2083, valid accuracy: 0.3116
Iter-11800 train loss: 2.1800 valid loss: 2.2076, valid accuracy: 0.3136
Iter-11900 train loss: 2.2168 valid loss: 2.2070, valid accuracy: 0.3150
Iter-12000 train loss: 2.2151 valid loss: 2.2063, valid accuracy: 0.3164
Iter-12100 train loss: 2.1815 valid loss: 2.2056, valid accuracy: 0.3178
Iter-12200 train loss: 2.2177 valid loss: 2.2050, valid accuracy: 0.3208
Iter-12300 train loss: 2.1925 valid loss: 2.2043, valid accuracy: 0.3216
Iter-12400 train loss: 2.2278 valid loss: 2.2036, valid accuracy: 0.3226
Iter-12500 train loss: 2.2039 valid loss: 2.2030, valid accuracy: 0.3242
Iter-12600 train loss: 2.2221 valid loss: 2.2023, valid accuracy: 0.3250
Iter-12700 train loss: 2.2446 valid loss: 2.2017, valid accuracy: 0.3250
Iter-12800 train loss: 2.1821 valid loss: 2.2010, v

Iter-22800 train loss: 2.0936 valid loss: 2.1382, valid accuracy: 0.3970
Iter-22900 train loss: 2.1216 valid loss: 2.1376, valid accuracy: 0.3968
Iter-23000 train loss: 2.0918 valid loss: 2.1371, valid accuracy: 0.3972
Iter-23100 train loss: 2.1565 valid loss: 2.1365, valid accuracy: 0.3976
Iter-23200 train loss: 2.1508 valid loss: 2.1358, valid accuracy: 0.3974
Iter-23300 train loss: 2.1325 valid loss: 2.1352, valid accuracy: 0.3978
Iter-23400 train loss: 2.1525 valid loss: 2.1346, valid accuracy: 0.3974
Iter-23500 train loss: 2.1428 valid loss: 2.1340, valid accuracy: 0.3972
Iter-23600 train loss: 2.1173 valid loss: 2.1334, valid accuracy: 0.3978
Iter-23700 train loss: 2.1691 valid loss: 2.1328, valid accuracy: 0.3976
Iter-23800 train loss: 2.1253 valid loss: 2.1322, valid accuracy: 0.3984
Iter-23900 train loss: 2.1064 valid loss: 2.1316, valid accuracy: 0.3984
Iter-24000 train loss: 2.1544 valid loss: 2.1310, valid accuracy: 0.3986
Iter-24100 train loss: 2.0923 valid loss: 2.1304, v

Iter-34100 train loss: 2.0646 valid loss: 2.0728, valid accuracy: 0.4252
Iter-34200 train loss: 2.0723 valid loss: 2.0722, valid accuracy: 0.4256
Iter-34300 train loss: 2.1272 valid loss: 2.0717, valid accuracy: 0.4258
Iter-34400 train loss: 2.0824 valid loss: 2.0711, valid accuracy: 0.4262
Iter-34500 train loss: 2.1188 valid loss: 2.0705, valid accuracy: 0.4268
Iter-34600 train loss: 2.1308 valid loss: 2.0700, valid accuracy: 0.4264
Iter-34700 train loss: 2.0626 valid loss: 2.0694, valid accuracy: 0.4266
Iter-34800 train loss: 2.0804 valid loss: 2.0688, valid accuracy: 0.4268
Iter-34900 train loss: 2.0647 valid loss: 2.0683, valid accuracy: 0.4266
Iter-35000 train loss: 2.0479 valid loss: 2.0677, valid accuracy: 0.4278
Iter-35100 train loss: 2.1354 valid loss: 2.0672, valid accuracy: 0.4280
Iter-35200 train loss: 2.0695 valid loss: 2.0666, valid accuracy: 0.4280
Iter-35300 train loss: 2.0874 valid loss: 2.0660, valid accuracy: 0.4280
Iter-35400 train loss: 2.0230 valid loss: 2.0655, v

Iter-45400 train loss: 1.9669 valid loss: 2.0120, valid accuracy: 0.4442
Iter-45500 train loss: 2.0118 valid loss: 2.0115, valid accuracy: 0.4436
Iter-45600 train loss: 1.9712 valid loss: 2.0110, valid accuracy: 0.4446
Iter-45700 train loss: 1.9797 valid loss: 2.0105, valid accuracy: 0.4448
Iter-45800 train loss: 2.0504 valid loss: 2.0100, valid accuracy: 0.4448
Iter-45900 train loss: 2.0240 valid loss: 2.0094, valid accuracy: 0.4456
Iter-46000 train loss: 2.0404 valid loss: 2.0089, valid accuracy: 0.4456
Iter-46100 train loss: 2.0406 valid loss: 2.0084, valid accuracy: 0.4452
Iter-46200 train loss: 1.9731 valid loss: 2.0079, valid accuracy: 0.4450
Iter-46300 train loss: 2.0059 valid loss: 2.0073, valid accuracy: 0.4452
Iter-46400 train loss: 2.0284 valid loss: 2.0068, valid accuracy: 0.4458
Iter-46500 train loss: 1.9869 valid loss: 2.0063, valid accuracy: 0.4458
Iter-46600 train loss: 1.9729 valid loss: 2.0058, valid accuracy: 0.4460
Iter-46700 train loss: 2.0259 valid loss: 2.0052, v

Iter-56700 train loss: 1.9020 valid loss: 1.9550, valid accuracy: 0.4548
Iter-56800 train loss: 1.9661 valid loss: 1.9545, valid accuracy: 0.4548
Iter-56900 train loss: 2.0455 valid loss: 1.9540, valid accuracy: 0.4550
Iter-57000 train loss: 1.9486 valid loss: 1.9535, valid accuracy: 0.4548
Iter-57100 train loss: 1.9656 valid loss: 1.9530, valid accuracy: 0.4552
Iter-57200 train loss: 1.9897 valid loss: 1.9525, valid accuracy: 0.4556
Iter-57300 train loss: 1.9152 valid loss: 1.9521, valid accuracy: 0.4556
Iter-57400 train loss: 1.9867 valid loss: 1.9515, valid accuracy: 0.4554
Iter-57500 train loss: 2.0300 valid loss: 1.9511, valid accuracy: 0.4558
Iter-57600 train loss: 2.0671 valid loss: 1.9506, valid accuracy: 0.4568
Iter-57700 train loss: 1.9799 valid loss: 1.9501, valid accuracy: 0.4564
Iter-57800 train loss: 1.9958 valid loss: 1.9496, valid accuracy: 0.4560
Iter-57900 train loss: 2.0288 valid loss: 1.9491, valid accuracy: 0.4558
Iter-58000 train loss: 1.9320 valid loss: 1.9486, v

Iter-68000 train loss: 1.8782 valid loss: 1.9014, valid accuracy: 0.4690
Iter-68100 train loss: 1.9178 valid loss: 1.9009, valid accuracy: 0.4692
Iter-68200 train loss: 1.8982 valid loss: 1.9005, valid accuracy: 0.4694
Iter-68300 train loss: 1.8295 valid loss: 1.9000, valid accuracy: 0.4698
Iter-68400 train loss: 1.8918 valid loss: 1.8995, valid accuracy: 0.4692
Iter-68500 train loss: 1.9250 valid loss: 1.8991, valid accuracy: 0.4696
Iter-68600 train loss: 1.9217 valid loss: 1.8986, valid accuracy: 0.4696
Iter-68700 train loss: 1.9506 valid loss: 1.8982, valid accuracy: 0.4694
Iter-68800 train loss: 1.9332 valid loss: 1.8977, valid accuracy: 0.4694
Iter-68900 train loss: 1.8839 valid loss: 1.8973, valid accuracy: 0.4694
Iter-69000 train loss: 1.9370 valid loss: 1.8968, valid accuracy: 0.4698
Iter-69100 train loss: 1.9108 valid loss: 1.8964, valid accuracy: 0.4702
Iter-69200 train loss: 1.9524 valid loss: 1.8959, valid accuracy: 0.4698
Iter-69300 train loss: 1.8356 valid loss: 1.8954, v

Iter-79300 train loss: 1.8975 valid loss: 1.8507, valid accuracy: 0.4792
Iter-79400 train loss: 1.9040 valid loss: 1.8503, valid accuracy: 0.4788
Iter-79500 train loss: 1.8576 valid loss: 1.8498, valid accuracy: 0.4790
Iter-79600 train loss: 1.9043 valid loss: 1.8494, valid accuracy: 0.4792
Iter-79700 train loss: 1.8962 valid loss: 1.8489, valid accuracy: 0.4794
Iter-79800 train loss: 1.8288 valid loss: 1.8485, valid accuracy: 0.4790
Iter-79900 train loss: 1.8229 valid loss: 1.8481, valid accuracy: 0.4790
Iter-80000 train loss: 1.8440 valid loss: 1.8476, valid accuracy: 0.4794
Iter-80100 train loss: 1.8113 valid loss: 1.8472, valid accuracy: 0.4792
Iter-80200 train loss: 1.9027 valid loss: 1.8467, valid accuracy: 0.4790
Iter-80300 train loss: 1.8106 valid loss: 1.8463, valid accuracy: 0.4792
Iter-80400 train loss: 1.7838 valid loss: 1.8459, valid accuracy: 0.4796
Iter-80500 train loss: 1.8624 valid loss: 1.8454, valid accuracy: 0.4798
Iter-80600 train loss: 1.8413 valid loss: 1.8450, v

Iter-90600 train loss: 1.7884 valid loss: 1.8022, valid accuracy: 0.4906
Iter-90700 train loss: 1.8062 valid loss: 1.8017, valid accuracy: 0.4904
Iter-90800 train loss: 1.8619 valid loss: 1.8013, valid accuracy: 0.4902
Iter-90900 train loss: 1.8027 valid loss: 1.8009, valid accuracy: 0.4906
Iter-91000 train loss: 1.9343 valid loss: 1.8005, valid accuracy: 0.4910
Iter-91100 train loss: 1.8712 valid loss: 1.8001, valid accuracy: 0.4906
Iter-91200 train loss: 1.7606 valid loss: 1.7996, valid accuracy: 0.4910
Iter-91300 train loss: 1.7613 valid loss: 1.7992, valid accuracy: 0.4914
Iter-91400 train loss: 1.7813 valid loss: 1.7988, valid accuracy: 0.4910
Iter-91500 train loss: 1.6830 valid loss: 1.7984, valid accuracy: 0.4912
Iter-91600 train loss: 1.8728 valid loss: 1.7980, valid accuracy: 0.4912
Iter-91700 train loss: 1.7631 valid loss: 1.7976, valid accuracy: 0.4916
Iter-91800 train loss: 1.8469 valid loss: 1.7971, valid accuracy: 0.4914
Iter-91900 train loss: 1.8047 valid loss: 1.7968, v

Iter-101800 train loss: 1.7868 valid loss: 1.7564, valid accuracy: 0.4992
Iter-101900 train loss: 1.8039 valid loss: 1.7560, valid accuracy: 0.4990
Iter-102000 train loss: 1.7531 valid loss: 1.7556, valid accuracy: 0.4994
Iter-102100 train loss: 1.7931 valid loss: 1.7552, valid accuracy: 0.4990
Iter-102200 train loss: 1.7396 valid loss: 1.7548, valid accuracy: 0.4996
Iter-102300 train loss: 1.7283 valid loss: 1.7544, valid accuracy: 0.4994
Iter-102400 train loss: 1.7082 valid loss: 1.7540, valid accuracy: 0.4996
Iter-102500 train loss: 1.7312 valid loss: 1.7536, valid accuracy: 0.4996
Iter-102600 train loss: 1.6644 valid loss: 1.7532, valid accuracy: 0.4996
Iter-102700 train loss: 1.6148 valid loss: 1.7528, valid accuracy: 0.4998
Iter-102800 train loss: 1.6787 valid loss: 1.7524, valid accuracy: 0.5002
Iter-102900 train loss: 1.8638 valid loss: 1.7520, valid accuracy: 0.4996
Iter-103000 train loss: 1.8969 valid loss: 1.7516, valid accuracy: 0.5000
Iter-103100 train loss: 1.7740 valid l

Iter-112900 train loss: 1.7201 valid loss: 1.7130, valid accuracy: 0.5082
Iter-113000 train loss: 1.7567 valid loss: 1.7126, valid accuracy: 0.5084
Iter-113100 train loss: 1.6909 valid loss: 1.7122, valid accuracy: 0.5086
Iter-113200 train loss: 1.7178 valid loss: 1.7118, valid accuracy: 0.5084
Iter-113300 train loss: 1.6759 valid loss: 1.7115, valid accuracy: 0.5082
Iter-113400 train loss: 1.6493 valid loss: 1.7111, valid accuracy: 0.5086
Iter-113500 train loss: 1.7304 valid loss: 1.7107, valid accuracy: 0.5086
Iter-113600 train loss: 1.6158 valid loss: 1.7103, valid accuracy: 0.5086
Iter-113700 train loss: 1.8920 valid loss: 1.7100, valid accuracy: 0.5084
Iter-113800 train loss: 1.6470 valid loss: 1.7096, valid accuracy: 0.5086
Iter-113900 train loss: 1.7644 valid loss: 1.7092, valid accuracy: 0.5086
Iter-114000 train loss: 1.8218 valid loss: 1.7088, valid accuracy: 0.5086
Iter-114100 train loss: 1.7750 valid loss: 1.7084, valid accuracy: 0.5086
Iter-114200 train loss: 1.7926 valid l

Iter-124000 train loss: 1.7320 valid loss: 1.6714, valid accuracy: 0.5178
Iter-124100 train loss: 1.6443 valid loss: 1.6711, valid accuracy: 0.5180
Iter-124200 train loss: 1.7188 valid loss: 1.6707, valid accuracy: 0.5182
Iter-124300 train loss: 1.6430 valid loss: 1.6704, valid accuracy: 0.5182
Iter-124400 train loss: 1.6445 valid loss: 1.6700, valid accuracy: 0.5184
Iter-124500 train loss: 1.7354 valid loss: 1.6696, valid accuracy: 0.5182
Iter-124600 train loss: 1.6372 valid loss: 1.6692, valid accuracy: 0.5184
Iter-124700 train loss: 1.6832 valid loss: 1.6689, valid accuracy: 0.5182
Iter-124800 train loss: 1.6869 valid loss: 1.6685, valid accuracy: 0.5180
Iter-124900 train loss: 1.5320 valid loss: 1.6681, valid accuracy: 0.5184
Iter-125000 train loss: 1.7502 valid loss: 1.6678, valid accuracy: 0.5190
Iter-125100 train loss: 1.7423 valid loss: 1.6674, valid accuracy: 0.5188
Iter-125200 train loss: 1.7391 valid loss: 1.6670, valid accuracy: 0.5186
Iter-125300 train loss: 1.6625 valid l

Iter-135100 train loss: 1.7938 valid loss: 1.6314, valid accuracy: 0.5276
Iter-135200 train loss: 1.6328 valid loss: 1.6311, valid accuracy: 0.5278
Iter-135300 train loss: 1.6284 valid loss: 1.6307, valid accuracy: 0.5278
Iter-135400 train loss: 1.5241 valid loss: 1.6304, valid accuracy: 0.5276
Iter-135500 train loss: 1.7397 valid loss: 1.6301, valid accuracy: 0.5276
Iter-135600 train loss: 1.5686 valid loss: 1.6297, valid accuracy: 0.5280
Iter-135700 train loss: 1.6547 valid loss: 1.6294, valid accuracy: 0.5282
Iter-135800 train loss: 1.7534 valid loss: 1.6290, valid accuracy: 0.5282
Iter-135900 train loss: 1.6017 valid loss: 1.6287, valid accuracy: 0.5280
Iter-136000 train loss: 1.5571 valid loss: 1.6283, valid accuracy: 0.5282
Iter-136100 train loss: 1.5887 valid loss: 1.6280, valid accuracy: 0.5282
Iter-136200 train loss: 1.4503 valid loss: 1.6276, valid accuracy: 0.5284
Iter-136300 train loss: 1.8151 valid loss: 1.6273, valid accuracy: 0.5288
Iter-136400 train loss: 1.4789 valid l

Iter-146200 train loss: 1.5080 valid loss: 1.5930, valid accuracy: 0.5374
Iter-146300 train loss: 1.6450 valid loss: 1.5926, valid accuracy: 0.5376
Iter-146400 train loss: 1.6494 valid loss: 1.5923, valid accuracy: 0.5378
Iter-146500 train loss: 1.6015 valid loss: 1.5919, valid accuracy: 0.5382
Iter-146600 train loss: 1.5780 valid loss: 1.5916, valid accuracy: 0.5374
Iter-146700 train loss: 1.6152 valid loss: 1.5913, valid accuracy: 0.5378
Iter-146800 train loss: 1.6021 valid loss: 1.5909, valid accuracy: 0.5378
Iter-146900 train loss: 1.5477 valid loss: 1.5906, valid accuracy: 0.5382
Iter-147000 train loss: 1.6103 valid loss: 1.5903, valid accuracy: 0.5380
Iter-147100 train loss: 1.5319 valid loss: 1.5900, valid accuracy: 0.5378
Iter-147200 train loss: 1.5682 valid loss: 1.5896, valid accuracy: 0.5384
Iter-147300 train loss: 1.7099 valid loss: 1.5893, valid accuracy: 0.5382
Iter-147400 train loss: 1.5261 valid loss: 1.5890, valid accuracy: 0.5384
Iter-147500 train loss: 1.7117 valid l

Iter-157300 train loss: 1.5443 valid loss: 1.5559, valid accuracy: 0.5444
Iter-157400 train loss: 1.4880 valid loss: 1.5556, valid accuracy: 0.5446
Iter-157500 train loss: 1.6480 valid loss: 1.5553, valid accuracy: 0.5444
Iter-157600 train loss: 1.4322 valid loss: 1.5550, valid accuracy: 0.5446
Iter-157700 train loss: 1.5260 valid loss: 1.5547, valid accuracy: 0.5450
Iter-157800 train loss: 1.7370 valid loss: 1.5543, valid accuracy: 0.5452
Iter-157900 train loss: 1.5329 valid loss: 1.5540, valid accuracy: 0.5452
Iter-158000 train loss: 1.4867 valid loss: 1.5537, valid accuracy: 0.5456
Iter-158100 train loss: 1.5221 valid loss: 1.5533, valid accuracy: 0.5454
Iter-158200 train loss: 1.5325 valid loss: 1.5530, valid accuracy: 0.5454
Iter-158300 train loss: 1.6778 valid loss: 1.5526, valid accuracy: 0.5460
Iter-158400 train loss: 1.5909 valid loss: 1.5523, valid accuracy: 0.5462
Iter-158500 train loss: 1.6237 valid loss: 1.5520, valid accuracy: 0.5466
Iter-158600 train loss: 1.5930 valid l

Iter-168400 train loss: 1.5604 valid loss: 1.5201, valid accuracy: 0.5542
Iter-168500 train loss: 1.5086 valid loss: 1.5198, valid accuracy: 0.5544
Iter-168600 train loss: 1.5084 valid loss: 1.5194, valid accuracy: 0.5546
Iter-168700 train loss: 1.5059 valid loss: 1.5191, valid accuracy: 0.5542
Iter-168800 train loss: 1.6824 valid loss: 1.5188, valid accuracy: 0.5544
Iter-168900 train loss: 1.5197 valid loss: 1.5185, valid accuracy: 0.5548
Iter-169000 train loss: 1.4420 valid loss: 1.5182, valid accuracy: 0.5540
Iter-169100 train loss: 1.4715 valid loss: 1.5178, valid accuracy: 0.5544
Iter-169200 train loss: 1.5137 valid loss: 1.5175, valid accuracy: 0.5548
Iter-169300 train loss: 1.7079 valid loss: 1.5172, valid accuracy: 0.5546
Iter-169400 train loss: 1.5482 valid loss: 1.5169, valid accuracy: 0.5544
Iter-169500 train loss: 1.6103 valid loss: 1.5166, valid accuracy: 0.5548
Iter-169600 train loss: 1.3949 valid loss: 1.5163, valid accuracy: 0.5550
Iter-169700 train loss: 1.4648 valid l

Iter-179500 train loss: 1.4976 valid loss: 1.4853, valid accuracy: 0.5640
Iter-179600 train loss: 1.3954 valid loss: 1.4850, valid accuracy: 0.5636
Iter-179700 train loss: 1.5132 valid loss: 1.4847, valid accuracy: 0.5636
Iter-179800 train loss: 1.4913 valid loss: 1.4844, valid accuracy: 0.5640
Iter-179900 train loss: 1.4286 valid loss: 1.4841, valid accuracy: 0.5640
Iter-180000 train loss: 1.5432 valid loss: 1.4838, valid accuracy: 0.5642
Iter-180100 train loss: 1.5524 valid loss: 1.4835, valid accuracy: 0.5646
Iter-180200 train loss: 1.4959 valid loss: 1.4832, valid accuracy: 0.5644
Iter-180300 train loss: 1.5004 valid loss: 1.4829, valid accuracy: 0.5644
Iter-180400 train loss: 1.5562 valid loss: 1.4826, valid accuracy: 0.5640
Iter-180500 train loss: 1.4177 valid loss: 1.4823, valid accuracy: 0.5642
Iter-180600 train loss: 1.5686 valid loss: 1.4820, valid accuracy: 0.5644
Iter-180700 train loss: 1.4810 valid loss: 1.4816, valid accuracy: 0.5646
Iter-180800 train loss: 1.3930 valid l

Iter-190600 train loss: 1.3927 valid loss: 1.4520, valid accuracy: 0.5716
Iter-190700 train loss: 1.5235 valid loss: 1.4517, valid accuracy: 0.5712
Iter-190800 train loss: 1.5107 valid loss: 1.4513, valid accuracy: 0.5710
Iter-190900 train loss: 1.4251 valid loss: 1.4511, valid accuracy: 0.5712
Iter-191000 train loss: 1.3977 valid loss: 1.4508, valid accuracy: 0.5718
Iter-191100 train loss: 1.4126 valid loss: 1.4505, valid accuracy: 0.5720
Iter-191200 train loss: 1.4723 valid loss: 1.4502, valid accuracy: 0.5722
Iter-191300 train loss: 1.5038 valid loss: 1.4499, valid accuracy: 0.5726
Iter-191400 train loss: 1.3591 valid loss: 1.4495, valid accuracy: 0.5724
Iter-191500 train loss: 1.4182 valid loss: 1.4493, valid accuracy: 0.5722
Iter-191600 train loss: 1.5027 valid loss: 1.4490, valid accuracy: 0.5724
Iter-191700 train loss: 1.3912 valid loss: 1.4487, valid accuracy: 0.5726
Iter-191800 train loss: 1.3121 valid loss: 1.4484, valid accuracy: 0.5724
Iter-191900 train loss: 1.3351 valid l

Iter-201700 train loss: 1.5046 valid loss: 1.4197, valid accuracy: 0.5784
Iter-201800 train loss: 1.5337 valid loss: 1.4194, valid accuracy: 0.5780
Iter-201900 train loss: 1.2951 valid loss: 1.4191, valid accuracy: 0.5784
Iter-202000 train loss: 1.3952 valid loss: 1.4188, valid accuracy: 0.5784
Iter-202100 train loss: 1.5349 valid loss: 1.4185, valid accuracy: 0.5784
Iter-202200 train loss: 1.3579 valid loss: 1.4182, valid accuracy: 0.5784
Iter-202300 train loss: 1.4941 valid loss: 1.4179, valid accuracy: 0.5784
Iter-202400 train loss: 1.4787 valid loss: 1.4177, valid accuracy: 0.5784
Iter-202500 train loss: 1.4685 valid loss: 1.4174, valid accuracy: 0.5778
Iter-202600 train loss: 1.3403 valid loss: 1.4171, valid accuracy: 0.5780
Iter-202700 train loss: 1.4775 valid loss: 1.4168, valid accuracy: 0.5780
Iter-202800 train loss: 1.4268 valid loss: 1.4165, valid accuracy: 0.5780
Iter-202900 train loss: 1.3505 valid loss: 1.4162, valid accuracy: 0.5786
Iter-203000 train loss: 1.2901 valid l

Iter-212800 train loss: 1.3267 valid loss: 1.3888, valid accuracy: 0.5848
Iter-212900 train loss: 1.3401 valid loss: 1.3885, valid accuracy: 0.5850
Iter-213000 train loss: 1.3899 valid loss: 1.3882, valid accuracy: 0.5848
Iter-213100 train loss: 1.4283 valid loss: 1.3880, valid accuracy: 0.5842
Iter-213200 train loss: 1.5286 valid loss: 1.3877, valid accuracy: 0.5848
Iter-213300 train loss: 1.2957 valid loss: 1.3874, valid accuracy: 0.5850
Iter-213400 train loss: 1.4409 valid loss: 1.3871, valid accuracy: 0.5850
Iter-213500 train loss: 1.2929 valid loss: 1.3868, valid accuracy: 0.5852
Iter-213600 train loss: 1.2992 valid loss: 1.3866, valid accuracy: 0.5848
Iter-213700 train loss: 1.5958 valid loss: 1.3863, valid accuracy: 0.5848
Iter-213800 train loss: 1.4210 valid loss: 1.3860, valid accuracy: 0.5848
Iter-213900 train loss: 1.4920 valid loss: 1.3858, valid accuracy: 0.5852
Iter-214000 train loss: 1.5198 valid loss: 1.3855, valid accuracy: 0.5850
Iter-214100 train loss: 1.2741 valid l

Iter-223900 train loss: 1.4328 valid loss: 1.3591, valid accuracy: 0.5878
Iter-224000 train loss: 1.3298 valid loss: 1.3588, valid accuracy: 0.5880
Iter-224100 train loss: 1.3683 valid loss: 1.3585, valid accuracy: 0.5880
Iter-224200 train loss: 1.3303 valid loss: 1.3583, valid accuracy: 0.5882
Iter-224300 train loss: 1.5759 valid loss: 1.3580, valid accuracy: 0.5882
Iter-224400 train loss: 1.0602 valid loss: 1.3577, valid accuracy: 0.5884
Iter-224500 train loss: 1.4885 valid loss: 1.3575, valid accuracy: 0.5886
Iter-224600 train loss: 1.3249 valid loss: 1.3572, valid accuracy: 0.5886
Iter-224700 train loss: 1.3235 valid loss: 1.3570, valid accuracy: 0.5888
Iter-224800 train loss: 1.4723 valid loss: 1.3567, valid accuracy: 0.5886
Iter-224900 train loss: 1.3509 valid loss: 1.3564, valid accuracy: 0.5888
Iter-225000 train loss: 1.1645 valid loss: 1.3561, valid accuracy: 0.5888
Iter-225100 train loss: 1.5002 valid loss: 1.3559, valid accuracy: 0.5892
Iter-225200 train loss: 1.3825 valid l

Iter-235000 train loss: 1.3485 valid loss: 1.3305, valid accuracy: 0.5928
Iter-235100 train loss: 1.3962 valid loss: 1.3302, valid accuracy: 0.5930
Iter-235200 train loss: 1.1634 valid loss: 1.3299, valid accuracy: 0.5930
Iter-235300 train loss: 1.3458 valid loss: 1.3297, valid accuracy: 0.5932
Iter-235400 train loss: 1.2158 valid loss: 1.3294, valid accuracy: 0.5934
Iter-235500 train loss: 1.1754 valid loss: 1.3292, valid accuracy: 0.5934
Iter-235600 train loss: 1.4830 valid loss: 1.3289, valid accuracy: 0.5936
Iter-235700 train loss: 1.3757 valid loss: 1.3287, valid accuracy: 0.5936
Iter-235800 train loss: 1.3437 valid loss: 1.3284, valid accuracy: 0.5936
Iter-235900 train loss: 1.4635 valid loss: 1.3282, valid accuracy: 0.5934
Iter-236000 train loss: 1.3488 valid loss: 1.3280, valid accuracy: 0.5934
Iter-236100 train loss: 1.3283 valid loss: 1.3277, valid accuracy: 0.5936
Iter-236200 train loss: 1.4071 valid loss: 1.3274, valid accuracy: 0.5936
Iter-236300 train loss: 1.4606 valid l

Iter-246100 train loss: 1.3032 valid loss: 1.3031, valid accuracy: 0.5994
Iter-246200 train loss: 1.3467 valid loss: 1.3029, valid accuracy: 0.5998
Iter-246300 train loss: 1.3189 valid loss: 1.3027, valid accuracy: 0.5998
Iter-246400 train loss: 1.2594 valid loss: 1.3024, valid accuracy: 0.5998
Iter-246500 train loss: 1.3034 valid loss: 1.3022, valid accuracy: 0.6002
Iter-246600 train loss: 1.3877 valid loss: 1.3020, valid accuracy: 0.6002
Iter-246700 train loss: 1.2507 valid loss: 1.3017, valid accuracy: 0.6002
Iter-246800 train loss: 1.3195 valid loss: 1.3014, valid accuracy: 0.6004
Iter-246900 train loss: 1.1585 valid loss: 1.3012, valid accuracy: 0.6000
Iter-247000 train loss: 1.2304 valid loss: 1.3010, valid accuracy: 0.5998
Iter-247100 train loss: 1.0249 valid loss: 1.3007, valid accuracy: 0.6000
Iter-247200 train loss: 1.4326 valid loss: 1.3005, valid accuracy: 0.5998
Iter-247300 train loss: 1.4007 valid loss: 1.3003, valid accuracy: 0.6000
Iter-247400 train loss: 1.2818 valid l

Iter-257200 train loss: 1.3664 valid loss: 1.2767, valid accuracy: 0.6052
Iter-257300 train loss: 1.3548 valid loss: 1.2764, valid accuracy: 0.6058
Iter-257400 train loss: 1.3057 valid loss: 1.2762, valid accuracy: 0.6052
Iter-257500 train loss: 1.5877 valid loss: 1.2759, valid accuracy: 0.6054
Iter-257600 train loss: 1.4149 valid loss: 1.2757, valid accuracy: 0.6054
Iter-257700 train loss: 1.2031 valid loss: 1.2755, valid accuracy: 0.6056
Iter-257800 train loss: 1.4905 valid loss: 1.2753, valid accuracy: 0.6058
Iter-257900 train loss: 1.4762 valid loss: 1.2750, valid accuracy: 0.6056
Iter-258000 train loss: 1.3566 valid loss: 1.2748, valid accuracy: 0.6058
Iter-258100 train loss: 1.2639 valid loss: 1.2746, valid accuracy: 0.6060
Iter-258200 train loss: 1.3911 valid loss: 1.2743, valid accuracy: 0.6062
Iter-258300 train loss: 1.1650 valid loss: 1.2741, valid accuracy: 0.6062
Iter-258400 train loss: 1.3546 valid loss: 1.2739, valid accuracy: 0.6060
Iter-258500 train loss: 1.2843 valid l

Iter-268300 train loss: 1.2894 valid loss: 1.2517, valid accuracy: 0.6128
Iter-268400 train loss: 1.3617 valid loss: 1.2515, valid accuracy: 0.6126
Iter-268500 train loss: 1.2609 valid loss: 1.2513, valid accuracy: 0.6128
Iter-268600 train loss: 1.1343 valid loss: 1.2511, valid accuracy: 0.6126
Iter-268700 train loss: 0.9938 valid loss: 1.2508, valid accuracy: 0.6128
Iter-268800 train loss: 1.3426 valid loss: 1.2506, valid accuracy: 0.6128
Iter-268900 train loss: 1.3441 valid loss: 1.2504, valid accuracy: 0.6126
Iter-269000 train loss: 0.9935 valid loss: 1.2502, valid accuracy: 0.6126
Iter-269100 train loss: 1.1478 valid loss: 1.2500, valid accuracy: 0.6130
Iter-269200 train loss: 1.1546 valid loss: 1.2497, valid accuracy: 0.6128
Iter-269300 train loss: 1.2147 valid loss: 1.2495, valid accuracy: 0.6128
Iter-269400 train loss: 1.0771 valid loss: 1.2493, valid accuracy: 0.6126
Iter-269500 train loss: 1.2119 valid loss: 1.2491, valid accuracy: 0.6126
Iter-269600 train loss: 1.1358 valid l

Iter-279400 train loss: 1.0207 valid loss: 1.2278, valid accuracy: 0.6186
Iter-279500 train loss: 1.0681 valid loss: 1.2276, valid accuracy: 0.6186
Iter-279600 train loss: 1.0075 valid loss: 1.2274, valid accuracy: 0.6188
Iter-279700 train loss: 1.4365 valid loss: 1.2272, valid accuracy: 0.6186
Iter-279800 train loss: 1.2199 valid loss: 1.2270, valid accuracy: 0.6184
Iter-279900 train loss: 1.1188 valid loss: 1.2268, valid accuracy: 0.6186
Iter-280000 train loss: 1.1769 valid loss: 1.2265, valid accuracy: 0.6186
Iter-280100 train loss: 1.2127 valid loss: 1.2263, valid accuracy: 0.6188
Iter-280200 train loss: 1.1960 valid loss: 1.2261, valid accuracy: 0.6190
Iter-280300 train loss: 1.1193 valid loss: 1.2259, valid accuracy: 0.6186
Iter-280400 train loss: 1.2159 valid loss: 1.2257, valid accuracy: 0.6188
Iter-280500 train loss: 1.0285 valid loss: 1.2255, valid accuracy: 0.6192
Iter-280600 train loss: 1.1589 valid loss: 1.2252, valid accuracy: 0.6190
Iter-280700 train loss: 1.2852 valid l

Iter-290500 train loss: 1.2970 valid loss: 1.2047, valid accuracy: 0.6252
Iter-290600 train loss: 1.2225 valid loss: 1.2045, valid accuracy: 0.6250
Iter-290700 train loss: 1.1849 valid loss: 1.2043, valid accuracy: 0.6250
Iter-290800 train loss: 1.1070 valid loss: 1.2041, valid accuracy: 0.6250
Iter-290900 train loss: 1.1757 valid loss: 1.2039, valid accuracy: 0.6256
Iter-291000 train loss: 1.2862 valid loss: 1.2037, valid accuracy: 0.6256
Iter-291100 train loss: 1.1087 valid loss: 1.2035, valid accuracy: 0.6256
Iter-291200 train loss: 1.1085 valid loss: 1.2033, valid accuracy: 0.6254
Iter-291300 train loss: 1.0943 valid loss: 1.2031, valid accuracy: 0.6260
Iter-291400 train loss: 1.3021 valid loss: 1.2029, valid accuracy: 0.6266
Iter-291500 train loss: 1.2595 valid loss: 1.2027, valid accuracy: 0.6270
Iter-291600 train loss: 1.1628 valid loss: 1.2024, valid accuracy: 0.6276
Iter-291700 train loss: 1.2394 valid loss: 1.2022, valid accuracy: 0.6276
Iter-291800 train loss: 1.1411 valid l

Iter-301600 train loss: 1.5001 valid loss: 1.1826, valid accuracy: 0.6300
Iter-301700 train loss: 1.3249 valid loss: 1.1825, valid accuracy: 0.6300
Iter-301800 train loss: 1.2507 valid loss: 1.1823, valid accuracy: 0.6300
Iter-301900 train loss: 1.1641 valid loss: 1.1821, valid accuracy: 0.6298
Iter-302000 train loss: 1.1359 valid loss: 1.1819, valid accuracy: 0.6296
Iter-302100 train loss: 1.2433 valid loss: 1.1817, valid accuracy: 0.6292
Iter-302200 train loss: 1.1700 valid loss: 1.1815, valid accuracy: 0.6296
Iter-302300 train loss: 1.1952 valid loss: 1.1813, valid accuracy: 0.6294
Iter-302400 train loss: 1.3622 valid loss: 1.1811, valid accuracy: 0.6290
Iter-302500 train loss: 1.2694 valid loss: 1.1809, valid accuracy: 0.6286
Iter-302600 train loss: 1.0791 valid loss: 1.1808, valid accuracy: 0.6288
Iter-302700 train loss: 1.3402 valid loss: 1.1806, valid accuracy: 0.6288
Iter-302800 train loss: 1.2321 valid loss: 1.1804, valid accuracy: 0.6292
Iter-302900 train loss: 1.1071 valid l

In [None]:
# Learning curves: training and validation loss as recorded in nn.losses.
# (No test-set curve is drawn here; only the 'train' and 'valid' series are plotted.)
# Optional notebook magics -- uncomment if figures do not render inline or for high-DPI output:
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

# Explicit figure/axes instead of the implicit pyplot state machine, so labels
# and title are attached to this specific plot.
fig, ax = plt.subplots()
ax.plot(nn.losses['train'], label='Train loss')
ax.plot(nn.losses['valid'], label='Valid loss')
# Each series is plotted against its list index, i.e. the order in which the
# values were appended to nn.losses.
ax.set_xlabel('Recorded step')
ax.set_ylabel('Loss')
ax.set_title('Training and validation loss')
ax.legend()
plt.show()

In [None]:
# Validation accuracy curve as recorded in nn.losses['valid_acc'].
# Uses `plt` (matplotlib.pyplot) imported earlier in the notebook.
fig, ax = plt.subplots()
ax.plot(nn.losses['valid_acc'], label='Valid accuracy')
# The series is plotted against its list index, i.e. the order in which the
# values were appended to nn.losses['valid_acc'].
ax.set_xlabel('Recorded step')
ax.set_ylabel('Accuracy')
ax.set_title('Validation accuracy')
ax.legend()
plt.show()