# Setup

In [10]:
import math
import random

import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt
from torch.nn.parameter import Parameter, UninitializedParameter
import torch.nn.functional as F
from torch.nn.modules.module import Module
from torch.nn import init
from tqdm.notebook import trange, tqdm
import torchvision.transforms.functional as TF
from typing import Optional, List, Tuple, Union
from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t
import itertools
import numpy as np

In [11]:
# Experiment configuration: everything a reader might want to tune lives here.
dataset = 'FashionMNIST'
datadir = 'datasets'
arch = 'hhnmlpb'
batchsize = 64
epochs = 50
nlayers = 1
width = 32
lr = 0.001
dimensions = 5
hin = 2
nchannels = 1
nclasses = 10

# Seed every random number generator imported by this notebook so that a
# "Restart & Run All" draws the same rotation angles and initial weights.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
# Both splits use the same preprocessing: resize every image to 16x16 and
# convert it to a tensor.
resize_to_tensor = transforms.Compose([
    transforms.Resize((16, 16)),
    transforms.ToTensor(),
])

# Download (if necessary) the training split from the open datasets
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=resize_to_tensor,
)

# Download (if necessary) the test split from the open datasets
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=resize_to_tensor,
)

In [13]:
# Build the train/test loaders with identical settings
loader_options = dict(batch_size=batchsize, shuffle=True, num_workers=8, pin_memory=True)
train_loader = DataLoader(training_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

# Peek at a single test batch to confirm tensor shapes and label dtype
first_batch = next(iter(test_loader), None)
if first_batch is not None:
    X, y = first_batch
    print("Shape of X [N, C, H, W]: ", X.shape)
    print("Shape of y: ", y.shape, y.dtype)

Shape of X [N, C, H, W]:  torch.Size([64, 1, 16, 16])
Shape of y:  torch.Size([64]) torch.int64


# MLP with search in alpha-space



In [14]:
# Device selection, repeated from the setup cell: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def count_model_parameters(model):
    """Return the total number of scalar entries in `model`'s trainable parameters.

    Only parameters with `requires_grad=True` are counted.
    """
    n_trainable = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            n_trainable += tensor.numel()
    return n_trainable

class HHN_MLPB(nn.Module):
    """MLP whose layer parameters are mixed from `dimensions` stored parameter sets.

    A small hyper-network (`hyper_stack`) maps the hyper input to `dimensions`
    softmax coefficients. Every linear layer of the MLP keeps `dimensions`
    weight/bias tensors, and the layer actually applied in `forward` is their
    coefficient-weighted sum.

    Args:
        hin: size of the hyper-network input vector.
        dimensions: number of parameter sets stored per layer.
        n_layers: total number of layers before the classifier; `n_layers - 1`
            hidden (linear -> batch norm -> ReLU) layers follow the first layer.
        n_units: width of the hidden layers.
        n_channels: number of image channels; the first layer expects
            `16 * 16 * n_channels` flattened input features.
        n_classes: number of output logits.
    """

    def __init__(self, hin, dimensions, n_layers, n_units, n_channels, n_classes=10):
        super(HHN_MLPB, self).__init__()
        # Softmax(dim=0) normalises over the coefficients when the hyper input
        # is a 1-D vector of length `hin`, which is how this notebook calls it.
        self.hyper_stack = nn.Sequential(
            nn.Linear(hin, 64),
            nn.ReLU(),
            nn.Linear(64, dimensions),
            nn.Softmax(dim=0)
        )

        self.dimensions = dimensions
        self.n_layers = n_layers
        self.n_units = n_units
        self.n_channels = n_channels
        self.n_classes = n_classes

        # Kept for backward compatibility with code that reads this attribute;
        # forward() no longer uses it because the parameters follow the module
        # through .to()/.cpu() on their own.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # ModuleList (not a plain list) so the batch-norm layers are registered:
        # their parameters reach the optimizer, they move with .to()/.cpu(),
        # they follow .train()/.eval() and they are saved in the state_dict.
        self.bns = nn.ModuleList()

        self.weight_list_fc1, self.bias_list_fc1 = \
            self.create_param_combination_linear(in_features=16 * 16 * n_channels, out_features=n_units)
        # Hidden-layer parameters are stored flat: entries
        # [k * dimensions, (k + 1) * dimensions) belong to hidden layer k.
        self.weights = nn.ParameterList()
        self.biases = nn.ParameterList()
        for _ in range(n_layers - 1):
            w, b = self.create_param_combination_linear(in_features=n_units, out_features=n_units)
            self.weights.extend(w)
            self.biases.extend(b)
            self.bns.append(nn.BatchNorm1d(self.n_units))
        self.weight_list_fc2, self.bias_list_fc2 = self.create_param_combination_linear(in_features=n_units,
                                                                                        out_features=n_classes)

    def create_param_combination_linear(self, in_features, out_features):
        """Create `self.dimensions` weight/bias pairs for one linear layer.

        Each pair is initialised with kaiming-uniform weights (a=sqrt(5)) and a
        uniform bias bounded by 1/sqrt(fan_in).

        Returns:
            (weight_list, bias_list): two `nn.ParameterList`s of length
            `self.dimensions` with shapes (out_features, in_features) and
            (out_features,) respectively.
        """
        weight_list = nn.ParameterList()
        bias_list = nn.ParameterList()
        for _ in range(self.dimensions):
            weight = Parameter(torch.empty((out_features, in_features)))
            nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
            weight_list.append(weight)

            bias = Parameter(torch.empty(out_features))
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(bias, -bound, bound)
            bias_list.append(bias)
        return weight_list, bias_list

    def calculate_weighted_sum(self, param_list: List, factors: Tensor):
        """Return sum_i factors[i] * param_list[i] for equally shaped tensors."""
        weighted_list = [a * b for a, b in zip(param_list, factors)]
        return torch.sum(torch.stack(weighted_list), dim=0)

    def forward(self, x, hyper_x):
        """Compute class logits for `x` using the parameters selected by `hyper_x`.

        Args:
            x: image batch; everything after the batch axis is flattened and
                must amount to `16 * 16 * n_channels` features.
            hyper_x: hyper-network input (a 1-D vector of length `hin` in this
                notebook).

        Returns:
            Tensor of logits with one row per input sample and `n_classes` columns.
        """
        hyper_output = self.hyper_stack(hyper_x)

        weight_fc1 = self.calculate_weighted_sum(self.weight_list_fc1, hyper_output)
        weight_fc2 = self.calculate_weighted_sum(self.weight_list_fc2, hyper_output)

        bias_fc1 = self.calculate_weighted_sum(self.bias_list_fc1, hyper_output)
        bias_fc2 = self.calculate_weighted_sum(self.bias_list_fc2, hyper_output)

        logits = torch.flatten(x, start_dim=1)
        logits = F.linear(logits, weight=weight_fc1, bias=bias_fc1)
        logits = torch.relu(logits)

        # Walk the flat parameter lists one hidden layer (= `dimensions`
        # consecutive entries) at a time. The parameters are used where they
        # already live; no per-call re-wrapping or device move is needed.
        hidden_weights = list(self.weights)
        hidden_biases = list(self.biases)
        for layer_idx, bn in enumerate(self.bns):
            group = slice(layer_idx * self.dimensions, (layer_idx + 1) * self.dimensions)
            w = self.calculate_weighted_sum(hidden_weights[group], hyper_output)
            b = self.calculate_weighted_sum(hidden_biases[group], hyper_output)
            logits = F.linear(logits, weight=w, bias=b)
            logits = bn(logits)
            logits = torch.relu(logits)
        logits = F.linear(logits, weight=weight_fc2, bias=bias_fc2)
        return logits

# Build the hyper-network MLP from the values in the configuration cell
# (instead of repeating them as literals) and report its size.
model = HHN_MLPB(hin, dimensions, nlayers, width, nchannels, n_classes=nclasses).to(device)
print(model)
print(count_model_parameters(model))

HHN_MLPB(
  (hyper_stack): Sequential(
    (0): Linear(in_features=2, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=5, bias=True)
    (3): Softmax(dim=0)
  )
  (weight_list_fc1): ParameterList(
      (0): Parameter containing: [torch.float32 of size 32x256 (GPU 0)]
      (1): Parameter containing: [torch.float32 of size 32x256 (GPU 0)]
      (2): Parameter containing: [torch.float32 of size 32x256 (GPU 0)]
      (3): Parameter containing: [torch.float32 of size 32x256 (GPU 0)]
      (4): Parameter containing: [torch.float32 of size 32x256 (GPU 0)]
  )
  (bias_list_fc1): ParameterList(
      (0): Parameter containing: [torch.float32 of size 32 (GPU 0)]
      (1): Parameter containing: [torch.float32 of size 32 (GPU 0)]
      (2): Parameter containing: [torch.float32 of size 32 (GPU 0)]
      (3): Parameter containing: [torch.float32 of size 32 (GPU 0)]
      (4): Parameter containing: [torch.float32 of size 32 (GPU 0)]
  )
  (weights): Paramete

In [15]:
def transform_angle(angle):
    """Encode an angle given in degrees as the 2-element tensor [cos, sin]."""
    radians = angle / 180 * math.pi
    return Tensor([math.cos(radians), math.sin(radians)])

In [16]:
# Select the device and make sure the model lives on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

loss_fn = nn.CrossEntropyLoss()
# Pass the learning rate from the configuration cell explicitly so that
# changing `lr` there actually affects training.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0)

# Cosine similarity between two 1-D vectors; used as a regulariser in train().
cos = nn.CosineSimilarity(dim=0, eps=1e-6)

def train(dataloader, model, loss_fn, optimizer):
    """Train `model` for one pass over `dataloader` on randomly rotated batches.

    Every batch is rotated by a single random angle in [0, 360] degrees and the
    model receives that angle (encoded by `transform_angle`) as hyper input.
    The loss is the sum of:
      * `loss_fn` on the predictions for the true angle,
      * the squared cosine similarity between the hyper-network outputs for
        the true angle and for a second random angle,
      * 0.01 x prediction entropy for the true angle (minimised),
      * -0.01 x prediction entropy for the second angle (maximised).

    Args:
        dataloader: iterable of (X, y) batches.
        model: network called as `model(X, hyper_x=...)` and exposing `hyper_stack`.
        loss_fn: classification loss taking (pred, y).
        optimizer: optimizer stepping the parameters of `model`.
    """

    def summed_entropy(scores):
        # Entropy of the softmax distribution, summed over all entries.
        return (F.softmax(scores, dim=1) * (-1 * F.log_softmax(scores, dim=1))).sum()

    # validate() leaves the model in eval mode; switch back before every epoch
    # so that mode-dependent layers behave as in training.
    model.train()
    for X, y in tqdm(dataloader, desc='Training'):
        X, y = X.to(device), y.to(device)
        angle = random.uniform(0, 360)
        X = TF.rotate(X, angle)
        hyper_true = transform_angle(angle).to(device)

        # make prediction and minimize loss
        pred = model(X, hyper_x=hyper_true)
        loss = loss_fn(pred, y)

        # regularizer (cossim squared) in the beta space
        beta1 = model.hyper_stack(hyper_true)
        angle2 = random.uniform(0, 360)
        hyper_other = transform_angle(angle2).to(device)
        beta2 = model.hyper_stack(hyper_other)
        loss = loss + pow(cos(beta1, beta2), 2)

        # minimize entropy to the correct degree
        loss = loss + 0.01 * summed_entropy(pred)

        # maximize entropy to the wrong degree
        logits = model(X, hyper_x=hyper_other)
        loss = loss - 0.01 * summed_entropy(logits)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # scheduler.step()

def validate(dataloader, model, loss_fn):
    """Evaluate `model` on `dataloader`, rotating each batch by a random angle.

    The model is put into eval mode and receives the applied angle as hyper
    input. Prints and returns the accuracy (fraction of the dataset) and the
    loss averaged over batches. The angle shown in the printed line is the one
    drawn for the last batch.
    """
    model.eval()
    loss_total = 0
    n_correct = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            angle = random.uniform(0, 360)
            rotated = TF.rotate(images, angle)

            pred = model(rotated, hyper_x=transform_angle(angle).to(device))
            loss_total += loss_fn(pred, labels).item()
            hits = pred.argmax(1) == labels
            n_correct += hits.type(torch.float).sum().item()
    avg_loss = loss_total / len(dataloader)
    accuracy = n_correct / len(dataloader.dataset)
    print(f"Test with angle={angle}: Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f}")
    return accuracy, avg_loss

# Alternate one training epoch with one validation pass.
for t in range(epochs):
    print("=================", f" Epoch: {t + 1} ", "=================", sep="\n")
    train(train_loader, model, loss_fn, optimizer)
    test_acc, test_loss = validate(test_loader, model, loss_fn)
print("Done!")

 Epoch: 1 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=204.7432889822147: Accuracy: 65.8%, Avg loss: 0.920305
 Epoch: 2 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=24.165027934953788: Accuracy: 69.6%, Avg loss: 0.822181
 Epoch: 3 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=354.5426166719326: Accuracy: 71.2%, Avg loss: 0.775258
 Epoch: 4 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=288.9386892720821: Accuracy: 72.7%, Avg loss: 0.746091
 Epoch: 5 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=99.75921225599285: Accuracy: 73.0%, Avg loss: 0.728768
 Epoch: 6 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=222.0636839678087: Accuracy: 73.5%, Avg loss: 0.740129
 Epoch: 7 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=216.610319994571: Accuracy: 75.0%, Avg loss: 0.699872
 Epoch: 8 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=27.801479122131127: Accuracy: 74.9%, Avg loss: 0.697307
 Epoch: 9 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=47.51158727936344: Accuracy: 75.5%, Avg loss: 0.700243
 Epoch: 10 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=88.70100223581426: Accuracy: 75.5%, Avg loss: 0.684158
 Epoch: 11 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=344.6481807000533: Accuracy: 76.0%, Avg loss: 0.682243
 Epoch: 12 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=329.40161133742237: Accuracy: 76.5%, Avg loss: 0.666490
 Epoch: 13 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=314.0351226810208: Accuracy: 76.3%, Avg loss: 0.680499
 Epoch: 14 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=134.03683238058522: Accuracy: 76.7%, Avg loss: 0.652831
 Epoch: 15 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=206.33701407032964: Accuracy: 77.5%, Avg loss: 0.644403
 Epoch: 16 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=81.75208624376626: Accuracy: 77.1%, Avg loss: 0.653079
 Epoch: 17 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=76.87449002041178: Accuracy: 77.2%, Avg loss: 0.652038
 Epoch: 18 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=188.06907535247493: Accuracy: 77.3%, Avg loss: 0.653159
 Epoch: 19 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=315.47213281791255: Accuracy: 77.8%, Avg loss: 0.635918
 Epoch: 20 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=97.2748884816007: Accuracy: 77.7%, Avg loss: 0.646344
 Epoch: 21 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=340.9587874488409: Accuracy: 77.8%, Avg loss: 0.635467
 Epoch: 22 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=188.33069585164463: Accuracy: 77.5%, Avg loss: 0.648218
 Epoch: 23 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=207.05321636870573: Accuracy: 78.0%, Avg loss: 0.629977
 Epoch: 24 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=139.5196686243408: Accuracy: 78.1%, Avg loss: 0.631241
 Epoch: 25 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=203.06399964644922: Accuracy: 78.2%, Avg loss: 0.633227
 Epoch: 26 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=146.5300886605258: Accuracy: 78.0%, Avg loss: 0.629009
 Epoch: 27 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=205.68029840645374: Accuracy: 78.6%, Avg loss: 0.622984
 Epoch: 28 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=176.2956193659258: Accuracy: 78.3%, Avg loss: 0.635503
 Epoch: 29 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=19.994263359691743: Accuracy: 78.4%, Avg loss: 0.632026
 Epoch: 30 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=302.3854164920045: Accuracy: 78.5%, Avg loss: 0.624561
 Epoch: 31 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=218.56986942592113: Accuracy: 78.7%, Avg loss: 0.627660
 Epoch: 32 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=348.5468493629119: Accuracy: 78.4%, Avg loss: 0.634805
 Epoch: 33 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=189.9237852018357: Accuracy: 78.8%, Avg loss: 0.612811
 Epoch: 34 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=228.42813272735083: Accuracy: 79.2%, Avg loss: 0.609893
 Epoch: 35 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=196.98657634094474: Accuracy: 78.6%, Avg loss: 0.621140
 Epoch: 36 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=236.8959123186471: Accuracy: 78.9%, Avg loss: 0.611115
 Epoch: 37 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=197.2610868388896: Accuracy: 78.8%, Avg loss: 0.615486
 Epoch: 38 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=6.434925133231366: Accuracy: 78.7%, Avg loss: 0.629720
 Epoch: 39 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=63.784692637769496: Accuracy: 79.2%, Avg loss: 0.605861
 Epoch: 40 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=86.10215300137764: Accuracy: 78.6%, Avg loss: 0.621939
 Epoch: 41 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=222.64815366633349: Accuracy: 79.2%, Avg loss: 0.601240
 Epoch: 42 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=346.64581885166206: Accuracy: 79.3%, Avg loss: 0.621448
 Epoch: 43 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=245.67280141979467: Accuracy: 78.7%, Avg loss: 0.619204
 Epoch: 44 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=299.96325593769615: Accuracy: 79.5%, Avg loss: 0.597991
 Epoch: 45 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=78.52790892387186: Accuracy: 79.2%, Avg loss: 0.606973
 Epoch: 46 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=73.90856235767433: Accuracy: 79.4%, Avg loss: 0.606034
 Epoch: 47 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=327.00060305445623: Accuracy: 79.2%, Avg loss: 0.616675
 Epoch: 48 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=123.49425052527964: Accuracy: 79.1%, Avg loss: 0.604402
 Epoch: 49 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=133.3983321527365: Accuracy: 79.4%, Avg loss: 0.594718
 Epoch: 50 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with angle=116.93875613785582: Accuracy: 79.5%, Avg loss: 0.594277
Done!


# Optimize in the alpha space

In [17]:
# Put the trained model into eval mode on the CPU
model.eval().cpu()

# Freeze all weights: nothing is trained during the alpha search
for weight in model.parameters():
    weight.requires_grad_(False)

In [18]:
# Rebuild the test loader for the alpha search; change batch_size here to test
# with a different batch size (for example, batch_size=1 ... takes long!)
batch_size = 64
test_loader = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)

In [19]:
from scipy import optimize

# function to minimize by the basin hopping algorithm
def f(z, *args):
    """Objective for the alpha search: summed prediction entropy at angle z.

    Args:
        z: search variable (a scalar or a length-1 array as passed by SciPy).
            It is mapped to an angle in degrees in [-180, 180) via
            ((1 + z) * 180) % 360 - 180.
        *args: args[0] is the image batch X.

    Returns:
        The entropy of the model's softmax predictions for X, summed over all
        entries, as a Python float.
    """
    # SciPy hands the variable over as a 1-element array; convert it explicitly
    # rather than relying on implicit array-to-scalar conversion in math.cos/sin.
    z = float(np.ravel(z)[0])
    angle = ((1 + z) * 180) % 360 - 180
    X = torch.as_tensor(args[0], dtype=torch.float32)
    # no_grad keeps the evaluation graph-free even if the weights are not frozen.
    with torch.no_grad():
        logits = model(X, hyper_x=transform_angle(angle))
        b = (F.softmax(logits, dim=1)) * (-1 * F.log_softmax(logits, dim=1))  # entropy
    return b.sum().item()

# given a batch of images find the rotation angle alpha
def findalpha(X, niter=100, T=5):
    """Estimate the angle (degrees, in [-180, 180)) that minimises `f` for batch X.

    Runs SciPy's basin hopping from z = 0 (i.e. angle 0) with BFGS as the local
    minimiser and maps the best z back to an angle.

    Args:
        X: image batch forwarded to `f` as its extra argument.
        niter: number of basin-hopping iterations.
        T: basin-hopping temperature for the accept/reject criterion.

    Returns:
        The estimated angle in degrees.
    """
    # Basin hopping algorithm
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.basinhopping.html
    # `args` must be a tuple; wrap the batch explicitly.
    # NOTE(review): BFGS estimates the gradient by finite differences with
    # SciPy's default step while f evaluates the model in float32; that step
    # may be too small to register - consider a larger options={"eps": ...}.
    minimizer_kwargs = {"method": "BFGS", "args": (X,)}
    res = optimize.basinhopping(f, 0.0, minimizer_kwargs=minimizer_kwargs, niter=niter, T=T)

    alpha = ((1 + res.x[0]) * 180) % 360 - 180
    # print("alpha estimate = ", alpha)     # obtained minimum
    # print("fun = ", res.fun)              # function value at minimum
    return alpha

# Accuracy of the model when the rotation angle is not given but estimated by
# findalpha(). Correct predictions are counted directly, so the result does
# not depend on any batch-size variable.
n_correct = 0
for X, y in tqdm(test_loader, desc='Testing alpha search'):
    angle = random.uniform(-180, 180)
    # print("=============")
    # print("alpha true = ", angle)
    X = TF.rotate(X, angle)

    alpha = findalpha(X)

    # compute model prediction with the estimated alpha
    with torch.no_grad():
        logits = model(X, hyper_x=transform_angle(alpha))
    # y is the true label --> count correct predictions
    n_correct += (logits.argmax(1) == y).sum().item()

result = n_correct / len(test_loader.dataset)
print(f"Test accuracy: {(100*result):>0.1f}%")

print("Done!")

Testing alpha search:   0%|          | 0/157 [00:00<?, ?it/s]



Test accuracy: 79.7%
Done!


# One4All

The code below trains a One4All baseline so that its performance can be compared with SCN and SCN+findalpha().

In [20]:
# Device selection, repeated from the setup cell: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MLPB(nn.Module):
    """Plain MLP baseline: flatten -> linear -> ReLU, hidden blocks, classifier.

    Args:
        n_layers: `n_layers - 1` hidden (linear -> batch norm -> ReLU) blocks
            are inserted between the first layer and the classifier.
        n_units: width of the hidden layers.
        n_channels: image channels; the first layer expects
            `16 * 16 * n_channels` flattened features.
        n_classes: number of output logits.
    """

    def __init__(self, n_layers, n_units, n_channels, n_classes=10):
        super().__init__()
        stack = [nn.Flatten(), nn.Linear(16 * 16 * n_channels, n_units), nn.ReLU()]
        for _ in range(n_layers - 1):
            stack += [nn.Linear(n_units, n_units), nn.BatchNorm1d(n_units), nn.ReLU()]
        stack.append(nn.Linear(n_units, n_classes))
        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Return the class logits for the image batch `x`."""
        return self.linear_relu_stack(x)

# Build the One4All baseline from the values in the configuration cell and
# report its size. The model is moved to `device` once, at construction.
model_one4all = MLPB(nlayers, width, nchannels, n_classes=nclasses).to(device)
print(model_one4all)
print(count_model_parameters(model_one4all))

MLPB(
  (linear_relu_stack): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=256, out_features=32, bias=True)
    (2): ReLU()
    (3): Linear(in_features=32, out_features=10, bias=True)
  )
)
8554


MLPB(
  (linear_relu_stack): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=256, out_features=32, bias=True)
    (2): ReLU()
    (3): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [21]:
loss_fn = nn.CrossEntropyLoss()
# Pass the learning rate from the configuration cell explicitly so that
# changing `lr` there actually affects training.
optimizer = torch.optim.Adam(model_one4all.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0)

def train(dataloader, model_one4all, loss_fn, optimizer):
    """Train `model_one4all` for one pass over `dataloader` on rotated batches.

    Every batch is rotated by a single random angle in [0, 360] degrees; the
    model only sees the rotated images.

    Args:
        dataloader: iterable of (X, y) batches.
        model_one4all: network called as `model_one4all(X)`.
        loss_fn: classification loss taking (pred, y).
        optimizer: optimizer stepping the parameters of `model_one4all`.
    """
    # validate() leaves the model in eval mode; switch back before every epoch
    # so that batch-norm layers (present when n_layers > 1) train normally.
    model_one4all.train()
    for X, y in tqdm(dataloader, desc='Training'):
        X, y = X.to(device), y.to(device)
        angle = random.uniform(0, 360)
        X = TF.rotate(X, angle)

        pred = model_one4all(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # scheduler.step()

def validate(dataloader, model_one4all, loss_fn):
    """Evaluate `model_one4all` on `dataloader`, rotating each batch randomly.

    The model is put into eval mode. Prints and returns the accuracy (fraction
    of the dataset) and the loss averaged over batches.
    """
    model_one4all.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            angle = random.uniform(0, 360)
            X = TF.rotate(X, angle)

            pred = model_one4all(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= len(dataloader)
    correct /= len(dataloader.dataset)
    # The value applied by TF.rotate is a rotation angle, so label it as such
    # (it is the angle drawn for the last batch).
    print(f"Test with angle={angle}: Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")
    return correct, test_loss

# Alternate one training epoch with one validation pass for the baseline.
for t in range(epochs):
    print("=================", f" Epoch: {t + 1} ", "=================", sep="\n")
    train(train_loader, model_one4all, loss_fn, optimizer)
    test_acc, test_loss = validate(test_loader, model_one4all, loss_fn)
print("Done!")

 Epoch: 1 


Training:   0%|          | 0/938 [00:00<?, ?it/s]



Test with translation=148.6699231021879: Accuracy: 56.8%, Avg loss: 1.246317
 Epoch: 2 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=274.2432721016044: Accuracy: 59.7%, Avg loss: 1.123733
 Epoch: 3 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=246.58986518560596: Accuracy: 61.3%, Avg loss: 1.082704
 Epoch: 4 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=46.55002004759874: Accuracy: 62.8%, Avg loss: 1.049550
 Epoch: 5 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=288.337645350292: Accuracy: 63.0%, Avg loss: 1.043450
 Epoch: 6 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=53.84745011025287: Accuracy: 64.1%, Avg loss: 1.012792
 Epoch: 7 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=266.4685200405625: Accuracy: 65.3%, Avg loss: 0.994337
 Epoch: 8 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=356.76022124100376: Accuracy: 65.9%, Avg loss: 0.978645
 Epoch: 9 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=336.30549577246614: Accuracy: 66.5%, Avg loss: 0.953063
 Epoch: 10 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=343.3912214357059: Accuracy: 65.9%, Avg loss: 0.949438
 Epoch: 11 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=32.06808372379711: Accuracy: 66.5%, Avg loss: 0.947336
 Epoch: 12 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=333.32990199389985: Accuracy: 66.7%, Avg loss: 0.930963
 Epoch: 13 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=243.2936133929419: Accuracy: 66.6%, Avg loss: 0.941560
 Epoch: 14 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=246.48310415211364: Accuracy: 67.3%, Avg loss: 0.923504
 Epoch: 15 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=65.29770560937592: Accuracy: 67.8%, Avg loss: 0.907805
 Epoch: 16 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=359.7874142220049: Accuracy: 66.5%, Avg loss: 0.925322
 Epoch: 17 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=185.08470416706677: Accuracy: 66.7%, Avg loss: 0.918974
 Epoch: 18 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=252.63890319289266: Accuracy: 67.5%, Avg loss: 0.906269
 Epoch: 19 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=134.55842844879322: Accuracy: 67.8%, Avg loss: 0.899043
 Epoch: 20 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=33.25624478475818: Accuracy: 67.7%, Avg loss: 0.894172
 Epoch: 21 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=192.37070218562383: Accuracy: 68.2%, Avg loss: 0.883708
 Epoch: 22 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=45.43742899190275: Accuracy: 68.4%, Avg loss: 0.885374
 Epoch: 23 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=51.49079910774365: Accuracy: 68.0%, Avg loss: 0.894079
 Epoch: 24 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=50.55997244710361: Accuracy: 67.8%, Avg loss: 0.892371
 Epoch: 25 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=240.7784699882861: Accuracy: 67.5%, Avg loss: 0.890058
 Epoch: 26 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=289.7064292153568: Accuracy: 67.7%, Avg loss: 0.892707
 Epoch: 27 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=268.9017175383509: Accuracy: 68.1%, Avg loss: 0.884577
 Epoch: 28 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=34.47672291440474: Accuracy: 68.5%, Avg loss: 0.882855
 Epoch: 29 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=45.63869398337063: Accuracy: 68.7%, Avg loss: 0.873787
 Epoch: 30 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=87.86993594832612: Accuracy: 68.3%, Avg loss: 0.882127
 Epoch: 31 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=166.6944247805131: Accuracy: 69.0%, Avg loss: 0.859663
 Epoch: 32 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=182.67972919278313: Accuracy: 68.9%, Avg loss: 0.866083
 Epoch: 33 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=80.20598449257301: Accuracy: 69.4%, Avg loss: 0.852871
 Epoch: 34 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=33.80901475424717: Accuracy: 68.9%, Avg loss: 0.875030
 Epoch: 35 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=268.97137313848435: Accuracy: 68.9%, Avg loss: 0.857375
 Epoch: 36 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=211.51461270828372: Accuracy: 69.3%, Avg loss: 0.862639
 Epoch: 37 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=273.5854858759137: Accuracy: 69.1%, Avg loss: 0.861114
 Epoch: 38 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=324.803931734559: Accuracy: 68.7%, Avg loss: 0.862220
 Epoch: 39 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=13.69377232805373: Accuracy: 69.3%, Avg loss: 0.853553
 Epoch: 40 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=192.06951493791675: Accuracy: 69.9%, Avg loss: 0.850968
 Epoch: 41 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=181.5541099737185: Accuracy: 69.5%, Avg loss: 0.853157
 Epoch: 42 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=189.65116496790776: Accuracy: 69.3%, Avg loss: 0.850577
 Epoch: 43 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=142.65409864499347: Accuracy: 69.4%, Avg loss: 0.853382
 Epoch: 44 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=160.35602804027454: Accuracy: 69.4%, Avg loss: 0.846089
 Epoch: 45 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=309.1857414315375: Accuracy: 69.9%, Avg loss: 0.846480
 Epoch: 46 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=10.780590261302393: Accuracy: 69.6%, Avg loss: 0.846164
 Epoch: 47 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=267.8723213181026: Accuracy: 69.4%, Avg loss: 0.856458
 Epoch: 48 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=173.41471443397316: Accuracy: 70.0%, Avg loss: 0.835227
 Epoch: 49 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=201.4077664315325: Accuracy: 69.9%, Avg loss: 0.846756
 Epoch: 50 


Training:   0%|          | 0/938 [00:00<?, ?it/s]

Test with translation=246.5534410344573: Accuracy: 69.7%, Avg loss: 0.839129
Done!
