In [11]:
!pip install portalocker
!pip install torchmetrics



In [12]:
import argparse
import logging
import time

import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import DATASETS
from torchtext.prototype.transforms import load_sp_model, PRETRAINED_SP_MODEL, SentencePieceTokenizer
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext.vocab import GloVe
from tqdm import tqdm

torch.autograd.set_detect_anomaly(True)

FILL = '_FILL_'

### Information
- torchtext repo: https://github.com/pytorch/text/tree/main/torchtext
- torchtext documentation: https://pytorch.org/text/stable/index.html

### Constants

In [13]:
DATASET = "AG_NEWS"
DATA_DIR = ".data"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_DIM = 300
LR = 1.0
BATCH_SIZE = 128
NUM_EPOCHS = 5
PADDING_VALUE = 0
PADDING_IDX = PADDING_VALUE

In [14]:
DEVICE

'cuda'

### Get the tokenizer
- Use the WordLevel tokenizer.


In [15]:
# Get basic tokenizer
basic_english_tokenizer = get_tokenizer('basic_english')

In [16]:
basic_english_tokenizer("This is some text ...")

['this', 'is', 'some', 'text', '.', '.', '.']

In [17]:
# Needed later; set this to the tokenizer above
TOKENIZER = basic_english_tokenizer

### Get the data and get the vocabulary

In [18]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield TOKENIZER(text)

In [19]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
VOCAB = build_vocab_from_iterator(yield_tokens(train_iter), specials=('<pad>', '<unk>'))

# Make the default index the same as that of the unk_token.
VOCAB.set_default_index(VOCAB['<unk>'])
VOCAB_SIZE = len(VOCAB)

### Get GLOVE embeddings

In [20]:
# Get the GloVe embeddings; this will be slow
GLOVE = GloVe()#FILL

In [21]:
len(GLOVE)
#GLOVE.vectors.shape

2196017

### Helper functions

In [22]:
def text_pipeline(text):
    return VOCAB(TOKENIZER(text))

def label_pipeline(label):
    return int(label) - 1

Nice link on collate_fn and DataLoader in PyTorch: https://python.plainenglish.io/understanding-collate-fn-in-pytorch-f9d1742647d3

In [23]:
# As before, loop through the batch and transform into tensors
"""def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        FILL

        # Return a list of ints
        processed_text = FILL
        # Append to text_list
        FILL

    # Pad and make into tensors as needed
    label_list = FILL
    text_list = FILL

    return label_list.to(DEVICE), text_list.to(DEVICE)"""
def collate_batch(batch):
    label_list, text_list = [], []
    # For each label and text in a batch, transform to a tensor of labels
    # And a tensor of tokens for each word
    # You need text_pipeline here and
    for (_label, _text) in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        label_list.append(label_pipeline(_label))
        # Return a list of ints
        # This basically returns a tensor of the token ids and then puts them into text_list
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text.clone().detach())

    # Make a tensor from the label_list
    label_tensor = torch.tensor(label_list)
    # Pad the sequence list and return a tensor
    text_tensor = pad_sequence(text_list)

    return label_tensor.to(DEVICE), text_tensor.to(DEVICE)

In [24]:
def batch_test():

    x = list([(label,_) for (label, _) in train_iter])
    x = x[0:15]
    label_list, text_list = [], []
    # For each label and text in a batch, transform to a tensor of labels
    # And a tensor of tokens for each word
    # You need text_pipeline here and
    #print(x)
    for (_label, _text) in x:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        label_list.append(label_pipeline(_label))
        # Return a list of ints
        # This basically returns a tensor of the token ids and then puts them into text_list
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)

        text_list.append(processed_text.clone().detach())

    # Make a tensor from the label_list
    label_tensor = torch.tensor(label_list)
    # Pad the sequence list and return a tensor
    text_tensor = pad_sequence(text_list)

    return label_tensor, text_tensor

batch_test()

(tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 tensor([[  432, 15875,    59,  ..., 38488,   203,   273],
         [  426,  1073,     9,  ...,   411,  1644,  1198],
         [    2,   855,   348,  ...,   394,  1605,  8727],
         ...,
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0]]))

### Get the data

In [25]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
num_class = len(set([label for (label, _) in train_iter]))
# What are the classes?
print(f"The number of classes is {num_class} ...")

The number of classes is 4 ...


In [26]:
embedding = nn.Embedding(
            len(VOCAB),
            EMBED_DIM,
            padding_idx=PADDING_IDX
        )
embedding.to(DEVICE)

Embedding(95812, 300, padding_idx=0)

### Set up the model

Good reference on this type of model
- CNN for Sentence Classification: https://arxiv.org/pdf/1408.5882.pdf

You can build CNN models with either CNN1d or CNN2d.

In [27]:
class CNN1dTextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        num_class,
        embed_dim = 300,
        use_pretrained = True,
        fine_tune_embeddings = True,
        debug = False
    ):

        super(CNN1dTextClassificationModel, self).__init__()

        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=PADDING_IDX
        )
        self.embedding.to(DEVICE)

        if use_pretrained:
            # Set the embeddings to not requiring gradients since we'll try and modify
            self.embedding.weight.requires_grad = False
            for i in range(vocab_size):
                # Get the token for the index i
                token = VOCAB.lookup_token(i)
                # Modify the embedding for index i by the embedding for that token ; How exacty do we modify this emebedding ?
                # Do this only if token is in the stoi dictionary for GLOVE
                if token in GLOVE.stoi:
                  self.embedding.weight[i,:] = GLOVE.get_vecs_by_tokens(TOKENIZER(token))

            self.embedding.weight.requires_grad = True#FILL
        else:
            # Otherwise, initialize the weights as specified below
            self.init_weights()

        # If weights do not get changed, turn off gradients for the GloVe embeddings
        if not fine_tune_embeddings:
            self.embedding.weight.requires_grad = False

        # Define a Conv1d layer that collapses all the channels and does not collapse the time dimension ; does the imply that dilation remains at size 1?
        self.cnn1 = nn.Conv1d(in_channels=EMBED_DIM, out_channels=1, kernel_size=1, stride=1)

        # Define 3 Conv1d layers each having 1 filter and kernel sizes 2, 3 and 4
        self.cnn2 = nn.Conv1d(in_channels=EMBED_DIM, out_channels=1, kernel_size=2, stride=1)
        self.cnn3 = nn.Conv1d(in_channels=EMBED_DIM, out_channels=1, kernel_size=3, stride=1)
        self.cnn4 = nn.Conv1d(in_channels=EMBED_DIM, out_channels=1, kernel_size=4, stride=1)  # figure out in_channels like depending on the shape we expect to get in teh forward algorithm
        """
        self.cnn2 = nn.Conv1d(in_channels=1 out_channels=1, kernel_size=2, stride=1)
        self.cnn3 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=1)
        self.cnn4 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=4, stride=1)

        """

        # A linear map from some dimensions to num_class (you need to figure it out)

        self.fc = nn.Linear(3, num_class, device = DEVICE) #check
        #torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)
        self.debug = debug

        """"Why dont we have a self.maxpooling1 set to nn.Maxpool1d(kernel_size= <somesize>)"""

        self.maxpool1 = nn.MaxPool1d(kernel_size=1, stride=1)
        self.maxpool2 = nn.MaxPool1d(kernel_size=2,stride=1)
        self.maxpool3 = nn.MaxPool1d(kernel_size=3,stride=1)
        self.maxpool4 = nn.MaxPool1d(kernel_size=4,stride=1)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.relu3 = nn.ReLU()


    def init_weights(self):
        initrange = 0.5
        # Initialize the embedding weight matrix to uniform between the [-0.5, 0.5]
        self.weight.data.uniform_(-initrange, initrange)
        # Initialize the weight matrix of fc to uniform between the [-0.5, 0.5]
        self.fc.weight.data.uniform_(-initrange, initrange)
        # Initialize the bias for fc to zero
        self.fc.bias.data.zero_()

    # B = batch_size, L = sequence length, D = vector dimension
    def forward(self, text):

        # B X L X D
        # Get the embeddings for the text passed in
        embedded = self.embedding(text)

        if self.debug:
            print('embedding', embedded.shape)
        #print(embedded.shape)
        # B X D X L
        # Transpose the embedding above as needed
        embedded = torch.transpose(embedded,0,1)
        #print(embedded.shape)
        embedded = torch.transpose(embedded,1,2)
        #print(embedded.shape)
        # B X 1 X L
        # Pass through cnn1
        cnn1 = self.cnn1(embedded)
        if self.debug:
            print('cnn1', cnn1.shape)

        # B X 1
        # Take Max pooling over time
        #max1 = self.maxpool1(cnn1[:,:]) drop a dimension why would we do this how does it help us get a prediction
        cnn1 = self.maxpool1(cnn1)
        if self.debug:
          print('cnn1 after max pool', cnn1.shape)
        #print('cnn1 after max pool', cnn1.shape)
        # B X 1 X L - 1
        # Pass through cnn2 and add a RELU
        cnn2 = self.cnn2(embedded)
        if self.debug:
            print('cnn2', cnn2.shape) # define relu1
        cnn2 = self.relu1(cnn2)
        if self.debug:
            print('cnn2', cnn2.shape)
        # B X 1 X L - 2
        # Pass through cnn3 and add a RELU
        cnn3 = self.cnn3(embedded)
        if self.debug:
            print('cnn3', cnn3.shape) #define relu 2
        cnn3 = self.relu2(cnn3)
        if self.debug:
            print('cnn3', cnn3.shape)
        # B X 1 X L - 3
        # Pass through cnn4 and add a RELU
        cnn4 = self.cnn4(embedded)#FILL
        if self.debug:
            print('cnn4', cnn4.shape) #define relu 3
        cnn4 = self.relu3(cnn4)
        if self.debug:
            print('cnn4', cnn4.shape)

        # B X 1 in all cases
        # Apply max pooling to each of cnn2, cnn3 and cnn4 #define maxpooling for cnn2, cnn3, and cnn4
        cnn2 = self.maxpool2(cnn2) #FILL
        cnn3 = self.maxpool3(cnn3)#FILL
        cnn4 = self.maxpool4(cnn4) #FILL

        # B X 1 in all cases
        # Apply max pooling over time
        if self.debug:
            print('cnn2 after max', cnn2.shape)

        # Add to each cnn2, 3, 4 a skip connection to cnn1 and average the results
        #What is a skip connection
        cnn2 = (cnn2 + cnn1[:, :, :-2]).mean(axis=2)#FILL The main question is why we are doing this skip step
        cnn3 = (cnn3 + + cnn1[:, :, :-4]).mean(axis=2)#FILL
        cnn4 = (cnn4 + cnn1[:, :, :-6]).mean(axis=2)#FILL lets start by reviewing what exactly skip connection
        if self.debug:
            print('cnn2 after skip connection', cnn2.shape)
        # B X 3
        # Concatenate the above
        cnn_concat = torch.concat((cnn2, cnn3, cnn4), 1) #FILL #concat the maxpooling over [cnn2, cnn3, cnn4 ] apparently they are 3 column vectors with 128 entries or BATCH_SIZE entries 128 x 3
        if self.debug:
            print('cnn concat', cnn_concat.shape)
            # Set the debug to False after the first forward pass
            self.debug = False

        # Pass through an appropriate Linear layer to get the right dimensions needed
        out = self.fc(cnn_concat)#FILL
        #I think it means passing it through self,fc

        return out

### Set up the model

In [28]:
# If this is True, we will initialize the Embedding layer with GLOVE
USE_PRETRAINED = True,

# If this is True, we will allow for gradient updates on the nn.Embedding layer
FINE_TUNE_EMBEDDINGS = True

# Set the loss appropriately
criterion = nn.CrossEntropyLoss()

model = CNN1dTextClassificationModel(len(VOCAB), num_class,
        embed_dim = 300,
        use_pretrained = USE_PRETRAINED,
        fine_tune_embeddings = FINE_TUNE_EMBEDDINGS,
        debug = False).to(DEVICE)

# Set the optimizer to SGD
# Add an L2 regularizer of 0.00001
optimizer = torch.optim.SGD(model.parameters(),lr=LR)


# Set the scheduler to StepLR with gamma=0.1 and step_size = 1.0
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma = 0.1)
# study more deeply what the scheduler actually does for optimization steps

### Set up the data

In [29]:
train_iter, test_iter = DATASETS[DATASET]()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])


train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_batch
)

### Train the model

In [30]:
def train(dataloader, model, optimizer, criterion, epoch):
    model.train()
    total_acc, total_count = 0, 0
    total_loss, total_batches = 0.0, 0
    total_zero_gradients_percentage = []
    log_interval = 10

    for idx, (label, text) in tqdm(enumerate(dataloader)):
        # Set gradients to zero
        optimizer.zero_grad()

        # Get the predictions
        predicted_label = model(text)

        # Get the loss
        loss = criterion(predicted_label, label) #what loss is used for this #FILL

        # Do back propagation and get the gradients
        #FILL
        loss.backward()

        # Get the loss per batch and the number of batches
        with torch.no_grad():
            total_loss += loss.item()#FILL
            total_batches += 1#FILL

        # Loop through all the parameters
        # Specifically, for this batch, get the percentage of zero gradients across all parameters
        # Append this to the list above which will print out the total every 10 batches
        total_nonzero_gradients = 0.0
        total_param_count = 0
        #print #FILL
        for x in model.parameters():
          if x.requires_grad:
            total_nonzero_gradients += torch.count_nonzero(x.grad)
            total_param_count += x.grad.numel()
        # Append to total_zero_gradients_percentage
        total_zero_gradients_percentage.append(((total_param_count-total_nonzero_gradients)/total_param_count))#FILL


        # Clip the gradient at ?? Should we use 10.0 or 0.1 with the learning rate we picked and the default notebook setting?
        # Use the above loop to help you figure this out
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)#FILL

        # Do an optimization step
        optimizer.step()#FILL

        # Get the accuracy
        total_acc += (predicted_label.argmax(1) == label).sum().item()#FILL
        total_count += label.size(0) #FILL

        # Log results
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f} "
                "| loss {:8.3f} "
                "| zero gradients percentage {:8.3f}".format(
                    epoch, idx,
                    len(dataloader),
                    total_acc / total_count,
                    total_loss / total_batches,
                    torch.tensor(total_zero_gradients_percentage).mean().item()
                    )
            )
            # Reset variables as needed
            total_acc, total_count = 0, 0
            total_loss, total_batches = 0.0, 0
            total_zero_gradients = []

In [31]:
def evaluate(dataloader, model, criterion):
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0.0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(input=predicted_label, target=label)
            total_acc += (predicted_label.argmax(1) == label).sum().item() #FILL
            total_count +=  label.size(0) #FILL
            total_loss += loss.item()#FILL
    return total_acc / total_count, total_loss / total_count

In [33]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val, loss_val = evaluate(valid_dataloader, model, criterion)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch,
            time.time() - epoch_start_time,
            accu_val,
            loss_val
            )
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test, loss_test = evaluate(test_dataloader, model, criterion)
print("test accuracy {:8.3f}".format(accu_test))
print("test loss {:8.3f}".format(loss_test))

13it [00:00, 19.60it/s]

| epoch   1 |    10/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.978


25it [00:01, 19.66it/s]

| epoch   1 |    20/  891 batches | accuracy    0.906 | loss    0.312 | zero gradients percentage    0.978


33it [00:01, 20.14it/s]

| epoch   1 |    30/  891 batches | accuracy    0.899 | loss    0.318 | zero gradients percentage    0.978


43it [00:02, 20.49it/s]

| epoch   1 |    40/  891 batches | accuracy    0.875 | loss    0.385 | zero gradients percentage    0.978


55it [00:02, 20.59it/s]

| epoch   1 |    50/  891 batches | accuracy    0.889 | loss    0.348 | zero gradients percentage    0.978


64it [00:03, 19.99it/s]

| epoch   1 |    60/  891 batches | accuracy    0.888 | loss    0.344 | zero gradients percentage    0.978


75it [00:03, 20.09it/s]

| epoch   1 |    70/  891 batches | accuracy    0.895 | loss    0.352 | zero gradients percentage    0.978


84it [00:04, 19.42it/s]

| epoch   1 |    80/  891 batches | accuracy    0.881 | loss    0.374 | zero gradients percentage    0.978


93it [00:04, 19.68it/s]

| epoch   1 |    90/  891 batches | accuracy    0.887 | loss    0.329 | zero gradients percentage    0.978


104it [00:05, 20.00it/s]

| epoch   1 |   100/  891 batches | accuracy    0.877 | loss    0.390 | zero gradients percentage    0.978


113it [00:05, 20.48it/s]

| epoch   1 |   110/  891 batches | accuracy    0.885 | loss    0.361 | zero gradients percentage    0.978


125it [00:06, 19.94it/s]

| epoch   1 |   120/  891 batches | accuracy    0.890 | loss    0.372 | zero gradients percentage    0.978


133it [00:06, 19.46it/s]

| epoch   1 |   130/  891 batches | accuracy    0.886 | loss    0.342 | zero gradients percentage    0.978


144it [00:07, 19.90it/s]

| epoch   1 |   140/  891 batches | accuracy    0.891 | loss    0.354 | zero gradients percentage    0.978


154it [00:07, 20.35it/s]

| epoch   1 |   150/  891 batches | accuracy    0.896 | loss    0.350 | zero gradients percentage    0.978


165it [00:08, 20.13it/s]

| epoch   1 |   160/  891 batches | accuracy    0.878 | loss    0.391 | zero gradients percentage    0.978


174it [00:08, 20.71it/s]

| epoch   1 |   170/  891 batches | accuracy    0.883 | loss    0.357 | zero gradients percentage    0.978


183it [00:09, 19.93it/s]

| epoch   1 |   180/  891 batches | accuracy    0.892 | loss    0.375 | zero gradients percentage    0.978


194it [00:09, 20.28it/s]

| epoch   1 |   190/  891 batches | accuracy    0.902 | loss    0.313 | zero gradients percentage    0.978


203it [00:10, 20.57it/s]

| epoch   1 |   200/  891 batches | accuracy    0.892 | loss    0.351 | zero gradients percentage    0.978


214it [00:10, 16.59it/s]

| epoch   1 |   210/  891 batches | accuracy    0.897 | loss    0.316 | zero gradients percentage    0.978


222it [00:11, 14.28it/s]

| epoch   1 |   220/  891 batches | accuracy    0.891 | loss    0.352 | zero gradients percentage    0.978


232it [00:12, 15.01it/s]

| epoch   1 |   230/  891 batches | accuracy    0.875 | loss    0.410 | zero gradients percentage    0.978


244it [00:12, 16.30it/s]

| epoch   1 |   240/  891 batches | accuracy    0.900 | loss    0.314 | zero gradients percentage    0.978


254it [00:13, 15.99it/s]

| epoch   1 |   250/  891 batches | accuracy    0.908 | loss    0.303 | zero gradients percentage    0.978


264it [00:14, 15.94it/s]

| epoch   1 |   260/  891 batches | accuracy    0.896 | loss    0.341 | zero gradients percentage    0.978


274it [00:14, 14.10it/s]

| epoch   1 |   270/  891 batches | accuracy    0.891 | loss    0.353 | zero gradients percentage    0.978


282it [00:15, 14.61it/s]

| epoch   1 |   280/  891 batches | accuracy    0.878 | loss    0.351 | zero gradients percentage    0.978


293it [00:16, 18.38it/s]

| epoch   1 |   290/  891 batches | accuracy    0.884 | loss    0.373 | zero gradients percentage    0.978


303it [00:16, 18.72it/s]

| epoch   1 |   300/  891 batches | accuracy    0.888 | loss    0.374 | zero gradients percentage    0.978


315it [00:17, 19.20it/s]

| epoch   1 |   310/  891 batches | accuracy    0.892 | loss    0.392 | zero gradients percentage    0.978


323it [00:17, 19.10it/s]

| epoch   1 |   320/  891 batches | accuracy    0.897 | loss    0.305 | zero gradients percentage    0.978


334it [00:18, 19.32it/s]

| epoch   1 |   330/  891 batches | accuracy    0.903 | loss    0.332 | zero gradients percentage    0.978


345it [00:18, 19.44it/s]

| epoch   1 |   340/  891 batches | accuracy    0.883 | loss    0.348 | zero gradients percentage    0.978


354it [00:19, 19.64it/s]

| epoch   1 |   350/  891 batches | accuracy    0.868 | loss    0.388 | zero gradients percentage    0.978


364it [00:19, 19.95it/s]

| epoch   1 |   360/  891 batches | accuracy    0.885 | loss    0.393 | zero gradients percentage    0.978


374it [00:20, 19.93it/s]

| epoch   1 |   370/  891 batches | accuracy    0.896 | loss    0.321 | zero gradients percentage    0.978


383it [00:20, 20.01it/s]

| epoch   1 |   380/  891 batches | accuracy    0.884 | loss    0.390 | zero gradients percentage    0.978


395it [00:21, 20.42it/s]

| epoch   1 |   390/  891 batches | accuracy    0.901 | loss    0.315 | zero gradients percentage    0.978


404it [00:21, 19.99it/s]

| epoch   1 |   400/  891 batches | accuracy    0.884 | loss    0.357 | zero gradients percentage    0.978


413it [00:22, 20.11it/s]

| epoch   1 |   410/  891 batches | accuracy    0.902 | loss    0.322 | zero gradients percentage    0.978


425it [00:22, 19.63it/s]

| epoch   1 |   420/  891 batches | accuracy    0.884 | loss    0.379 | zero gradients percentage    0.978


434it [00:23, 20.06it/s]

| epoch   1 |   430/  891 batches | accuracy    0.866 | loss    0.385 | zero gradients percentage    0.978


445it [00:23, 19.96it/s]

| epoch   1 |   440/  891 batches | accuracy    0.911 | loss    0.293 | zero gradients percentage    0.978


454it [00:24, 19.40it/s]

| epoch   1 |   450/  891 batches | accuracy    0.910 | loss    0.315 | zero gradients percentage    0.978


463it [00:24, 19.39it/s]

| epoch   1 |   460/  891 batches | accuracy    0.891 | loss    0.346 | zero gradients percentage    0.978


473it [00:25, 19.72it/s]

| epoch   1 |   470/  891 batches | accuracy    0.901 | loss    0.316 | zero gradients percentage    0.978


482it [00:25, 18.86it/s]

| epoch   1 |   480/  891 batches | accuracy    0.887 | loss    0.377 | zero gradients percentage    0.978


494it [00:26, 14.91it/s]

| epoch   1 |   490/  891 batches | accuracy    0.899 | loss    0.320 | zero gradients percentage    0.978


502it [00:27, 13.35it/s]

| epoch   1 |   500/  891 batches | accuracy    0.889 | loss    0.340 | zero gradients percentage    0.978


512it [00:27, 14.29it/s]

| epoch   1 |   510/  891 batches | accuracy    0.883 | loss    0.361 | zero gradients percentage    0.978


524it [00:28, 14.55it/s]

| epoch   1 |   520/  891 batches | accuracy    0.891 | loss    0.345 | zero gradients percentage    0.978


532it [00:29, 15.02it/s]

| epoch   1 |   530/  891 batches | accuracy    0.893 | loss    0.316 | zero gradients percentage    0.978


542it [00:29, 12.89it/s]

| epoch   1 |   540/  891 batches | accuracy    0.893 | loss    0.326 | zero gradients percentage    0.978


555it [00:30, 16.79it/s]

| epoch   1 |   550/  891 batches | accuracy    0.899 | loss    0.334 | zero gradients percentage    0.978


565it [00:31, 19.48it/s]

| epoch   1 |   560/  891 batches | accuracy    0.888 | loss    0.351 | zero gradients percentage    0.978


574it [00:31, 20.44it/s]

| epoch   1 |   570/  891 batches | accuracy    0.887 | loss    0.325 | zero gradients percentage    0.978


583it [00:32, 19.71it/s]

| epoch   1 |   580/  891 batches | accuracy    0.898 | loss    0.345 | zero gradients percentage    0.978


595it [00:32, 20.51it/s]

| epoch   1 |   590/  891 batches | accuracy    0.887 | loss    0.385 | zero gradients percentage    0.978


603it [00:33, 19.74it/s]

| epoch   1 |   600/  891 batches | accuracy    0.887 | loss    0.367 | zero gradients percentage    0.978


614it [00:33, 19.23it/s]

| epoch   1 |   610/  891 batches | accuracy    0.881 | loss    0.353 | zero gradients percentage    0.978


624it [00:34, 19.92it/s]

| epoch   1 |   620/  891 batches | accuracy    0.911 | loss    0.332 | zero gradients percentage    0.978


634it [00:34, 20.10it/s]

| epoch   1 |   630/  891 batches | accuracy    0.893 | loss    0.303 | zero gradients percentage    0.978


644it [00:35, 18.83it/s]

| epoch   1 |   640/  891 batches | accuracy    0.885 | loss    0.384 | zero gradients percentage    0.978


654it [00:35, 19.03it/s]

| epoch   1 |   650/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


665it [00:36, 19.69it/s]

| epoch   1 |   660/  891 batches | accuracy    0.880 | loss    0.348 | zero gradients percentage    0.978


673it [00:36, 18.54it/s]

| epoch   1 |   670/  891 batches | accuracy    0.894 | loss    0.327 | zero gradients percentage    0.978


685it [00:37, 19.19it/s]

| epoch   1 |   680/  891 batches | accuracy    0.898 | loss    0.329 | zero gradients percentage    0.978


694it [00:37, 19.61it/s]

| epoch   1 |   690/  891 batches | accuracy    0.905 | loss    0.295 | zero gradients percentage    0.978


705it [00:38, 19.55it/s]

| epoch   1 |   700/  891 batches | accuracy    0.892 | loss    0.340 | zero gradients percentage    0.978


713it [00:38, 20.11it/s]

| epoch   1 |   710/  891 batches | accuracy    0.899 | loss    0.300 | zero gradients percentage    0.978


724it [00:39, 19.23it/s]

| epoch   1 |   720/  891 batches | accuracy    0.895 | loss    0.324 | zero gradients percentage    0.978


733it [00:39, 19.11it/s]

| epoch   1 |   730/  891 batches | accuracy    0.898 | loss    0.316 | zero gradients percentage    0.978


744it [00:40, 18.93it/s]

| epoch   1 |   740/  891 batches | accuracy    0.890 | loss    0.330 | zero gradients percentage    0.978


752it [00:40, 14.34it/s]

| epoch   1 |   750/  891 batches | accuracy    0.895 | loss    0.339 | zero gradients percentage    0.978


762it [00:41, 14.16it/s]

| epoch   1 |   760/  891 batches | accuracy    0.882 | loss    0.371 | zero gradients percentage    0.978


774it [00:42, 14.35it/s]

| epoch   1 |   770/  891 batches | accuracy    0.886 | loss    0.424 | zero gradients percentage    0.978


782it [00:43, 13.71it/s]

| epoch   1 |   780/  891 batches | accuracy    0.903 | loss    0.304 | zero gradients percentage    0.978


792it [00:43, 13.49it/s]

| epoch   1 |   790/  891 batches | accuracy    0.878 | loss    0.377 | zero gradients percentage    0.978


802it [00:44, 13.54it/s]

| epoch   1 |   800/  891 batches | accuracy    0.893 | loss    0.346 | zero gradients percentage    0.978


814it [00:45, 14.53it/s]

| epoch   1 |   810/  891 batches | accuracy    0.898 | loss    0.314 | zero gradients percentage    0.978


825it [00:45, 19.68it/s]

| epoch   1 |   820/  891 batches | accuracy    0.884 | loss    0.350 | zero gradients percentage    0.978


834it [00:46, 19.65it/s]

| epoch   1 |   830/  891 batches | accuracy    0.878 | loss    0.381 | zero gradients percentage    0.978


844it [00:46, 19.93it/s]

| epoch   1 |   840/  891 batches | accuracy    0.909 | loss    0.317 | zero gradients percentage    0.978


854it [00:47, 19.42it/s]

| epoch   1 |   850/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


863it [00:47, 19.12it/s]

| epoch   1 |   860/  891 batches | accuracy    0.905 | loss    0.317 | zero gradients percentage    0.978


874it [00:48, 18.77it/s]

| epoch   1 |   870/  891 batches | accuracy    0.890 | loss    0.379 | zero gradients percentage    0.978


884it [00:48, 19.61it/s]

| epoch   1 |   880/  891 batches | accuracy    0.887 | loss    0.332 | zero gradients percentage    0.978


891it [00:49, 18.06it/s]


| epoch   1 |   890/  891 batches | accuracy    0.885 | loss    0.381 | zero gradients percentage    0.978
-----------------------------------------------------------
| end of epoch   1 | time: 49.98s | valid accuracy    0.892 
-----------------------------------------------------------


14it [00:00, 19.59it/s]

| epoch   2 |    10/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.978


23it [00:01, 19.54it/s]

| epoch   2 |    20/  891 batches | accuracy    0.906 | loss    0.312 | zero gradients percentage    0.978


32it [00:01, 19.33it/s]

| epoch   2 |    30/  891 batches | accuracy    0.899 | loss    0.318 | zero gradients percentage    0.978


43it [00:02, 19.96it/s]

| epoch   2 |    40/  891 batches | accuracy    0.875 | loss    0.385 | zero gradients percentage    0.978


54it [00:02, 20.39it/s]

| epoch   2 |    50/  891 batches | accuracy    0.889 | loss    0.348 | zero gradients percentage    0.978


63it [00:03, 20.06it/s]

| epoch   2 |    60/  891 batches | accuracy    0.888 | loss    0.344 | zero gradients percentage    0.978


72it [00:03, 20.24it/s]

| epoch   2 |    70/  891 batches | accuracy    0.895 | loss    0.352 | zero gradients percentage    0.978


85it [00:04, 20.04it/s]

| epoch   2 |    80/  891 batches | accuracy    0.881 | loss    0.374 | zero gradients percentage    0.978


94it [00:04, 20.68it/s]

| epoch   2 |    90/  891 batches | accuracy    0.887 | loss    0.329 | zero gradients percentage    0.978


103it [00:05, 19.89it/s]

| epoch   2 |   100/  891 batches | accuracy    0.877 | loss    0.390 | zero gradients percentage    0.978


112it [00:05, 18.63it/s]

| epoch   2 |   110/  891 batches | accuracy    0.885 | loss    0.361 | zero gradients percentage    0.978


124it [00:06, 15.18it/s]

| epoch   2 |   120/  891 batches | accuracy    0.890 | loss    0.372 | zero gradients percentage    0.978


134it [00:07, 15.23it/s]

| epoch   2 |   130/  891 batches | accuracy    0.886 | loss    0.342 | zero gradients percentage    0.978


142it [00:07, 14.86it/s]

| epoch   2 |   140/  891 batches | accuracy    0.891 | loss    0.354 | zero gradients percentage    0.978


152it [00:08, 12.98it/s]

| epoch   2 |   150/  891 batches | accuracy    0.896 | loss    0.350 | zero gradients percentage    0.978


162it [00:09, 13.27it/s]

| epoch   2 |   160/  891 batches | accuracy    0.878 | loss    0.391 | zero gradients percentage    0.978


172it [00:09, 13.55it/s]

| epoch   2 |   170/  891 batches | accuracy    0.883 | loss    0.357 | zero gradients percentage    0.978


185it [00:10, 16.59it/s]

| epoch   2 |   180/  891 batches | accuracy    0.892 | loss    0.375 | zero gradients percentage    0.978


195it [00:11, 19.20it/s]

| epoch   2 |   190/  891 batches | accuracy    0.902 | loss    0.313 | zero gradients percentage    0.978


203it [00:11, 19.38it/s]

| epoch   2 |   200/  891 batches | accuracy    0.892 | loss    0.351 | zero gradients percentage    0.978


213it [00:12, 18.83it/s]

| epoch   2 |   210/  891 batches | accuracy    0.897 | loss    0.316 | zero gradients percentage    0.978


225it [00:12, 20.64it/s]

| epoch   2 |   220/  891 batches | accuracy    0.891 | loss    0.352 | zero gradients percentage    0.978


234it [00:13, 20.52it/s]

| epoch   2 |   230/  891 batches | accuracy    0.875 | loss    0.410 | zero gradients percentage    0.978


243it [00:13, 20.02it/s]

| epoch   2 |   240/  891 batches | accuracy    0.900 | loss    0.314 | zero gradients percentage    0.978


255it [00:14, 20.43it/s]

| epoch   2 |   250/  891 batches | accuracy    0.908 | loss    0.303 | zero gradients percentage    0.978


264it [00:14, 20.29it/s]

| epoch   2 |   260/  891 batches | accuracy    0.896 | loss    0.341 | zero gradients percentage    0.978


274it [00:15, 19.00it/s]

| epoch   2 |   270/  891 batches | accuracy    0.891 | loss    0.353 | zero gradients percentage    0.978


283it [00:15, 19.85it/s]

| epoch   2 |   280/  891 batches | accuracy    0.878 | loss    0.351 | zero gradients percentage    0.978


295it [00:16, 20.09it/s]

| epoch   2 |   290/  891 batches | accuracy    0.884 | loss    0.373 | zero gradients percentage    0.978


304it [00:16, 19.83it/s]

| epoch   2 |   300/  891 batches | accuracy    0.888 | loss    0.374 | zero gradients percentage    0.978


314it [00:17, 19.79it/s]

| epoch   2 |   310/  891 batches | accuracy    0.892 | loss    0.392 | zero gradients percentage    0.978


323it [00:17, 20.02it/s]

| epoch   2 |   320/  891 batches | accuracy    0.897 | loss    0.305 | zero gradients percentage    0.978


334it [00:18, 19.87it/s]

| epoch   2 |   330/  891 batches | accuracy    0.903 | loss    0.332 | zero gradients percentage    0.978


343it [00:18, 19.81it/s]

| epoch   2 |   340/  891 batches | accuracy    0.883 | loss    0.348 | zero gradients percentage    0.978


354it [00:19, 19.18it/s]

| epoch   2 |   350/  891 batches | accuracy    0.868 | loss    0.388 | zero gradients percentage    0.978


364it [00:19, 20.19it/s]

| epoch   2 |   360/  891 batches | accuracy    0.885 | loss    0.393 | zero gradients percentage    0.978


373it [00:20, 19.96it/s]

| epoch   2 |   370/  891 batches | accuracy    0.896 | loss    0.321 | zero gradients percentage    0.978


382it [00:20, 17.28it/s]

| epoch   2 |   380/  891 batches | accuracy    0.884 | loss    0.390 | zero gradients percentage    0.978


394it [00:21, 15.03it/s]

| epoch   2 |   390/  891 batches | accuracy    0.901 | loss    0.315 | zero gradients percentage    0.978


402it [00:22, 14.06it/s]

| epoch   2 |   400/  891 batches | accuracy    0.884 | loss    0.357 | zero gradients percentage    0.978


414it [00:22, 15.27it/s]

| epoch   2 |   410/  891 batches | accuracy    0.902 | loss    0.322 | zero gradients percentage    0.978


424it [00:23, 14.60it/s]

| epoch   2 |   420/  891 batches | accuracy    0.884 | loss    0.379 | zero gradients percentage    0.978


434it [00:24, 14.20it/s]

| epoch   2 |   430/  891 batches | accuracy    0.866 | loss    0.385 | zero gradients percentage    0.978


444it [00:25, 14.40it/s]

| epoch   2 |   440/  891 batches | accuracy    0.911 | loss    0.293 | zero gradients percentage    0.978


454it [00:25, 14.63it/s]

| epoch   2 |   450/  891 batches | accuracy    0.910 | loss    0.315 | zero gradients percentage    0.978


463it [00:26, 17.49it/s]

| epoch   2 |   460/  891 batches | accuracy    0.891 | loss    0.346 | zero gradients percentage    0.978


473it [00:26, 17.90it/s]

| epoch   2 |   470/  891 batches | accuracy    0.901 | loss    0.316 | zero gradients percentage    0.978


484it [00:27, 20.00it/s]

| epoch   2 |   480/  891 batches | accuracy    0.887 | loss    0.377 | zero gradients percentage    0.978


493it [00:27, 20.39it/s]

| epoch   2 |   490/  891 batches | accuracy    0.899 | loss    0.320 | zero gradients percentage    0.978


503it [00:28, 17.55it/s]

| epoch   2 |   500/  891 batches | accuracy    0.889 | loss    0.340 | zero gradients percentage    0.978


513it [00:28, 18.37it/s]

| epoch   2 |   510/  891 batches | accuracy    0.883 | loss    0.361 | zero gradients percentage    0.978


524it [00:29, 19.94it/s]

| epoch   2 |   520/  891 batches | accuracy    0.891 | loss    0.345 | zero gradients percentage    0.978


533it [00:29, 18.92it/s]

| epoch   2 |   530/  891 batches | accuracy    0.893 | loss    0.316 | zero gradients percentage    0.978


543it [00:30, 18.55it/s]

| epoch   2 |   540/  891 batches | accuracy    0.893 | loss    0.326 | zero gradients percentage    0.978


555it [00:31, 18.83it/s]

| epoch   2 |   550/  891 batches | accuracy    0.899 | loss    0.334 | zero gradients percentage    0.978


563it [00:31, 18.99it/s]

| epoch   2 |   560/  891 batches | accuracy    0.888 | loss    0.351 | zero gradients percentage    0.978


574it [00:32, 19.07it/s]

| epoch   2 |   570/  891 batches | accuracy    0.887 | loss    0.325 | zero gradients percentage    0.978


583it [00:32, 19.42it/s]

| epoch   2 |   580/  891 batches | accuracy    0.898 | loss    0.345 | zero gradients percentage    0.978


593it [00:33, 19.91it/s]

| epoch   2 |   590/  891 batches | accuracy    0.887 | loss    0.385 | zero gradients percentage    0.978


604it [00:33, 20.96it/s]

| epoch   2 |   600/  891 batches | accuracy    0.887 | loss    0.367 | zero gradients percentage    0.978


613it [00:34, 18.83it/s]

| epoch   2 |   610/  891 batches | accuracy    0.881 | loss    0.353 | zero gradients percentage    0.978


624it [00:34, 19.54it/s]

| epoch   2 |   620/  891 batches | accuracy    0.911 | loss    0.332 | zero gradients percentage    0.978


633it [00:35, 19.63it/s]

| epoch   2 |   630/  891 batches | accuracy    0.893 | loss    0.303 | zero gradients percentage    0.978


643it [00:35, 18.69it/s]

| epoch   2 |   640/  891 batches | accuracy    0.885 | loss    0.384 | zero gradients percentage    0.978


653it [00:36, 13.40it/s]

| epoch   2 |   650/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


663it [00:37, 14.25it/s]

| epoch   2 |   660/  891 batches | accuracy    0.880 | loss    0.348 | zero gradients percentage    0.978


673it [00:37, 13.12it/s]

| epoch   2 |   670/  891 batches | accuracy    0.894 | loss    0.327 | zero gradients percentage    0.978


683it [00:38, 13.35it/s]

| epoch   2 |   680/  891 batches | accuracy    0.898 | loss    0.329 | zero gradients percentage    0.978


693it [00:39, 13.30it/s]

| epoch   2 |   690/  891 batches | accuracy    0.905 | loss    0.295 | zero gradients percentage    0.978


703it [00:40, 14.02it/s]

| epoch   2 |   700/  891 batches | accuracy    0.892 | loss    0.340 | zero gradients percentage    0.978


713it [00:40, 16.75it/s]

| epoch   2 |   710/  891 batches | accuracy    0.899 | loss    0.300 | zero gradients percentage    0.978


723it [00:41, 18.70it/s]

| epoch   2 |   720/  891 batches | accuracy    0.895 | loss    0.324 | zero gradients percentage    0.978


733it [00:41, 19.36it/s]

| epoch   2 |   730/  891 batches | accuracy    0.898 | loss    0.316 | zero gradients percentage    0.978


743it [00:42, 18.92it/s]

| epoch   2 |   740/  891 batches | accuracy    0.890 | loss    0.330 | zero gradients percentage    0.978


754it [00:42, 19.47it/s]

| epoch   2 |   750/  891 batches | accuracy    0.895 | loss    0.339 | zero gradients percentage    0.978


765it [00:43, 19.73it/s]

| epoch   2 |   760/  891 batches | accuracy    0.882 | loss    0.371 | zero gradients percentage    0.978


775it [00:43, 19.70it/s]

| epoch   2 |   770/  891 batches | accuracy    0.886 | loss    0.424 | zero gradients percentage    0.978


783it [00:44, 19.26it/s]

| epoch   2 |   780/  891 batches | accuracy    0.903 | loss    0.304 | zero gradients percentage    0.978


794it [00:44, 19.33it/s]

| epoch   2 |   790/  891 batches | accuracy    0.878 | loss    0.377 | zero gradients percentage    0.978


804it [00:45, 19.74it/s]

| epoch   2 |   800/  891 batches | accuracy    0.893 | loss    0.346 | zero gradients percentage    0.978


814it [00:45, 20.43it/s]

| epoch   2 |   810/  891 batches | accuracy    0.898 | loss    0.314 | zero gradients percentage    0.978


823it [00:46, 20.56it/s]

| epoch   2 |   820/  891 batches | accuracy    0.884 | loss    0.350 | zero gradients percentage    0.978


834it [00:46, 19.76it/s]

| epoch   2 |   830/  891 batches | accuracy    0.878 | loss    0.381 | zero gradients percentage    0.978


843it [00:47, 19.63it/s]

| epoch   2 |   840/  891 batches | accuracy    0.909 | loss    0.317 | zero gradients percentage    0.978


854it [00:47, 19.53it/s]

| epoch   2 |   850/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


864it [00:48, 20.21it/s]

| epoch   2 |   860/  891 batches | accuracy    0.905 | loss    0.317 | zero gradients percentage    0.978


875it [00:49, 19.59it/s]

| epoch   2 |   870/  891 batches | accuracy    0.890 | loss    0.379 | zero gradients percentage    0.978


883it [00:49, 18.81it/s]

| epoch   2 |   880/  891 batches | accuracy    0.887 | loss    0.332 | zero gradients percentage    0.978


891it [00:49, 17.86it/s]


| epoch   2 |   890/  891 batches | accuracy    0.885 | loss    0.381 | zero gradients percentage    0.978
-----------------------------------------------------------
| end of epoch   2 | time: 50.62s | valid accuracy    0.892 
-----------------------------------------------------------


12it [00:00, 14.09it/s]

| epoch   3 |    10/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.978


22it [00:01, 12.95it/s]

| epoch   3 |    20/  891 batches | accuracy    0.906 | loss    0.312 | zero gradients percentage    0.978


32it [00:02, 14.41it/s]

| epoch   3 |    30/  891 batches | accuracy    0.899 | loss    0.318 | zero gradients percentage    0.978


42it [00:03, 13.36it/s]

| epoch   3 |    40/  891 batches | accuracy    0.875 | loss    0.385 | zero gradients percentage    0.978


52it [00:03, 13.52it/s]

| epoch   3 |    50/  891 batches | accuracy    0.889 | loss    0.348 | zero gradients percentage    0.978


62it [00:04, 14.31it/s]

| epoch   3 |    60/  891 batches | accuracy    0.888 | loss    0.344 | zero gradients percentage    0.978


73it [00:05, 17.47it/s]

| epoch   3 |    70/  891 batches | accuracy    0.895 | loss    0.352 | zero gradients percentage    0.978


83it [00:05, 19.03it/s]

| epoch   3 |    80/  891 batches | accuracy    0.881 | loss    0.374 | zero gradients percentage    0.978


94it [00:06, 19.68it/s]

| epoch   3 |    90/  891 batches | accuracy    0.887 | loss    0.329 | zero gradients percentage    0.978


103it [00:06, 19.43it/s]

| epoch   3 |   100/  891 batches | accuracy    0.877 | loss    0.390 | zero gradients percentage    0.978


113it [00:07, 19.56it/s]

| epoch   3 |   110/  891 batches | accuracy    0.885 | loss    0.361 | zero gradients percentage    0.978


123it [00:07, 19.44it/s]

| epoch   3 |   120/  891 batches | accuracy    0.890 | loss    0.372 | zero gradients percentage    0.978


134it [00:08, 19.41it/s]

| epoch   3 |   130/  891 batches | accuracy    0.886 | loss    0.342 | zero gradients percentage    0.978


143it [00:08, 19.12it/s]

| epoch   3 |   140/  891 batches | accuracy    0.891 | loss    0.354 | zero gradients percentage    0.978


153it [00:09, 18.43it/s]

| epoch   3 |   150/  891 batches | accuracy    0.896 | loss    0.350 | zero gradients percentage    0.978


163it [00:09, 18.81it/s]

| epoch   3 |   160/  891 batches | accuracy    0.878 | loss    0.391 | zero gradients percentage    0.978


174it [00:10, 21.17it/s]

| epoch   3 |   170/  891 batches | accuracy    0.883 | loss    0.357 | zero gradients percentage    0.978


183it [00:10, 20.63it/s]

| epoch   3 |   180/  891 batches | accuracy    0.892 | loss    0.375 | zero gradients percentage    0.978


194it [00:11, 19.63it/s]

| epoch   3 |   190/  891 batches | accuracy    0.902 | loss    0.313 | zero gradients percentage    0.978


204it [00:11, 20.16it/s]

| epoch   3 |   200/  891 batches | accuracy    0.892 | loss    0.351 | zero gradients percentage    0.978


214it [00:12, 19.32it/s]

| epoch   3 |   210/  891 batches | accuracy    0.897 | loss    0.316 | zero gradients percentage    0.978


224it [00:12, 18.69it/s]

| epoch   3 |   220/  891 batches | accuracy    0.891 | loss    0.352 | zero gradients percentage    0.978


234it [00:13, 18.53it/s]

| epoch   3 |   230/  891 batches | accuracy    0.875 | loss    0.410 | zero gradients percentage    0.978


243it [00:13, 18.98it/s]

| epoch   3 |   240/  891 batches | accuracy    0.900 | loss    0.314 | zero gradients percentage    0.978


253it [00:14, 19.07it/s]

| epoch   3 |   250/  891 batches | accuracy    0.908 | loss    0.303 | zero gradients percentage    0.978


262it [00:14, 16.74it/s]

| epoch   3 |   260/  891 batches | accuracy    0.896 | loss    0.341 | zero gradients percentage    0.978


274it [00:15, 14.76it/s]

| epoch   3 |   270/  891 batches | accuracy    0.891 | loss    0.353 | zero gradients percentage    0.978


282it [00:16, 13.45it/s]

| epoch   3 |   280/  891 batches | accuracy    0.878 | loss    0.351 | zero gradients percentage    0.978


292it [00:17, 13.48it/s]

| epoch   3 |   290/  891 batches | accuracy    0.884 | loss    0.373 | zero gradients percentage    0.978


302it [00:17, 13.51it/s]

| epoch   3 |   300/  891 batches | accuracy    0.888 | loss    0.374 | zero gradients percentage    0.978


312it [00:18, 14.08it/s]

| epoch   3 |   310/  891 batches | accuracy    0.892 | loss    0.392 | zero gradients percentage    0.978


324it [00:19, 14.58it/s]

| epoch   3 |   320/  891 batches | accuracy    0.897 | loss    0.305 | zero gradients percentage    0.978


335it [00:20, 18.16it/s]

| epoch   3 |   330/  891 batches | accuracy    0.903 | loss    0.332 | zero gradients percentage    0.978


343it [00:20, 18.21it/s]

| epoch   3 |   340/  891 batches | accuracy    0.883 | loss    0.348 | zero gradients percentage    0.978


354it [00:21, 18.89it/s]

| epoch   3 |   350/  891 batches | accuracy    0.868 | loss    0.388 | zero gradients percentage    0.978


364it [00:21, 18.11it/s]

| epoch   3 |   360/  891 batches | accuracy    0.885 | loss    0.393 | zero gradients percentage    0.978


374it [00:22, 19.37it/s]

| epoch   3 |   370/  891 batches | accuracy    0.896 | loss    0.321 | zero gradients percentage    0.978


384it [00:22, 18.52it/s]

| epoch   3 |   380/  891 batches | accuracy    0.884 | loss    0.390 | zero gradients percentage    0.978


393it [00:23, 19.12it/s]

| epoch   3 |   390/  891 batches | accuracy    0.901 | loss    0.315 | zero gradients percentage    0.978


403it [00:23, 18.42it/s]

| epoch   3 |   400/  891 batches | accuracy    0.884 | loss    0.357 | zero gradients percentage    0.978


415it [00:24, 19.48it/s]

| epoch   3 |   410/  891 batches | accuracy    0.902 | loss    0.322 | zero gradients percentage    0.978


424it [00:24, 19.28it/s]

| epoch   3 |   420/  891 batches | accuracy    0.884 | loss    0.379 | zero gradients percentage    0.978


435it [00:25, 19.51it/s]

| epoch   3 |   430/  891 batches | accuracy    0.866 | loss    0.385 | zero gradients percentage    0.978


444it [00:25, 19.19it/s]

| epoch   3 |   440/  891 batches | accuracy    0.911 | loss    0.293 | zero gradients percentage    0.978


453it [00:26, 19.26it/s]

| epoch   3 |   450/  891 batches | accuracy    0.910 | loss    0.315 | zero gradients percentage    0.978


464it [00:26, 18.67it/s]

| epoch   3 |   460/  891 batches | accuracy    0.891 | loss    0.346 | zero gradients percentage    0.978


473it [00:27, 18.72it/s]

| epoch   3 |   470/  891 batches | accuracy    0.901 | loss    0.316 | zero gradients percentage    0.978


484it [00:27, 19.81it/s]

| epoch   3 |   480/  891 batches | accuracy    0.887 | loss    0.377 | zero gradients percentage    0.978


494it [00:28, 19.98it/s]

| epoch   3 |   490/  891 batches | accuracy    0.899 | loss    0.320 | zero gradients percentage    0.978


504it [00:28, 20.27it/s]

| epoch   3 |   500/  891 batches | accuracy    0.889 | loss    0.340 | zero gradients percentage    0.978


513it [00:29, 20.30it/s]

| epoch   3 |   510/  891 batches | accuracy    0.883 | loss    0.361 | zero gradients percentage    0.978


524it [00:30, 16.38it/s]

| epoch   3 |   520/  891 batches | accuracy    0.891 | loss    0.345 | zero gradients percentage    0.978


534it [00:30, 14.13it/s]

| epoch   3 |   530/  891 batches | accuracy    0.893 | loss    0.316 | zero gradients percentage    0.978


542it [00:31, 13.45it/s]

| epoch   3 |   540/  891 batches | accuracy    0.893 | loss    0.326 | zero gradients percentage    0.978


552it [00:32, 13.84it/s]

| epoch   3 |   550/  891 batches | accuracy    0.899 | loss    0.334 | zero gradients percentage    0.978


562it [00:32, 13.22it/s]

| epoch   3 |   560/  891 batches | accuracy    0.888 | loss    0.351 | zero gradients percentage    0.978


572it [00:33, 13.60it/s]

| epoch   3 |   570/  891 batches | accuracy    0.887 | loss    0.325 | zero gradients percentage    0.978


582it [00:34, 13.44it/s]

| epoch   3 |   580/  891 batches | accuracy    0.898 | loss    0.345 | zero gradients percentage    0.978


593it [00:35, 16.60it/s]

| epoch   3 |   590/  891 batches | accuracy    0.887 | loss    0.385 | zero gradients percentage    0.978


603it [00:35, 18.74it/s]

| epoch   3 |   600/  891 batches | accuracy    0.887 | loss    0.367 | zero gradients percentage    0.978


614it [00:36, 18.25it/s]

| epoch   3 |   610/  891 batches | accuracy    0.881 | loss    0.353 | zero gradients percentage    0.978


623it [00:36, 19.02it/s]

| epoch   3 |   620/  891 batches | accuracy    0.911 | loss    0.332 | zero gradients percentage    0.978


634it [00:37, 19.02it/s]

| epoch   3 |   630/  891 batches | accuracy    0.893 | loss    0.303 | zero gradients percentage    0.978


644it [00:37, 19.08it/s]

| epoch   3 |   640/  891 batches | accuracy    0.885 | loss    0.384 | zero gradients percentage    0.978


653it [00:38, 18.49it/s]

| epoch   3 |   650/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


665it [00:38, 19.79it/s]

| epoch   3 |   660/  891 batches | accuracy    0.880 | loss    0.348 | zero gradients percentage    0.978


674it [00:39, 18.74it/s]

| epoch   3 |   670/  891 batches | accuracy    0.894 | loss    0.327 | zero gradients percentage    0.978


684it [00:39, 19.94it/s]

| epoch   3 |   680/  891 batches | accuracy    0.898 | loss    0.329 | zero gradients percentage    0.978


693it [00:40, 19.29it/s]

| epoch   3 |   690/  891 batches | accuracy    0.905 | loss    0.295 | zero gradients percentage    0.978


704it [00:40, 19.74it/s]

| epoch   3 |   700/  891 batches | accuracy    0.892 | loss    0.340 | zero gradients percentage    0.978


714it [00:41, 19.76it/s]

| epoch   3 |   710/  891 batches | accuracy    0.899 | loss    0.300 | zero gradients percentage    0.978


724it [00:41, 18.47it/s]

| epoch   3 |   720/  891 batches | accuracy    0.895 | loss    0.324 | zero gradients percentage    0.978


734it [00:42, 17.95it/s]

| epoch   3 |   730/  891 batches | accuracy    0.898 | loss    0.316 | zero gradients percentage    0.978


744it [00:42, 20.01it/s]

| epoch   3 |   740/  891 batches | accuracy    0.890 | loss    0.330 | zero gradients percentage    0.978


753it [00:43, 19.04it/s]

| epoch   3 |   750/  891 batches | accuracy    0.895 | loss    0.339 | zero gradients percentage    0.978


765it [00:44, 19.30it/s]

| epoch   3 |   760/  891 batches | accuracy    0.882 | loss    0.371 | zero gradients percentage    0.978


774it [00:44, 19.59it/s]

| epoch   3 |   770/  891 batches | accuracy    0.886 | loss    0.424 | zero gradients percentage    0.978


784it [00:45, 15.97it/s]

| epoch   3 |   780/  891 batches | accuracy    0.903 | loss    0.304 | zero gradients percentage    0.978


792it [00:45, 14.02it/s]

| epoch   3 |   790/  891 batches | accuracy    0.878 | loss    0.377 | zero gradients percentage    0.978


802it [00:46, 13.45it/s]

| epoch   3 |   800/  891 batches | accuracy    0.893 | loss    0.346 | zero gradients percentage    0.978


814it [00:47, 14.89it/s]

| epoch   3 |   810/  891 batches | accuracy    0.898 | loss    0.314 | zero gradients percentage    0.978


822it [00:47, 13.36it/s]

| epoch   3 |   820/  891 batches | accuracy    0.884 | loss    0.350 | zero gradients percentage    0.978


832it [00:48, 13.95it/s]

| epoch   3 |   830/  891 batches | accuracy    0.878 | loss    0.381 | zero gradients percentage    0.978


842it [00:49, 13.52it/s]

| epoch   3 |   840/  891 batches | accuracy    0.909 | loss    0.317 | zero gradients percentage    0.978


854it [00:50, 14.78it/s]

| epoch   3 |   850/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


863it [00:50, 16.92it/s]

| epoch   3 |   860/  891 batches | accuracy    0.905 | loss    0.317 | zero gradients percentage    0.978


874it [00:51, 18.72it/s]

| epoch   3 |   870/  891 batches | accuracy    0.890 | loss    0.379 | zero gradients percentage    0.978


883it [00:51, 18.86it/s]

| epoch   3 |   880/  891 batches | accuracy    0.887 | loss    0.332 | zero gradients percentage    0.978


891it [00:52, 17.09it/s]


| epoch   3 |   890/  891 batches | accuracy    0.885 | loss    0.381 | zero gradients percentage    0.978
-----------------------------------------------------------
| end of epoch   3 | time: 52.83s | valid accuracy    0.892 
-----------------------------------------------------------


13it [00:00, 19.81it/s]

| epoch   4 |    10/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.978


24it [00:01, 19.20it/s]

| epoch   4 |    20/  891 batches | accuracy    0.906 | loss    0.312 | zero gradients percentage    0.978


35it [00:01, 19.59it/s]

| epoch   4 |    30/  891 batches | accuracy    0.899 | loss    0.318 | zero gradients percentage    0.978


44it [00:02, 19.93it/s]

| epoch   4 |    40/  891 batches | accuracy    0.875 | loss    0.385 | zero gradients percentage    0.978


54it [00:02, 18.51it/s]

| epoch   4 |    50/  891 batches | accuracy    0.889 | loss    0.348 | zero gradients percentage    0.978


64it [00:03, 18.12it/s]

| epoch   4 |    60/  891 batches | accuracy    0.888 | loss    0.344 | zero gradients percentage    0.978


73it [00:03, 18.46it/s]

| epoch   4 |    70/  891 batches | accuracy    0.895 | loss    0.352 | zero gradients percentage    0.978


84it [00:04, 19.01it/s]

| epoch   4 |    80/  891 batches | accuracy    0.881 | loss    0.374 | zero gradients percentage    0.978


94it [00:04, 18.88it/s]

| epoch   4 |    90/  891 batches | accuracy    0.887 | loss    0.329 | zero gradients percentage    0.978


103it [00:05, 19.44it/s]

| epoch   4 |   100/  891 batches | accuracy    0.877 | loss    0.390 | zero gradients percentage    0.978


113it [00:05, 18.36it/s]

| epoch   4 |   110/  891 batches | accuracy    0.885 | loss    0.361 | zero gradients percentage    0.978


123it [00:06, 18.91it/s]

| epoch   4 |   120/  891 batches | accuracy    0.890 | loss    0.372 | zero gradients percentage    0.978


135it [00:07, 19.87it/s]

| epoch   4 |   130/  891 batches | accuracy    0.886 | loss    0.342 | zero gradients percentage    0.978


143it [00:07, 16.83it/s]

| epoch   4 |   140/  891 batches | accuracy    0.891 | loss    0.354 | zero gradients percentage    0.978


153it [00:08, 15.57it/s]

| epoch   4 |   150/  891 batches | accuracy    0.896 | loss    0.350 | zero gradients percentage    0.978


163it [00:08, 15.07it/s]

| epoch   4 |   160/  891 batches | accuracy    0.878 | loss    0.391 | zero gradients percentage    0.978


173it [00:09, 15.20it/s]

| epoch   4 |   170/  891 batches | accuracy    0.883 | loss    0.357 | zero gradients percentage    0.978


183it [00:10, 14.19it/s]

| epoch   4 |   180/  891 batches | accuracy    0.892 | loss    0.375 | zero gradients percentage    0.978


193it [00:11, 14.02it/s]

| epoch   4 |   190/  891 batches | accuracy    0.902 | loss    0.313 | zero gradients percentage    0.978


203it [00:11, 14.45it/s]

| epoch   4 |   200/  891 batches | accuracy    0.892 | loss    0.351 | zero gradients percentage    0.978


213it [00:12, 15.45it/s]

| epoch   4 |   210/  891 batches | accuracy    0.897 | loss    0.316 | zero gradients percentage    0.978


225it [00:13, 19.25it/s]

| epoch   4 |   220/  891 batches | accuracy    0.891 | loss    0.352 | zero gradients percentage    0.978


233it [00:13, 20.81it/s]

| epoch   4 |   230/  891 batches | accuracy    0.875 | loss    0.410 | zero gradients percentage    0.978


245it [00:13, 20.78it/s]

| epoch   4 |   240/  891 batches | accuracy    0.900 | loss    0.314 | zero gradients percentage    0.978


254it [00:14, 20.57it/s]

| epoch   4 |   250/  891 batches | accuracy    0.908 | loss    0.303 | zero gradients percentage    0.978


263it [00:14, 20.42it/s]

| epoch   4 |   260/  891 batches | accuracy    0.896 | loss    0.341 | zero gradients percentage    0.978


274it [00:15, 19.93it/s]

| epoch   4 |   270/  891 batches | accuracy    0.891 | loss    0.353 | zero gradients percentage    0.978


283it [00:15, 19.31it/s]

| epoch   4 |   280/  891 batches | accuracy    0.878 | loss    0.351 | zero gradients percentage    0.978


293it [00:16, 19.12it/s]

| epoch   4 |   290/  891 batches | accuracy    0.884 | loss    0.373 | zero gradients percentage    0.978


303it [00:16, 19.67it/s]

| epoch   4 |   300/  891 batches | accuracy    0.888 | loss    0.374 | zero gradients percentage    0.978


315it [00:17, 19.83it/s]

| epoch   4 |   310/  891 batches | accuracy    0.892 | loss    0.392 | zero gradients percentage    0.978


323it [00:17, 19.42it/s]

| epoch   4 |   320/  891 batches | accuracy    0.897 | loss    0.305 | zero gradients percentage    0.978


334it [00:18, 19.18it/s]

| epoch   4 |   330/  891 batches | accuracy    0.903 | loss    0.332 | zero gradients percentage    0.978


344it [00:19, 18.36it/s]

| epoch   4 |   340/  891 batches | accuracy    0.883 | loss    0.348 | zero gradients percentage    0.978


353it [00:19, 18.28it/s]

| epoch   4 |   350/  891 batches | accuracy    0.868 | loss    0.388 | zero gradients percentage    0.978


364it [00:20, 19.08it/s]

| epoch   4 |   360/  891 batches | accuracy    0.885 | loss    0.393 | zero gradients percentage    0.978


373it [00:20, 19.12it/s]

| epoch   4 |   370/  891 batches | accuracy    0.896 | loss    0.321 | zero gradients percentage    0.978


383it [00:21, 18.63it/s]

| epoch   4 |   380/  891 batches | accuracy    0.884 | loss    0.390 | zero gradients percentage    0.978


394it [00:21, 18.79it/s]

| epoch   4 |   390/  891 batches | accuracy    0.901 | loss    0.315 | zero gradients percentage    0.978


404it [00:22, 18.13it/s]

| epoch   4 |   400/  891 batches | accuracy    0.884 | loss    0.357 | zero gradients percentage    0.978


412it [00:22, 14.84it/s]

| epoch   4 |   410/  891 batches | accuracy    0.902 | loss    0.322 | zero gradients percentage    0.978


424it [00:23, 15.24it/s]

| epoch   4 |   420/  891 batches | accuracy    0.884 | loss    0.379 | zero gradients percentage    0.978


432it [00:24, 14.50it/s]

| epoch   4 |   430/  891 batches | accuracy    0.866 | loss    0.385 | zero gradients percentage    0.978


442it [00:24, 14.23it/s]

| epoch   4 |   440/  891 batches | accuracy    0.911 | loss    0.293 | zero gradients percentage    0.978


452it [00:25, 12.06it/s]

| epoch   4 |   450/  891 batches | accuracy    0.910 | loss    0.315 | zero gradients percentage    0.978


462it [00:26, 13.14it/s]

| epoch   4 |   460/  891 batches | accuracy    0.891 | loss    0.346 | zero gradients percentage    0.978


472it [00:27, 13.39it/s]

| epoch   4 |   470/  891 batches | accuracy    0.901 | loss    0.316 | zero gradients percentage    0.978


483it [00:27, 17.22it/s]

| epoch   4 |   480/  891 batches | accuracy    0.887 | loss    0.377 | zero gradients percentage    0.978


494it [00:28, 18.81it/s]

| epoch   4 |   490/  891 batches | accuracy    0.899 | loss    0.320 | zero gradients percentage    0.978


503it [00:28, 18.58it/s]

| epoch   4 |   500/  891 batches | accuracy    0.889 | loss    0.340 | zero gradients percentage    0.978


514it [00:29, 18.02it/s]

| epoch   4 |   510/  891 batches | accuracy    0.883 | loss    0.361 | zero gradients percentage    0.978


524it [00:30, 18.45it/s]

| epoch   4 |   520/  891 batches | accuracy    0.891 | loss    0.345 | zero gradients percentage    0.978


534it [00:30, 19.29it/s]

| epoch   4 |   530/  891 batches | accuracy    0.893 | loss    0.316 | zero gradients percentage    0.978


544it [00:31, 18.18it/s]

| epoch   4 |   540/  891 batches | accuracy    0.893 | loss    0.326 | zero gradients percentage    0.978


553it [00:31, 18.59it/s]

| epoch   4 |   550/  891 batches | accuracy    0.899 | loss    0.334 | zero gradients percentage    0.978


564it [00:32, 19.93it/s]

| epoch   4 |   560/  891 batches | accuracy    0.888 | loss    0.351 | zero gradients percentage    0.978


574it [00:32, 18.66it/s]

| epoch   4 |   570/  891 batches | accuracy    0.887 | loss    0.325 | zero gradients percentage    0.978


584it [00:33, 18.76it/s]

| epoch   4 |   580/  891 batches | accuracy    0.898 | loss    0.345 | zero gradients percentage    0.978


593it [00:33, 18.75it/s]

| epoch   4 |   590/  891 batches | accuracy    0.887 | loss    0.385 | zero gradients percentage    0.978


604it [00:34, 19.03it/s]

| epoch   4 |   600/  891 batches | accuracy    0.887 | loss    0.367 | zero gradients percentage    0.978


614it [00:34, 17.97it/s]

| epoch   4 |   610/  891 batches | accuracy    0.881 | loss    0.353 | zero gradients percentage    0.978


624it [00:35, 18.19it/s]

| epoch   4 |   620/  891 batches | accuracy    0.911 | loss    0.332 | zero gradients percentage    0.978


634it [00:35, 18.20it/s]

| epoch   4 |   630/  891 batches | accuracy    0.893 | loss    0.303 | zero gradients percentage    0.978


643it [00:36, 18.52it/s]

| epoch   4 |   640/  891 batches | accuracy    0.885 | loss    0.384 | zero gradients percentage    0.978


653it [00:36, 17.80it/s]

| epoch   4 |   650/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


664it [00:37, 16.11it/s]

| epoch   4 |   660/  891 batches | accuracy    0.880 | loss    0.348 | zero gradients percentage    0.978


674it [00:38, 15.35it/s]

| epoch   4 |   670/  891 batches | accuracy    0.894 | loss    0.327 | zero gradients percentage    0.978


682it [00:38, 13.35it/s]

| epoch   4 |   680/  891 batches | accuracy    0.898 | loss    0.329 | zero gradients percentage    0.978


692it [00:39, 13.53it/s]

| epoch   4 |   690/  891 batches | accuracy    0.905 | loss    0.295 | zero gradients percentage    0.978


702it [00:40, 12.46it/s]

| epoch   4 |   700/  891 batches | accuracy    0.892 | loss    0.340 | zero gradients percentage    0.978


714it [00:41, 13.68it/s]

| epoch   4 |   710/  891 batches | accuracy    0.899 | loss    0.300 | zero gradients percentage    0.978


722it [00:41, 14.30it/s]

| epoch   4 |   720/  891 batches | accuracy    0.895 | loss    0.324 | zero gradients percentage    0.978


735it [00:42, 17.93it/s]

| epoch   4 |   730/  891 batches | accuracy    0.898 | loss    0.316 | zero gradients percentage    0.978


743it [00:43, 18.37it/s]

| epoch   4 |   740/  891 batches | accuracy    0.890 | loss    0.330 | zero gradients percentage    0.978


753it [00:43, 18.59it/s]

| epoch   4 |   750/  891 batches | accuracy    0.895 | loss    0.339 | zero gradients percentage    0.978


763it [00:44, 18.41it/s]

| epoch   4 |   760/  891 batches | accuracy    0.882 | loss    0.371 | zero gradients percentage    0.978


774it [00:44, 18.92it/s]

| epoch   4 |   770/  891 batches | accuracy    0.886 | loss    0.424 | zero gradients percentage    0.978


783it [00:45, 19.02it/s]

| epoch   4 |   780/  891 batches | accuracy    0.903 | loss    0.304 | zero gradients percentage    0.978


794it [00:45, 18.73it/s]

| epoch   4 |   790/  891 batches | accuracy    0.878 | loss    0.377 | zero gradients percentage    0.978


804it [00:46, 18.34it/s]

| epoch   4 |   800/  891 batches | accuracy    0.893 | loss    0.346 | zero gradients percentage    0.978


813it [00:46, 19.11it/s]

| epoch   4 |   810/  891 batches | accuracy    0.898 | loss    0.314 | zero gradients percentage    0.978


823it [00:47, 19.12it/s]

| epoch   4 |   820/  891 batches | accuracy    0.884 | loss    0.350 | zero gradients percentage    0.978


832it [00:47, 18.25it/s]

| epoch   4 |   830/  891 batches | accuracy    0.878 | loss    0.381 | zero gradients percentage    0.978


844it [00:48, 18.92it/s]

| epoch   4 |   840/  891 batches | accuracy    0.909 | loss    0.317 | zero gradients percentage    0.978


854it [00:48, 18.45it/s]

| epoch   4 |   850/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


863it [00:49, 18.52it/s]

| epoch   4 |   860/  891 batches | accuracy    0.905 | loss    0.317 | zero gradients percentage    0.978


873it [00:49, 18.16it/s]

| epoch   4 |   870/  891 batches | accuracy    0.890 | loss    0.379 | zero gradients percentage    0.978


883it [00:50, 18.42it/s]

| epoch   4 |   880/  891 batches | accuracy    0.887 | loss    0.332 | zero gradients percentage    0.978


891it [00:50, 17.49it/s]


| epoch   4 |   890/  891 batches | accuracy    0.885 | loss    0.381 | zero gradients percentage    0.978
-----------------------------------------------------------
| end of epoch   4 | time: 51.62s | valid accuracy    0.892 
-----------------------------------------------------------


14it [00:00, 17.44it/s]

| epoch   5 |    10/  891 batches | accuracy    0.884 | loss    0.355 | zero gradients percentage    0.978


22it [00:01, 14.52it/s]

| epoch   5 |    20/  891 batches | accuracy    0.906 | loss    0.312 | zero gradients percentage    0.978


34it [00:02, 14.30it/s]

| epoch   5 |    30/  891 batches | accuracy    0.899 | loss    0.318 | zero gradients percentage    0.978


42it [00:02, 12.55it/s]

| epoch   5 |    40/  891 batches | accuracy    0.875 | loss    0.385 | zero gradients percentage    0.978


52it [00:03, 12.70it/s]

| epoch   5 |    50/  891 batches | accuracy    0.889 | loss    0.348 | zero gradients percentage    0.978


62it [00:04, 14.10it/s]

| epoch   5 |    60/  891 batches | accuracy    0.888 | loss    0.344 | zero gradients percentage    0.978


72it [00:05, 13.21it/s]

| epoch   5 |    70/  891 batches | accuracy    0.895 | loss    0.352 | zero gradients percentage    0.978


84it [00:05, 17.38it/s]

| epoch   5 |    80/  891 batches | accuracy    0.881 | loss    0.374 | zero gradients percentage    0.978


93it [00:06, 18.89it/s]

| epoch   5 |    90/  891 batches | accuracy    0.887 | loss    0.329 | zero gradients percentage    0.978


103it [00:06, 18.81it/s]

| epoch   5 |   100/  891 batches | accuracy    0.877 | loss    0.390 | zero gradients percentage    0.978


113it [00:07, 18.53it/s]

| epoch   5 |   110/  891 batches | accuracy    0.885 | loss    0.361 | zero gradients percentage    0.978


124it [00:07, 19.32it/s]

| epoch   5 |   120/  891 batches | accuracy    0.890 | loss    0.372 | zero gradients percentage    0.978


133it [00:08, 19.01it/s]

| epoch   5 |   130/  891 batches | accuracy    0.886 | loss    0.342 | zero gradients percentage    0.978


144it [00:08, 19.58it/s]

| epoch   5 |   140/  891 batches | accuracy    0.891 | loss    0.354 | zero gradients percentage    0.978


154it [00:09, 18.29it/s]

| epoch   5 |   150/  891 batches | accuracy    0.896 | loss    0.350 | zero gradients percentage    0.978


164it [00:10, 18.21it/s]

| epoch   5 |   160/  891 batches | accuracy    0.878 | loss    0.391 | zero gradients percentage    0.978


174it [00:10, 18.87it/s]

| epoch   5 |   170/  891 batches | accuracy    0.883 | loss    0.357 | zero gradients percentage    0.978


184it [00:11, 17.58it/s]

| epoch   5 |   180/  891 batches | accuracy    0.892 | loss    0.375 | zero gradients percentage    0.978


195it [00:11, 19.29it/s]

| epoch   5 |   190/  891 batches | accuracy    0.902 | loss    0.313 | zero gradients percentage    0.978


203it [00:12, 18.94it/s]

| epoch   5 |   200/  891 batches | accuracy    0.892 | loss    0.351 | zero gradients percentage    0.978


215it [00:12, 20.05it/s]

| epoch   5 |   210/  891 batches | accuracy    0.897 | loss    0.316 | zero gradients percentage    0.978


223it [00:13, 19.13it/s]

| epoch   5 |   220/  891 batches | accuracy    0.891 | loss    0.352 | zero gradients percentage    0.978


233it [00:13, 19.35it/s]

| epoch   5 |   230/  891 batches | accuracy    0.875 | loss    0.410 | zero gradients percentage    0.978


243it [00:14, 18.63it/s]

| epoch   5 |   240/  891 batches | accuracy    0.900 | loss    0.314 | zero gradients percentage    0.978


254it [00:14, 18.48it/s]

| epoch   5 |   250/  891 batches | accuracy    0.908 | loss    0.303 | zero gradients percentage    0.978


264it [00:15, 18.37it/s]

| epoch   5 |   260/  891 batches | accuracy    0.896 | loss    0.341 | zero gradients percentage    0.978


274it [00:16, 14.86it/s]

| epoch   5 |   270/  891 batches | accuracy    0.891 | loss    0.353 | zero gradients percentage    0.978


282it [00:16, 13.57it/s]

| epoch   5 |   280/  891 batches | accuracy    0.878 | loss    0.351 | zero gradients percentage    0.978


292it [00:17, 13.09it/s]

| epoch   5 |   290/  891 batches | accuracy    0.884 | loss    0.373 | zero gradients percentage    0.978


302it [00:18, 12.06it/s]

| epoch   5 |   300/  891 batches | accuracy    0.888 | loss    0.374 | zero gradients percentage    0.978


312it [00:19, 13.55it/s]

| epoch   5 |   310/  891 batches | accuracy    0.892 | loss    0.392 | zero gradients percentage    0.978


322it [00:19, 13.85it/s]

| epoch   5 |   320/  891 batches | accuracy    0.897 | loss    0.305 | zero gradients percentage    0.978


334it [00:20, 15.50it/s]

| epoch   5 |   330/  891 batches | accuracy    0.903 | loss    0.332 | zero gradients percentage    0.978


344it [00:21, 17.51it/s]

| epoch   5 |   340/  891 batches | accuracy    0.883 | loss    0.348 | zero gradients percentage    0.978


354it [00:21, 17.45it/s]

| epoch   5 |   350/  891 batches | accuracy    0.868 | loss    0.388 | zero gradients percentage    0.978


364it [00:22, 17.78it/s]

| epoch   5 |   360/  891 batches | accuracy    0.885 | loss    0.393 | zero gradients percentage    0.978


373it [00:22, 18.73it/s]

| epoch   5 |   370/  891 batches | accuracy    0.896 | loss    0.321 | zero gradients percentage    0.978


384it [00:23, 18.82it/s]

| epoch   5 |   380/  891 batches | accuracy    0.884 | loss    0.390 | zero gradients percentage    0.978


395it [00:23, 19.33it/s]

| epoch   5 |   390/  891 batches | accuracy    0.901 | loss    0.315 | zero gradients percentage    0.978


403it [00:24, 19.19it/s]

| epoch   5 |   400/  891 batches | accuracy    0.884 | loss    0.357 | zero gradients percentage    0.978


414it [00:24, 18.83it/s]

| epoch   5 |   410/  891 batches | accuracy    0.902 | loss    0.322 | zero gradients percentage    0.978


424it [00:25, 18.48it/s]

| epoch   5 |   420/  891 batches | accuracy    0.884 | loss    0.379 | zero gradients percentage    0.978


435it [00:26, 19.20it/s]

| epoch   5 |   430/  891 batches | accuracy    0.866 | loss    0.385 | zero gradients percentage    0.978


443it [00:26, 18.94it/s]

| epoch   5 |   440/  891 batches | accuracy    0.911 | loss    0.293 | zero gradients percentage    0.978


453it [00:27, 18.03it/s]

| epoch   5 |   450/  891 batches | accuracy    0.910 | loss    0.315 | zero gradients percentage    0.978


463it [00:27, 18.59it/s]

| epoch   5 |   460/  891 batches | accuracy    0.891 | loss    0.346 | zero gradients percentage    0.978


474it [00:28, 18.98it/s]

| epoch   5 |   470/  891 batches | accuracy    0.901 | loss    0.316 | zero gradients percentage    0.978


484it [00:28, 18.63it/s]

| epoch   5 |   480/  891 batches | accuracy    0.887 | loss    0.377 | zero gradients percentage    0.978


495it [00:29, 19.31it/s]

| epoch   5 |   490/  891 batches | accuracy    0.899 | loss    0.320 | zero gradients percentage    0.978


503it [00:29, 18.15it/s]

| epoch   5 |   500/  891 batches | accuracy    0.889 | loss    0.340 | zero gradients percentage    0.978


514it [00:30, 18.60it/s]

| epoch   5 |   510/  891 batches | accuracy    0.883 | loss    0.361 | zero gradients percentage    0.978


522it [00:30, 14.79it/s]

| epoch   5 |   520/  891 batches | accuracy    0.891 | loss    0.345 | zero gradients percentage    0.978


532it [00:31, 13.01it/s]

| epoch   5 |   530/  891 batches | accuracy    0.893 | loss    0.316 | zero gradients percentage    0.978


542it [00:32, 13.26it/s]

| epoch   5 |   540/  891 batches | accuracy    0.893 | loss    0.326 | zero gradients percentage    0.978


552it [00:33, 12.66it/s]

| epoch   5 |   550/  891 batches | accuracy    0.899 | loss    0.334 | zero gradients percentage    0.978


562it [00:33, 14.32it/s]

| epoch   5 |   560/  891 batches | accuracy    0.888 | loss    0.351 | zero gradients percentage    0.978


572it [00:34, 12.96it/s]

| epoch   5 |   570/  891 batches | accuracy    0.887 | loss    0.325 | zero gradients percentage    0.978


584it [00:35, 14.01it/s]

| epoch   5 |   580/  891 batches | accuracy    0.898 | loss    0.345 | zero gradients percentage    0.978


594it [00:36, 15.84it/s]

| epoch   5 |   590/  891 batches | accuracy    0.887 | loss    0.385 | zero gradients percentage    0.978


603it [00:36, 18.58it/s]

| epoch   5 |   600/  891 batches | accuracy    0.887 | loss    0.367 | zero gradients percentage    0.978


613it [00:37, 18.02it/s]

| epoch   5 |   610/  891 batches | accuracy    0.881 | loss    0.353 | zero gradients percentage    0.978


624it [00:37, 20.53it/s]

| epoch   5 |   620/  891 batches | accuracy    0.911 | loss    0.332 | zero gradients percentage    0.978


633it [00:38, 19.05it/s]

| epoch   5 |   630/  891 batches | accuracy    0.893 | loss    0.303 | zero gradients percentage    0.978


644it [00:38, 19.09it/s]

| epoch   5 |   640/  891 batches | accuracy    0.885 | loss    0.384 | zero gradients percentage    0.978


652it [00:39, 17.60it/s]

| epoch   5 |   650/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


663it [00:39, 19.02it/s]

| epoch   5 |   660/  891 batches | accuracy    0.880 | loss    0.348 | zero gradients percentage    0.978


674it [00:40, 18.18it/s]

| epoch   5 |   670/  891 batches | accuracy    0.894 | loss    0.327 | zero gradients percentage    0.978


684it [00:40, 17.77it/s]

| epoch   5 |   680/  891 batches | accuracy    0.898 | loss    0.329 | zero gradients percentage    0.978


694it [00:41, 17.87it/s]

| epoch   5 |   690/  891 batches | accuracy    0.905 | loss    0.295 | zero gradients percentage    0.978


703it [00:41, 18.83it/s]

| epoch   5 |   700/  891 batches | accuracy    0.892 | loss    0.340 | zero gradients percentage    0.978


714it [00:42, 19.16it/s]

| epoch   5 |   710/  891 batches | accuracy    0.899 | loss    0.300 | zero gradients percentage    0.978


723it [00:42, 18.67it/s]

| epoch   5 |   720/  891 batches | accuracy    0.895 | loss    0.324 | zero gradients percentage    0.978


734it [00:43, 18.78it/s]

| epoch   5 |   730/  891 batches | accuracy    0.898 | loss    0.316 | zero gradients percentage    0.978


744it [00:44, 18.36it/s]

| epoch   5 |   740/  891 batches | accuracy    0.890 | loss    0.330 | zero gradients percentage    0.978


754it [00:44, 17.86it/s]

| epoch   5 |   750/  891 batches | accuracy    0.895 | loss    0.339 | zero gradients percentage    0.978


763it [00:45, 18.35it/s]

| epoch   5 |   760/  891 batches | accuracy    0.882 | loss    0.371 | zero gradients percentage    0.978


773it [00:45, 18.86it/s]

| epoch   5 |   770/  891 batches | accuracy    0.886 | loss    0.424 | zero gradients percentage    0.978


783it [00:46, 14.92it/s]

| epoch   5 |   780/  891 batches | accuracy    0.903 | loss    0.304 | zero gradients percentage    0.978


793it [00:47, 13.89it/s]

| epoch   5 |   790/  891 batches | accuracy    0.878 | loss    0.377 | zero gradients percentage    0.978


803it [00:47, 14.45it/s]

| epoch   5 |   800/  891 batches | accuracy    0.893 | loss    0.346 | zero gradients percentage    0.978


813it [00:48, 13.66it/s]

| epoch   5 |   810/  891 batches | accuracy    0.898 | loss    0.314 | zero gradients percentage    0.978


823it [00:49, 14.86it/s]

| epoch   5 |   820/  891 batches | accuracy    0.884 | loss    0.350 | zero gradients percentage    0.978


833it [00:49, 13.93it/s]

| epoch   5 |   830/  891 batches | accuracy    0.878 | loss    0.381 | zero gradients percentage    0.978


843it [00:50, 13.46it/s]

| epoch   5 |   840/  891 batches | accuracy    0.909 | loss    0.317 | zero gradients percentage    0.978


853it [00:51, 13.87it/s]

| epoch   5 |   850/  891 batches | accuracy    0.892 | loss    0.354 | zero gradients percentage    0.978


865it [00:51, 18.56it/s]

| epoch   5 |   860/  891 batches | accuracy    0.905 | loss    0.317 | zero gradients percentage    0.978


873it [00:52, 18.54it/s]

| epoch   5 |   870/  891 batches | accuracy    0.890 | loss    0.379 | zero gradients percentage    0.978


883it [00:52, 19.40it/s]

| epoch   5 |   880/  891 batches | accuracy    0.887 | loss    0.332 | zero gradients percentage    0.978


891it [00:53, 16.72it/s]


| epoch   5 |   890/  891 batches | accuracy    0.885 | loss    0.381 | zero gradients percentage    0.978
-----------------------------------------------------------
| end of epoch   5 | time: 53.94s | valid accuracy    0.892 
-----------------------------------------------------------
Checking the results of test dataset.
test accuracy    0.889
test loss    0.003
