In [8]:
import torch
import torch.nn as nn
import os
from torch import optim
from sklearn.metrics import confusion_matrix
from sklearn import neighbors
import time
from prettytable import PrettyTable
from adniLoader import *
import glob

# Ask PyTorch's CUDA caching allocator to release its unused cached memory before the notebook starts
torch.cuda.empty_cache()

In [9]:
#@title ViT Implementation 🔥
import math
import torch
from torch import nn


class NewGELUActivation(nn.Module):
    """
    Tanh-based approximation of the Gaussian Error Linear Unit (GELU).

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))`` element-wise.
    See the GELU paper: https://arxiv.org/abs/1606.08415

    Taken from https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py
    """

    def forward(self, input):
        # Cubic correction term of the tanh approximation
        cubic_term = 0.044715 * torch.pow(input, 3.0)
        tanh_argument = math.sqrt(2.0 / math.pi) * (input + cubic_term)
        return 0.5 * input * (1.0 + torch.tanh(tanh_argument))


class PatchEmbeddings(nn.Module):
    """
    Split an image into non-overlapping square patches and embed each patch as a vector.

    Reads ``image_size``, ``patch_size``, ``num_channels`` and ``hidden_size`` from ``config``.
    The patch extraction and linear projection are done in one step by a strided convolution
    whose kernel size and stride both equal ``patch_size``.
    """

    def __init__(self, config):
        super().__init__()
        image_size = config["image_size"]
        patch_size = config["patch_size"]
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = config["num_channels"]
        self.hidden_size = config["hidden_size"]
        # Patches per side is image_size // patch_size; the grid is square
        self.num_patches = (image_size // patch_size) ** 2
        # One convolution output position per patch, hidden_size output channels
        self.projection = nn.Conv2d(
            self.num_channels,
            self.hidden_size,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        # (batch_size, num_channels, image_size, image_size) -> (batch_size, hidden_size, grid, grid)
        patch_grid = self.projection(x)
        # Collapse the spatial grid into one patch axis, then put it before the feature axis:
        # -> (batch_size, num_patches, hidden_size)
        patch_tokens = torch.flatten(patch_grid, start_dim=2)
        return patch_tokens.transpose(1, 2)


class Embeddings(nn.Module):
    """
    Build the transformer input sequence: patch embeddings, a prepended [CLS] token,
    learned position embeddings, and dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        hidden_size = config["hidden_size"]
        self.patch_embeddings = PatchEmbeddings(config)
        # Learnable [CLS] token; as in BERT it is placed first in the sequence and its
        # final representation is used to classify the whole input
        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_size))
        # One position embedding per patch plus one for the [CLS] token
        sequence_length = self.patch_embeddings.num_patches + 1
        self.position_embeddings = nn.Parameter(torch.randn(1, sequence_length, hidden_size))
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x):
        patch_tokens = self.patch_embeddings(x)
        batch_size, _, _ = patch_tokens.size()
        # (1, 1, hidden_size) -> (batch_size, 1, hidden_size), without copying the parameter
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        # Sequence length becomes num_patches + 1
        tokens = torch.cat((cls_tokens, patch_tokens), dim=1)
        # Position embeddings broadcast over the batch dimension
        return self.dropout(tokens + self.position_embeddings)


class AttentionHead(nn.Module):
    """
    One scaled dot-product self-attention head.
    Instances of this module are the building blocks of MultiHeadAttention.

    Args:
        hidden_size: feature size of the incoming tokens.
        attention_head_size: feature size of this head's query/key/value projections.
        dropout: dropout probability applied to the attention probabilities.
        bias: whether the query/key/value projections carry a bias term.
    """
    def __init__(self, hidden_size, attention_head_size, dropout, bias=True):
        super().__init__()
        self.hidden_size = hidden_size
        self.attention_head_size = attention_head_size
        # Separate linear projections for query, key and value
        self.query = nn.Linear(hidden_size, attention_head_size, bias=bias)
        self.key = nn.Linear(hidden_size, attention_head_size, bias=bias)
        self.value = nn.Linear(hidden_size, attention_head_size, bias=bias)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention: query, key and value are all projections of the same input
        # (batch_size, sequence_length, hidden_size) -> (batch_size, sequence_length, attention_head_size)
        q, k, v = self.query(x), self.key(x), self.value(x)
        # softmax(Q @ K^T / sqrt(head_size)) @ V
        scale = math.sqrt(self.attention_head_size)
        scores = torch.matmul(q, k.transpose(-1, -2)) / scale
        probs = self.dropout(nn.functional.softmax(scores, dim=-1))
        context = torch.matmul(probs, v)
        return (context, probs)


class MultiHeadAttention(nn.Module):
    """
    Multi-head self-attention built from independent AttentionHead modules.
    Used by Block when the faster fused implementation is not selected.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config["hidden_size"]
        self.num_attention_heads = config["num_attention_heads"]
        # Each head works on hidden_size // num_attention_heads features
        self.attention_head_size = self.hidden_size // self.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Whether the query/key/value projections use a bias
        self.qkv_bias = config["qkv_bias"]
        attention_dropout = config["attention_probs_dropout_prob"]
        self.heads = nn.ModuleList([
            AttentionHead(self.hidden_size, self.attention_head_size, attention_dropout, self.qkv_bias)
            for _ in range(self.num_attention_heads)
        ])
        # Maps the concatenated head outputs back to hidden_size
        # (all_head_size equals hidden_size whenever hidden_size divides evenly by the head count)
        self.output_projection = nn.Linear(self.all_head_size, self.hidden_size)
        self.output_dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x, output_attentions=False):
        # Each head returns (attention_output, attention_probs)
        per_head = [head(x) for head in self.heads]
        # Concatenate head outputs along the feature axis, then project back to hidden_size
        merged = torch.cat([head_output for head_output, _ in per_head], dim=-1)
        projected = self.output_dropout(self.output_projection(merged))
        if output_attentions:
            # Stack per-head probabilities along a new head axis (dim=1)
            stacked_probs = torch.stack([head_probs for _, head_probs in per_head], dim=1)
            return (projected, stacked_probs)
        return (projected, None)


class FasterMultiHeadAttention(nn.Module):
    """
    Multi-head self-attention with a single fused query/key/value projection.
    All heads are computed together in batched matrix multiplications instead of one module per head.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config["hidden_size"]
        self.num_attention_heads = config["num_attention_heads"]
        # Each head works on hidden_size // num_attention_heads features
        self.attention_head_size = self.hidden_size // self.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        # Whether the fused query/key/value projection uses a bias
        self.qkv_bias = config["qkv_bias"]
        # One linear layer produces query, key and value for every head at once
        self.qkv_projection = nn.Linear(self.hidden_size, 3 * self.all_head_size, bias=self.qkv_bias)
        self.attn_dropout = nn.Dropout(config["attention_probs_dropout_prob"])
        # Maps the merged head outputs back to hidden_size
        self.output_projection = nn.Linear(self.all_head_size, self.hidden_size)
        self.output_dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x, output_attentions=False):
        # (batch_size, sequence_length, hidden_size) -> (batch_size, sequence_length, all_head_size * 3)
        fused = self.qkv_projection(x)
        # Three equal slices along the feature axis: query, key, value
        query, key, value = fused.chunk(3, dim=-1)
        batch_size, sequence_length, _ = query.size()
        per_head_shape = (batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
        # -> (batch_size, num_attention_heads, sequence_length, attention_head_size)
        query, key, value = (
            tensor.view(*per_head_shape).transpose(1, 2) for tensor in (query, key, value)
        )
        # softmax(Q @ K^T / sqrt(head_size)) @ V, batched over heads
        scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.attention_head_size)
        attention_probs = self.attn_dropout(nn.functional.softmax(scores, dim=-1))
        context = torch.matmul(attention_probs, value)
        # Merge heads back: (batch, heads, seq, head_size) -> (batch, seq, all_head_size).
        # contiguous() is required because view() cannot operate on the transposed layout.
        context = context.transpose(1, 2).contiguous()
        context = context.view(batch_size, sequence_length, self.all_head_size)
        attention_output = self.output_dropout(self.output_projection(context))
        # Attention probabilities are only handed back when explicitly requested
        return (attention_output, attention_probs if output_attentions else None)


class MLP(nn.Module):
    """
    Position-wise feed-forward network: Linear -> GELU -> Linear -> Dropout.

    Expands from ``hidden_size`` to ``intermediate_size`` and projects back to ``hidden_size``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config["hidden_size"]
        intermediate_size = config["intermediate_size"]
        self.dense_1 = nn.Linear(hidden_size, intermediate_size)
        self.activation = NewGELUActivation()
        self.dense_2 = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x):
        expanded = self.activation(self.dense_1(x))
        return self.dropout(self.dense_2(expanded))


class Block(nn.Module):
    """
    One pre-norm transformer block: LayerNorm -> attention -> residual, then
    LayerNorm -> MLP -> residual.

    ``config["use_faster_attention"]`` (default False) selects FasterMultiHeadAttention
    instead of MultiHeadAttention.
    """

    def __init__(self, config):
        super().__init__()
        self.use_faster_attention = config.get("use_faster_attention", False)
        attention_cls = FasterMultiHeadAttention if self.use_faster_attention else MultiHeadAttention
        self.attention = attention_cls(config)
        self.layernorm_1 = nn.LayerNorm(config["hidden_size"])
        self.mlp = MLP(config)
        self.layernorm_2 = nn.LayerNorm(config["hidden_size"])

    def forward(self, x, output_attentions=False):
        # Attention sub-layer on the normalized input, added back through a skip connection
        normed = self.layernorm_1(x)
        attention_output, attention_probs = self.attention(normed, output_attentions=output_attentions)
        x = x + attention_output
        # Feed-forward sub-layer on the normalized result, again with a skip connection
        x = x + self.mlp(self.layernorm_2(x))
        # Attention probabilities are only handed back when explicitly requested
        return (x, attention_probs if output_attentions else None)


class Encoder(nn.Module):
    """
    Stack of ``config["num_hidden_layers"]`` transformer blocks applied in sequence.
    """

    def __init__(self, config):
        super().__init__()
        self.blocks = nn.ModuleList([Block(config) for _ in range(config["num_hidden_layers"])])

    def forward(self, x, output_attentions=False):
        # Attention probabilities of each block, in block order (filled only on request)
        collected_attentions = []
        for block in self.blocks:
            x, block_attention = block(x, output_attentions=output_attentions)
            if output_attentions:
                collected_attentions.append(block_attention)
        # Second element is a list with one entry per block, or None when not requested
        return (x, collected_attentions if output_attentions else None)


class ViTForClassfication(nn.Module):
    """
    Vision Transformer classifier: embeddings -> encoder -> linear head on the [CLS] token.

    ``forward`` returns a tuple ``(logits, all_attentions)``; ``all_attentions`` is None
    unless ``output_attentions=True``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.image_size = config["image_size"]
        self.hidden_size = config["hidden_size"]
        self.num_classes = config["num_classes"]
        # Patch + [CLS] + position embeddings
        self.embedding = Embeddings(config)
        # Stack of transformer blocks
        self.encoder = Encoder(config)
        # Classification head: hidden_size -> num_classes
        self.classifier = nn.Linear(self.hidden_size, self.num_classes)
        # Re-initialize every submodule with the scheme in _init_weights
        self.apply(self._init_weights)

    def forward(self, x, output_attentions=False):
        tokens = self.embedding(x)
        encoded, all_attentions = self.encoder(tokens, output_attentions=output_attentions)
        # The [CLS] token sits at sequence position 0; its output is the classification feature
        cls_features = encoded[:, 0, :]
        logits = self.classifier(cls_features)
        return (logits, all_attentions if output_attentions else None)

    def _init_weights(self, module):
        """Initialize ``module`` in place; called once per submodule through ``self.apply``."""
        std = self.config["initializer_range"]
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, Embeddings):
            # Truncated-normal init is computed in float32 and cast back to the parameter's dtype
            for parameter in (module.position_embeddings, module.cls_token):
                parameter.data = nn.init.trunc_normal_(
                    parameter.data.to(torch.float32),
                    mean=0.0,
                    std=std,
                ).to(parameter.dtype)

In [10]:
# Hyper-parameters for the ViT model defined above, plus batch size and learning rate.
config = {
    'batch_size' : 4,
    'image_size' : 180,
    'patch_size' : 15,  # 180 // 15 = 12 patches per side, 144 patches in total
    "hidden_size": 96,
    "num_hidden_layers": 4,
    "num_attention_heads": 6,  # 96 // 6 = 16 features per attention head
    "intermediate_size": 2 * 96, # 2 * hidden_size (hidden_size is 96)
    "hidden_dropout_prob": 0.2,
    "attention_probs_dropout_prob": 0.2,
    "initializer_range": 0.02,  # std used by ViTForClassfication._init_weights
    "num_classes": 3, # matches the three class labels ('CN', 'MCI', 'AD') used in this notebook
    "num_channels": 1,
    "qkv_bias": True,
    "use_faster_attention": True,  # selects FasterMultiHeadAttention inside Block
    'lr' : 1e-3
}

In [24]:
#@title Prepare Data 📊
# Import libraries
import torch
import torchvision
import torchvision.transforms as transforms

class FolderDataset(Dataset):
    """
    Dataset over serialized ``.pt`` files laid out as ``<folder>/<subfolder>/<name>.pt``.

    Each item is whatever object was stored in the file, as returned by ``torch.load``.
    NOTE(review): the training loops in this notebook unpack every batch as
    ``(images, labels)``, so each file presumably holds an (image, label) pair — confirm.

    Args:
        folder: root directory that contains one sub-directory level of ``.pt`` files.
    """
    def __init__(self, folder):
        self.folder = folder
        # glob returns paths that already include ``folder``; sorting makes the
        # index -> file mapping deterministic across runs and file systems.
        self.image_paths = sorted(glob.glob(f'{self.folder}/*/*.pt'))

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # The stored path is already complete. Prefixing ``folder`` again (as the previous
        # version did) yields a non-existent path and a FileNotFoundError.
        return torch.load(self.image_paths[idx])
    
def prepare_data(batch_size=4, num_workers=2, train_sample_size=None, test_sample_size=None,
                 train_dir='/home/arindam/Alzheimer/ViT/data/3D/Test',
                 val_dir='/home/arindam/Alzheimer/ViT/data/3D/Test',
                 test_dir='/home/arindam/Alzheimer/ViT/data/3D/Test'):
    """
    Build the train, validation and test DataLoaders from folders of ``.pt`` files.

    Args:
        batch_size: batch size used by all three loaders. Previously this argument was
            ignored in favour of the global ``config['batch_size']``; pass
            ``batch_size=config['batch_size']`` to follow the config.
        num_workers: number of DataLoader worker processes (previously ignored).
        train_sample_size: currently unused; kept for interface compatibility.
        test_sample_size: currently unused; kept for interface compatibility.
        train_dir: folder holding the training split.
        val_dir: folder holding the validation split.
        test_dir: folder holding the test split.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.

    NOTE(review): all three directory defaults point at the same ``Test`` folder, exactly
    as the previous hard-coded paths did, so validation/test metrics are computed on the
    training data unless separate folders are passed in.
    """
    train_dataset = FolderDataset(folder=train_dir)
    val_dataset = FolderDataset(folder=val_dir)
    test_dataset = FolderDataset(folder=test_dir)

    loader_options = {'batch_size': batch_size, 'num_workers': num_workers}
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    valid_loader = DataLoader(val_dataset, shuffle=True, **loader_options)
    # The test split keeps file order so predictions can be matched back to files
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

    return train_loader, valid_loader, test_loader

In [25]:
#@title Utils 🛠️
import json, os, math
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.nn import functional as F
import torchvision
import torchvision.transforms as transforms


def save_experiment(experiment_name, config, model, train_losses, test_losses, accuracies, base_dir="experiments"):
    """
    Persist an experiment under ``<base_dir>/<experiment_name>``.

    Writes ``config.json`` (the config dict), ``metrics.json`` (train/test losses and
    accuracies) and, through ``save_checkpoint``, the model weights as ``model_final.pt``.
    The directory is created if it does not exist.
    """
    outdir = os.path.join(base_dir, experiment_name)
    os.makedirs(outdir, exist_ok=True)

    metrics = {
        'train_losses': train_losses,
        'test_losses': test_losses,
        'accuracies': accuracies,
    }
    # Config first, then metrics; both use the same JSON formatting
    for filename, payload in (('config.json', config), ('metrics.json', metrics)):
        with open(os.path.join(outdir, filename), 'w') as f:
            json.dump(payload, f, sort_keys=True, indent=4)

    # Model weights go last, under the fixed checkpoint tag "final"
    save_checkpoint(experiment_name, model, "final", base_dir=base_dir)


def save_checkpoint(experiment_name, model, epoch, base_dir="experiments"):
    """
    Save ``model.state_dict()`` to ``<base_dir>/<experiment_name>/model_<epoch>.pt``.

    ``epoch`` is interpolated into the file name as-is, so it may be a number or a tag
    such as ``"final"``. The target directory is created if missing.
    """
    checkpoint_dir = os.path.join(base_dir, experiment_name)
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, f'model_{epoch}.pt')
    torch.save(model.state_dict(), checkpoint_path)


def load_experiment(experiment_name, checkpoint_name="model_final.pt", base_dir="experiments"):
    """
    Reload an experiment written by ``save_experiment``.

    Args:
        experiment_name: sub-directory of ``base_dir`` that holds the experiment.
        checkpoint_name: file name of the weights to load inside that directory.
        base_dir: root directory of all experiments.

    Returns:
        Tuple ``(config, model, train_losses, test_losses, accuracies)`` where ``model``
        is a freshly built ``ViTForClassfication`` with the checkpoint weights loaded.
        The model is left on the CPU; move it to another device afterwards if needed.
    """
    outdir = os.path.join(base_dir, experiment_name)
    # Load the config
    configfile = os.path.join(outdir, 'config.json')
    with open(configfile, 'r') as f:
        config = json.load(f)
    # Load the metrics
    jsonfile = os.path.join(outdir, 'metrics.json')
    with open(jsonfile, 'r') as f:
        data = json.load(f)
    train_losses = data['train_losses']
    test_losses = data['test_losses']
    accuracies = data['accuracies']
    # Load the model
    model = ViTForClassfication(config)
    cpfile = os.path.join(outdir, checkpoint_name)
    # map_location="cpu" lets a checkpoint saved from a CUDA model load on a machine without
    # a GPU. The new model lives on the CPU anyway, so this does not change the result.
    state_dict = torch.load(cpfile, map_location="cpu")
    model.load_state_dict(state_dict)
    return config, model, train_losses, test_losses, accuracies


def visualize_images(dataset, num_samples=30):
    """
    Show a random selection of dataset samples in a grid, titled with their class names.

    Args:
        dataset: indexable dataset whose items start with ``(image, label, ...)``.
        num_samples: maximum number of samples to draw. The default of 30 fills the
            6 x 5 grid this function has always laid out; fewer are shown when the
            dataset is smaller.

    The previous version selected only 4 samples but looped over 30 grid cells, which
    raised IndexError on the fifth cell.
    """
    classes = ('CN', 'MCI', 'AD')
    num_cols = 5

    # Never ask for more samples than the dataset holds
    count = min(num_samples, len(dataset))
    indices = torch.randperm(len(dataset))[:count].tolist()
    # Fetch each sample once; indexing may trigger a file load
    samples = [dataset[i] for i in indices]

    num_rows = max(1, math.ceil(count / num_cols))
    fig = plt.figure(figsize=(10, 10))
    for position, sample in enumerate(samples, start=1):
        image, label = sample[0], sample[1]
        ax = fig.add_subplot(num_rows, num_cols, position, xticks=[], yticks=[])
        # NOTE(review): assumes each image is an array imshow can draw (2-D or RGB) —
        # confirm for volumetric data.
        ax.imshow(np.asarray(image))
        ax.set_title(classes[int(label)])

In [26]:
# Build the three DataLoaders using prepare_data's default arguments
train_loader, valid_loader, test_loader = prepare_data()

In [5]:
# NOTE(review): `model` is never assigned in the visible cells and the recorded output of this
# cell is a NameError; the model must be instantiated before this runs — confirm where.
print(model)

NameError: name 'model' is not defined

In [6]:
# Get parameters for each layer of the model in a tabular format

def count_parameters(model):
    """
    Print a table of every trainable parameter of ``model`` with its element count,
    followed by the total number of trainable elements.

    Parameters with ``requires_grad=False`` are left out of both the table and the total.
    """
    trainable = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    table = PrettyTable(["Modules", "Parameters"])
    for name, size in trainable:
        table.add_row([name, size])
    total_params = sum(size for _, size in trainable)
    print(table)
    print(f"Total Trainable Params: {total_params}")

# Print the per-parameter table for the current `model` (it must already exist in the session)
count_parameters(model)

+----------------------------------------+------------+
|                Modules                 | Parameters |
+----------------------------------------+------------+
|             pos_embedding              |   921856   |
|               cls_token                |    256     |
|      to_patch_embedding.1.weight       |    500     |
|       to_patch_embedding.1.bias        |    500     |
|      to_patch_embedding.2.weight       |   128000   |
|       to_patch_embedding.2.bias        |    256     |
|      to_patch_embedding.3.weight       |    256     |
|       to_patch_embedding.3.bias        |    256     |
|   transformer.layers.0.0.norm.weight   |    256     |
|    transformer.layers.0.0.norm.bias    |    256     |
|  transformer.layers.0.0.to_qkv.weight  |   196608   |
| transformer.layers.0.0.to_out.0.weight |   65536    |
|  transformer.layers.0.0.to_out.0.bias  |    256     |
|  transformer.layers.0.1.net.0.weight   |    256     |
|   transformer.layers.0.1.net.0.bias    |    25

In [7]:
# Load the datasets
# NOTE(review): DATA_PATH is built here but not passed to LoadDatasets in this cell.
DATA_PATH = os.path.join('/home/arindam/Alzheimer/Data/adni1-complete-3yr-1-5t', 'ADNI')
# NOTE(review): this rebinds the global `config`, replacing the ViT hyper-parameter dict
# defined earlier in the notebook. Anything that later reads keys such as "hidden_size"
# from `config` will fail; consider using a separate name for the loader settings.
config = {
    'img_size': 128,
    'depth' : 128,
    'batch_size' : 16
}

# The dict above is not passed to LoadDatasets; per the original note, the batch size, image
# size and depth have to be changed inside the data loader module itself.
# NOTE(review): the recorded sample batch has shape [2, 1, 150, 150, 80], which does not
# match the values above — confirm which settings are actually in effect.
train_loader, valid_loader, test_loader = LoadDatasets(return_type='loader')

Total number of images:  2182
Total number of train, validation and test images are 1745, 218 and 219 respectively.


In [8]:
# Peek at the first batch only, to confirm the tensor shapes coming out of the loader
for images, labels in train_loader:
    print(f"Shape of images and labels of a signle batch is {images.shape} and {labels.shape} respectively.")
    break

Shape of images and labels of a signle batch is torch.Size([2, 1, 150, 150, 80]) and torch.Size([2]) respectively.


In [9]:
# Optimizer: Adam over the trainable parameters only
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=1e-3, weight_decay=5e-6)

# Learning rate is multiplied by 0.1 after epoch 20 and again after epoch 30
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 30], gamma=0.1)
loss_fn = nn.functional.cross_entropy

In [10]:
def train(train_loader):
    """
    Run one training epoch over ``train_loader`` and step the LR scheduler once at the end.

    Relies on module-level globals: ``model``, ``device``, ``optimizer``, ``scheduler``,
    ``loss_fn`` and ``epoch`` (the latter only for log messages).
    NOTE(review): ``device`` is not defined in the visible cells — confirm where it is set.
    NOTE(review): the model output is passed straight to ``loss_fn``, so the model must
    return bare logits; ``ViTForClassfication.forward`` returns a ``(logits, attentions)``
    tuple and would need unpacking.

    Returns:
        Tuple ``(average_loss, accuracy_percent)``: the sample-weighted mean loss as a
        float, and the accuracy in percent as a 0-dim tensor.
    """
    model.train()
    start = time.time()
    train_loss, n_samples = 0, 0
    correct = 0
    for batch_idx, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(images)
        loss = loss_fn(output, labels, reduction='mean')
        loss.backward()
        optimizer.step()
        # Seconds elapsed since the epoch started; divided by the batch count when logged
        time_iter = time.time() - start
        batch_len = len(output)
        # The loss is a batch mean, so weight it by batch size to get a per-sample average later
        train_loss += loss.item() * batch_len
        n_samples += batch_len
        predicted = torch.argmax(output, dim=1)
        correct += (predicted == labels).sum()
        # Log every 50th batch and the final batch of the epoch
        is_log_step = batch_idx % 50 == 0 or batch_idx == len(train_loader) - 1
        if is_log_step:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} (avg: {:.6f}) \tsec/iter: {:.4f} s\t Accuracy (avg) {:.3f}'.format(
                epoch+1, n_samples, len(train_loader.dataset),
                100. * (batch_idx + 1) / len(train_loader), train_loss, train_loss / n_samples, time_iter / (batch_idx + 1), 100*(correct/n_samples) ))

    scheduler.step()
    print(f"Took {time.time() - start} sec for this epoch")

    return train_loss / n_samples, 100*(correct/n_samples)


def validation(valid_loader):
    """
    Evaluate the global ``model`` on ``valid_loader`` and print a one-line summary.

    Relies on module-level globals: ``model``, ``device``, ``loss_fn`` and ``epoch``
    (the latter only for the log message).

    Returns:
        Tuple ``(average_loss, accuracy_percent)``: the per-sample mean loss as a float,
        and the accuracy in percent as a 0-dim tensor.
    """
    model.eval()
    start = time.time()
    valid_loss, correct, n_samples = 0, 0, 0
    # Evaluation needs no gradients. Disabling autograd avoids building a graph for every
    # batch, which saves memory and time. The optimizer is not touched here; the previous
    # ``optimizer.zero_grad()`` call had no effect on evaluation.
    with torch.no_grad():
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device)
            output = model(images)
            # Summed (not averaged) so batches of different sizes are weighted correctly
            loss = loss_fn(output, labels, reduction='sum')
            valid_loss += loss.item()
            n_samples += len(output)
            pred = torch.argmax(output, dim=1)
            correct += (pred == labels).sum()

    time_iter = time.time() - start

    acc = 100. * correct / n_samples
    print('Validation set (epoch {}): Loss: {:.4f}, Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%) Took {} sec\n'.format(
        epoch+1, valid_loss, valid_loss / n_samples, correct, n_samples, acc, time_iter))
    return valid_loss/n_samples, acc


def test(test_loader):
    """
    Evaluate the global ``model`` on ``test_loader`` and print a one-line summary.

    Relies on module-level globals: ``model``, ``device`` and ``loss_fn``.

    Returns:
        Tuple ``(average_loss, accuracy_percent)``: the per-sample mean loss as a float,
        and the accuracy in percent as a 0-dim tensor.
    """
    print('Test model ...')
    model.eval()
    start = time.time()
    test_loss, correct, n_samples = 0, 0, 0
    # Evaluation needs no gradients. Disabling autograd avoids building a graph for every
    # batch, which saves memory and time. The optimizer is not touched here; the previous
    # ``optimizer.zero_grad()`` call had no effect on evaluation.
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            output = model(images)
            # Summed (not averaged) so batches of different sizes are weighted correctly
            loss = loss_fn(output, labels, reduction='sum')
            test_loss += loss.item()
            n_samples += len(output)
            pred = torch.argmax(output, dim=1)
            correct += (pred == labels).sum()

    time_iter = time.time() - start

    avg_test_loss = test_loss / n_samples

    acc = 100. * (correct / n_samples)
    print('Test set: Loss: {:.4f}, Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%). Took {} sec time'.format(
        test_loss, avg_test_loss, correct, n_samples, acc, time_iter))
    return avg_test_loss, acc

In [None]:
epochs=50 # Number of epochs
# Train for `epochs` epochs, running validation after every epoch.
# `epoch` is deliberately a module-level name: train() and validation() read it for their logs.
# NOTE(review): this loop does not save any checkpoint, so the model with the best validation
# accuracy is not kept — add saving here if that is intended.
# The recorded output below stops at epoch 22 even though `epochs` is 50.
for epoch in range(epochs):
    train_loss, train_acc = train(train_loader)
    valid_loss, val_acc = validation(valid_loader)
    #wandb.log({"train_loss": train_loss, "train_acc": train_acc, "valid_loss": valid_loss, "val_acc": val_acc})
    
    
# Test the model
test_loss, test_acc = test(test_loader)
#wandb.log({"test_loss": test_loss, "test_acc": test_acc})
# Mark the run as finished
#wandb.finish()

Took 172.5844566822052 sec for this epoch
Validation set (epoch 1): Loss: 229.2335, Average loss: 1.0515, Accuracy: 75/218 (34.40%) Took 17.730172634124756 sec

Took 171.35476088523865 sec for this epoch
Validation set (epoch 2): Loss: 234.3152, Average loss: 1.0748, Accuracy: 102/218 (46.79%) Took 17.407451391220093 sec

Took 171.00751733779907 sec for this epoch
Validation set (epoch 3): Loss: 238.8868, Average loss: 1.0958, Accuracy: 102/218 (46.79%) Took 17.389522790908813 sec



Took 170.93700218200684 sec for this epoch
Validation set (epoch 4): Loss: 229.2018, Average loss: 1.0514, Accuracy: 102/218 (46.79%) Took 17.414711236953735 sec

Took 170.87346267700195 sec for this epoch
Validation set (epoch 5): Loss: 226.9662, Average loss: 1.0411, Accuracy: 102/218 (46.79%) Took 17.508517503738403 sec

Took 170.85209608078003 sec for this epoch
Validation set (epoch 6): Loss: 226.4400, Average loss: 1.0387, Accuracy: 102/218 (46.79%) Took 17.40999460220337 sec

Took 170.75432801246643 sec for this epoch
Validation set (epoch 7): Loss: 226.7366, Average loss: 1.0401, Accuracy: 102/218 (46.79%) Took 17.453594207763672 sec



Took 170.87572288513184 sec for this epoch
Validation set (epoch 8): Loss: 234.8543, Average loss: 1.0773, Accuracy: 102/218 (46.79%) Took 17.45076823234558 sec

Took 170.71589946746826 sec for this epoch
Validation set (epoch 9): Loss: 226.0383, Average loss: 1.0369, Accuracy: 102/218 (46.79%) Took 17.358460187911987 sec

Took 169.7118444442749 sec for this epoch
Validation set (epoch 10): Loss: 228.7719, Average loss: 1.0494, Accuracy: 102/218 (46.79%) Took 17.373456954956055 sec

Took 169.9351065158844 sec for this epoch
Validation set (epoch 11): Loss: 226.2734, Average loss: 1.0380, Accuracy: 102/218 (46.79%) Took 17.271530628204346 sec



Took 167.9944715499878 sec for this epoch
Validation set (epoch 12): Loss: 227.0575, Average loss: 1.0415, Accuracy: 102/218 (46.79%) Took 17.092297315597534 sec

Took 168.14068460464478 sec for this epoch
Validation set (epoch 13): Loss: 226.3860, Average loss: 1.0385, Accuracy: 102/218 (46.79%) Took 17.107948064804077 sec

Took 167.95915150642395 sec for this epoch
Validation set (epoch 14): Loss: 228.6260, Average loss: 1.0487, Accuracy: 102/218 (46.79%) Took 17.410542726516724 sec



Took 170.84360241889954 sec for this epoch
Validation set (epoch 15): Loss: 228.2971, Average loss: 1.0472, Accuracy: 102/218 (46.79%) Took 17.635475397109985 sec

Took 170.0574769973755 sec for this epoch
Validation set (epoch 16): Loss: 226.0839, Average loss: 1.0371, Accuracy: 102/218 (46.79%) Took 17.152417421340942 sec

Took 170.21950340270996 sec for this epoch
Validation set (epoch 17): Loss: 226.8249, Average loss: 1.0405, Accuracy: 102/218 (46.79%) Took 17.22439670562744 sec

Took 169.83249688148499 sec for this epoch
Validation set (epoch 18): Loss: 226.2927, Average loss: 1.0380, Accuracy: 102/218 (46.79%) Took 17.428093433380127 sec



Took 170.23045992851257 sec for this epoch
Validation set (epoch 19): Loss: 227.1256, Average loss: 1.0419, Accuracy: 102/218 (46.79%) Took 17.371288776397705 sec

Took 170.37030482292175 sec for this epoch
Validation set (epoch 20): Loss: 226.4703, Average loss: 1.0389, Accuracy: 102/218 (46.79%) Took 17.359925746917725 sec

Took 171.76086330413818 sec for this epoch
Validation set (epoch 21): Loss: 226.0733, Average loss: 1.0370, Accuracy: 102/218 (46.79%) Took 18.53914737701416 sec

Took 178.47098398208618 sec for this epoch
Validation set (epoch 22): Loss: 226.0664, Average loss: 1.0370, Accuracy: 102/218 (46.79%) Took 18.447630405426025 sec



