In [1]:
from utils.neural_net_tools import *
# from utils.multihead_attention_classifier import *
from utils.multihead_attention_classifier_v2 import *
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')


%load_ext autoreload
%autoreload 2

# ---------------------------------------------------------------------------
# Notebook-wide configuration: everything a reader might want to tune.
# ---------------------------------------------------------------------------

# Reproducibility: shared seed for the train/validation split.
seed = 1997

# Dataset location/format and how it is split and tokenised.
path_to_data, format_ = "dataset/text_molecule.csv", "csv"
split_ratio = 5 / 6   # fraction of the examples used for training; the rest is validation
min_freq_words = 1    # minimum token frequency for a token to enter the vocabulary
fix_length = 22       # fixed sequence length: 22 without "H", 51 with "H"

# Optimisation schedule.
batch_size = 384
n_epochs = 200

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Label 1: Normal (trained with MSE loss)

In [2]:
%%time

# Input sequences: batch-first tensors with a fixed length of `fix_length`;
# the per-example lengths are returned alongside each batch.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Target: a plain float per example (parsed with float(), no vocabulary).
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# CSV layout: column 1 is the text, column 2 is the label used in this section;
# the remaining three columns are ignored.
fields = [('text', TEXT), ('label', LABEL),
          (None, None), (None, None), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed Python's RNG first, then hand its state to split() explicitly; this
# yields the same split while making the intent visible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (training and
# validation examples together), not from `train_data` only.
TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Batch iterators: examples are bucketed by text length and each batch is
# sorted by that length; batches are created on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 938 ms


In [3]:
%%time

# Seed PyTorch from the notebook-wide `seed` constant (instead of repeating the
# literal) and request deterministic cuDNN kernels.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device chosen in the configuration cell, so the
# model and the batches produced by the iterators live on the same device.
# A hard-coded "cuda" here fails on machines without a GPU.
l1_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=256,
    num_layers=2,
    heads=8,
    device=device.type,  # "cuda" or "cpu": same string form as before
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l1_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l1_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Optimiser: Adam over all model parameters.
OPT = optim.Adam(l1_MultiheadAttentionClassifier.parameters(), lr=0.0001)

# Criterion: mean squared error between prediction and label.
CRIT = nn.MSELoss()

# Train for `n_epochs`, reporting losses only (no accuracy function), and save
# the model to `saving_path`.
MultiheadAttentionClassifier_train(l1_MultiheadAttentionClassifier,
                                   train_iterator,
                                   valid_iterator,
                                   n_epochs,
                                   OPT,
                                   CRIT,
                                   accuracy_function=None,
                                   save=True,
                                   saving_path="model_bck/l1_MultiheadAttentionClassifier_lr0.0001_B384_D0_GC_false.pt")

MultiheadAttentionClassifier(
  (encoder): Encoder(
    (tok_embedding): Embedding(15, 256)
    (pos_embedding): Embedding(22, 256)
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadScaledDotProductAttention(
          (V): Linear(in_features=256, out_features=256, bias=True)
          (K): Linear(in_features=256, out_features=256, bias=True)
          (Q): Linear(in_features=256, out_features=256, bias=True)
          (LayerNormalization): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (linear1): Linear(in_features=256, out_features=256, bias=True)
        )
        (LayerNormalization1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (LayerNormalization2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (linear_augmentation): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
        )

	No. epoch: 71/200                    
	   Train Loss: 0.005
	    Val. Loss: 0.01


	No. epoch: 72/200                    
	   Train Loss: 0.006
	    Val. Loss: 0.013


	No. epoch: 73/200                    
	   Train Loss: 0.003
	    Val. Loss: 0.01


	No. epoch: 74/200                    
	   Train Loss: 0.002
	    Val. Loss: 0.009


	No. epoch: 75/200                    
	   Train Loss: 0.001
	    Val. Loss: 0.009


	No. epoch: 76/200                    
	   Train Loss: 0.016
	    Val. Loss: 0.015


	No. epoch: 77/200                    
	   Train Loss: 0.006
	    Val. Loss: 0.01


	No. epoch: 78/200                    
	   Train Loss: 0.002
	    Val. Loss: 0.009


	No. epoch: 79/200                    
	   Train Loss: 0.001
	    Val. Loss: 0.009


	No. epoch: 80/200                    
	   Train Loss: 0.002
	    Val. Loss: 0.01


	No. epoch: 81/200                    
	   Train Loss: 0.002
	    Val. Loss: 0.009


	No. epoch: 82/200                    
	   Train Loss: 0.004
	    Val

	No. epoch: 168/200                   
	   Train Loss: 0.0
	    Val. Loss: 0.007


	No. epoch: 169/200                   
	   Train Loss: 0.001
	    Val. Loss: 0.008


	No. epoch: 170/200                   
	   Train Loss: 0.002
	    Val. Loss: 0.008


	No. epoch: 171/200                   
	   Train Loss: 0.003
	    Val. Loss: 0.008


	No. epoch: 172/200                   
	   Train Loss: 0.001
	    Val. Loss: 0.008


	No. epoch: 173/200                   
	   Train Loss: 0.002
	    Val. Loss: 0.007


	No. epoch: 174/200                   
	   Train Loss: 0.001
	    Val. Loss: 0.007


	No. epoch: 175/200                   
	   Train Loss: 0.001
	    Val. Loss: 0.007


	No. epoch: 176/200                   
	   Train Loss: 0.001
	    Val. Loss: 0.007


	No. epoch: 177/200                   
	   Train Loss: 0.0
	    Val. Loss: 0.007


	No. epoch: 178/200                   
	   Train Loss: 0.001
	    Val. Loss: 0.008


	No. epoch: 179/200                   
	   Train Loss: 0.002
	    Val

# Label 2: Poisson (trained with Poisson negative log-likelihood loss)

In [4]:
%%time

# Input sequences: batch-first tensors with a fixed length of `fix_length`;
# the per-example lengths are returned alongside each batch.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Target: a plain float per example (parsed with float(), no vocabulary).
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# CSV layout: column 1 is the text, column 3 is the label used in this section;
# the other three columns are ignored.
fields = [('text', TEXT), (None, None),
          ('label', LABEL), (None, None), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed Python's RNG first, then hand its state to split() explicitly; this
# yields the same split while making the intent visible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (training and
# validation examples together), not from `train_data` only.
TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Batch iterators: examples are bucketed by text length and each batch is
# sorted by that length; batches are created on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 1.19 s


In [5]:
%%time

# Seed PyTorch from the notebook-wide `seed` constant (instead of repeating the
# literal) and request deterministic cuDNN kernels.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device chosen in the configuration cell, so the
# model and the batches produced by the iterators live on the same device.
# A hard-coded "cuda" here fails on machines without a GPU.
l2_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=256,
    num_layers=2,
    heads=8,
    device=device.type,  # "cuda" or "cpu": same string form as before
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l2_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l2_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Optimiser: Adam over all model parameters.
OPT = optim.Adam(l2_MultiheadAttentionClassifier.parameters(), lr=0.0001)

# Criterion: Poisson negative log-likelihood.
CRIT = nn.PoissonNLLLoss()

# Train for `n_epochs`, reporting loss and `poisson_accuracy`, and save the
# model to `saving_path`.
MultiheadAttentionClassifier_train(l2_MultiheadAttentionClassifier,
                                   train_iterator,
                                   valid_iterator,
                                   n_epochs,
                                   OPT,
                                   CRIT,
                                   accuracy_function=poisson_accuracy,
                                   save=True,
                                   saving_path="model_bck/l2_MultiheadAttentionClassifier_lr0.0001_B384_D0_GC_false.pt")

MultiheadAttentionClassifier(
  (encoder): Encoder(
    (tok_embedding): Embedding(15, 256)
    (pos_embedding): Embedding(22, 256)
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadScaledDotProductAttention(
          (V): Linear(in_features=256, out_features=256, bias=True)
          (K): Linear(in_features=256, out_features=256, bias=True)
          (Q): Linear(in_features=256, out_features=256, bias=True)
          (LayerNormalization): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (linear1): Linear(in_features=256, out_features=256, bias=True)
        )
        (LayerNormalization1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (LayerNormalization2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (linear_augmentation): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
        )

	No. epoch: 48/200                    
	   Train Loss: 0.284 | Train Acc: 99.656%
	    Val. Loss: 0.287 |  Val. Acc: 98.711%


	No. epoch: 49/200                    
	   Train Loss: 0.283 | Train Acc: 99.724%
	    Val. Loss: 0.286 |  Val. Acc: 99.162%


	No. epoch: 50/200                    
	   Train Loss: 0.283 | Train Acc: 99.843%
	    Val. Loss: 0.286 |  Val. Acc: 99.409%


	No. epoch: 51/200                    
	   Train Loss: 0.283 | Train Acc: 99.791%
	    Val. Loss: 0.289 |  Val. Acc: 99.219%


	No. epoch: 52/200                    
	   Train Loss: 0.286 | Train Acc: 99.149%
	    Val. Loss: 0.31 |  Val. Acc: 95.433%


	No. epoch: 53/200                    
	   Train Loss: 0.309 | Train Acc: 95.084%
	    Val. Loss: 0.291 |  Val. Acc: 98.234%


	No. epoch: 54/200                    
	   Train Loss: 0.285 | Train Acc: 99.396%
	    Val. Loss: 0.288 |  Val. Acc: 98.787%


	No. epoch: 55/200                    
	   Train Loss: 0.284 | Train Acc: 99.661%
	    Val. Loss: 0.286 |  Val. 

	No. epoch: 113/200                   
	   Train Loss: 0.282 | Train Acc: 99.984%
	    Val. Loss: 0.285 |  Val. Acc: 99.657%


	No. epoch: 114/200                   
	   Train Loss: 0.282 | Train Acc: 99.949%
	    Val. Loss: 0.285 |  Val. Acc: 99.581%


	No. epoch: 115/200                   
	   Train Loss: 0.282 | Train Acc: 99.935%
	    Val. Loss: 0.286 |  Val. Acc: 99.549%


	No. epoch: 116/200                   
	   Train Loss: 0.282 | Train Acc: 99.982%
	    Val. Loss: 0.286 |  Val. Acc: 99.593%


	No. epoch: 117/200                   
	   Train Loss: 0.282 | Train Acc: 99.992%
	    Val. Loss: 0.286 |  Val. Acc: 99.632%


	No. epoch: 118/200                   
	   Train Loss: 0.281 | Train Acc: 99.99%
	    Val. Loss: 0.286 |  Val. Acc: 99.632%


	No. epoch: 119/200                   
	   Train Loss: 0.281 | Train Acc: 99.977%
	    Val. Loss: 0.286 |  Val. Acc: 99.282%


	No. epoch: 120/200                   
	   Train Loss: 0.306 | Train Acc: 95.615%
	    Val. Loss: 0.289 |  Val. 

	No. epoch: 178/200                   
	   Train Loss: 0.281 | Train Acc: 99.997%
	    Val. Loss: 0.285 |  Val. Acc: 99.682%


	No. epoch: 179/200                   
	   Train Loss: 0.281 | Train Acc: 99.994%
	    Val. Loss: 0.285 |  Val. Acc: 99.619%


	No. epoch: 180/200                   
	   Train Loss: 0.282 | Train Acc: 99.965%
	    Val. Loss: 0.285 |  Val. Acc: 99.657%


	No. epoch: 181/200                   
	   Train Loss: 0.284 | Train Acc: 99.319%
	    Val. Loss: 0.294 |  Val. Acc: 96.945%


	No. epoch: 182/200                   
	   Train Loss: 0.289 | Train Acc: 98.358%
	    Val. Loss: 0.288 |  Val. Acc: 99.06%


	No. epoch: 183/200                   
	   Train Loss: 0.283 | Train Acc: 99.749%
	    Val. Loss: 0.286 |  Val. Acc: 99.149%


	No. epoch: 184/200                   
	   Train Loss: 0.282 | Train Acc: 99.878%
	    Val. Loss: 0.285 |  Val. Acc: 99.587%


	No. epoch: 185/200                   
	   Train Loss: 0.282 | Train Acc: 99.974%
	    Val. Loss: 0.285 |  Val. 

# Label 3: Normal (trained with MSE loss)

In [6]:
%%time

# Input sequences: batch-first tensors with a fixed length of `fix_length`;
# the per-example lengths are returned alongside each batch.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Target: a plain float per example (parsed with float(), no vocabulary).
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# CSV layout: column 1 is the text, column 4 is the label used in this section;
# the other three columns are ignored.
fields = [('text', TEXT), (None, None), (None, None),
          ('label', LABEL), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed Python's RNG first, then hand its state to split() explicitly; this
# yields the same split while making the intent visible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (training and
# validation examples together), not from `train_data` only.
TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Batch iterators: examples are bucketed by text length and each batch is
# sorted by that length; batches are created on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 1.2 s


In [7]:
%%time

# Seed PyTorch from the notebook-wide `seed` constant (instead of repeating the
# literal) and request deterministic cuDNN kernels.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device chosen in the configuration cell, so the
# model and the batches produced by the iterators live on the same device.
# A hard-coded "cuda" here fails on machines without a GPU.
l3_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=256,
    num_layers=2,
    heads=8,
    device=device.type,  # "cuda" or "cpu": same string form as before
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l3_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l3_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Optimiser: Adam over all model parameters.
# NOTE(review): this run uses lr=0.00001, but the checkpoint file name below
# says "lr0.0001". Confirm which is intended; the path is left unchanged so
# that anything loading the existing file keeps working.
OPT = optim.Adam(l3_MultiheadAttentionClassifier.parameters(), lr=0.00001)

# Criterion: mean squared error between prediction and label.
CRIT = nn.MSELoss()

# Train for `n_epochs` with clipping_value=None, reporting losses only (no
# accuracy function), and save the model to `saving_path`.
MultiheadAttentionClassifier_train(l3_MultiheadAttentionClassifier,
                                   train_iterator,
                                   valid_iterator,
                                   n_epochs,
                                   OPT,
                                   CRIT,
                                   clipping_value=None,
                                   accuracy_function=None,
                                   save=True,
                                   saving_path="model_bck/l3_MultiheadAttentionClassifier_lr0.0001_B384_D0_GC_false.pt")

MultiheadAttentionClassifier(
  (encoder): Encoder(
    (tok_embedding): Embedding(15, 256)
    (pos_embedding): Embedding(22, 256)
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadScaledDotProductAttention(
          (V): Linear(in_features=256, out_features=256, bias=True)
          (K): Linear(in_features=256, out_features=256, bias=True)
          (Q): Linear(in_features=256, out_features=256, bias=True)
          (LayerNormalization): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (linear1): Linear(in_features=256, out_features=256, bias=True)
        )
        (LayerNormalization1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (LayerNormalization2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (linear_augmentation): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
        )

	No. epoch: 70/200                    
	   Train Loss: 0.451
	    Val. Loss: 0.133


	No. epoch: 71/200                    
	   Train Loss: 0.083
	    Val. Loss: 0.111


	No. epoch: 72/200                    
	   Train Loss: 0.073
	    Val. Loss: 0.104


	No. epoch: 73/200                    
	   Train Loss: 0.067
	    Val. Loss: 0.1


	No. epoch: 74/200                    
	   Train Loss: 0.065
	    Val. Loss: 0.092


	No. epoch: 75/200                    
	   Train Loss: 0.062
	    Val. Loss: 0.092


	No. epoch: 76/200                    
	   Train Loss: 0.062
	    Val. Loss: 0.101


	No. epoch: 77/200                    
	   Train Loss: 0.093
	    Val. Loss: 0.169


	No. epoch: 78/200                    
	   Train Loss: 0.176
	    Val. Loss: 0.817


	No. epoch: 79/200                    
	   Train Loss: 0.25
	    Val. Loss: 0.103


	No. epoch: 80/200                    
	   Train Loss: 0.178
	    Val. Loss: 0.297


	No. epoch: 81/200                    
	   Train Loss: 0.947
	    Va

	No. epoch: 167/200                   
	   Train Loss: 0.014
	    Val. Loss: 0.035


	No. epoch: 168/200                   
	   Train Loss: 0.014
	    Val. Loss: 0.037


	No. epoch: 169/200                   
	   Train Loss: 0.013
	    Val. Loss: 0.036


	No. epoch: 170/200                   
	   Train Loss: 0.055
	    Val. Loss: 0.664


	No. epoch: 171/200                   
	   Train Loss: 0.296
	    Val. Loss: 0.046


	No. epoch: 172/200                   
	   Train Loss: 0.019
	    Val. Loss: 0.042


	No. epoch: 173/200                   
	   Train Loss: 0.016
	    Val. Loss: 0.036


	No. epoch: 174/200                   
	   Train Loss: 0.034
	    Val. Loss: 0.071


	No. epoch: 175/200                   
	   Train Loss: 0.195
	    Val. Loss: 0.356


	No. epoch: 176/200                   
	   Train Loss: 0.308
	    Val. Loss: 0.094


	No. epoch: 177/200                   
	   Train Loss: 0.04
	    Val. Loss: 0.038


	No. epoch: 178/200                   
	   Train Loss: 0.016
	    

# Label 4: Poisson (trained with Poisson negative log-likelihood loss)

In [8]:
%%time

# Input sequences: batch-first tensors with a fixed length of `fix_length`;
# the per-example lengths are returned alongside each batch.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Target: a plain float per example (parsed with float(), no vocabulary).
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# CSV layout: column 1 is the text, column 5 is the label used in this section;
# the three columns in between are ignored.
fields = [('text', TEXT), (None, None), (None, None),
          (None, None), ('label', LABEL)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed Python's RNG first, then hand its state to split() explicitly; this
# yields the same split while making the intent visible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (training and
# validation examples together), not from `train_data` only.
TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Batch iterators: examples are bucketed by text length and each batch is
# sorted by that length; batches are created on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 1.2 s


In [9]:
%%time

# Seed PyTorch from the notebook-wide `seed` constant (instead of repeating the
# literal) and request deterministic cuDNN kernels.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device chosen in the configuration cell, so the
# model and the batches produced by the iterators live on the same device.
# A hard-coded "cuda" here fails on machines without a GPU.
l4_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=256,
    num_layers=2,
    heads=8,
    device=device.type,  # "cuda" or "cpu": same string form as before
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l4_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l4_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Optimiser: Adam over all model parameters.
OPT = optim.Adam(l4_MultiheadAttentionClassifier.parameters(), lr=0.0001)

# Criterion: Poisson negative log-likelihood.
CRIT = nn.PoissonNLLLoss()

# Train for `n_epochs`, reporting loss and `poisson_accuracy`, and save the
# model to `saving_path`.
MultiheadAttentionClassifier_train(l4_MultiheadAttentionClassifier,
                                   train_iterator,
                                   valid_iterator,
                                   n_epochs,
                                   OPT,
                                   CRIT,
                                   accuracy_function=poisson_accuracy,
                                   save=True,
                                   saving_path="model_bck/l4_MultiheadAttentionClassifier_lr0.0001_B384_D0_GC_false.pt")

MultiheadAttentionClassifier(
  (encoder): Encoder(
    (tok_embedding): Embedding(15, 256)
    (pos_embedding): Embedding(22, 256)
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadScaledDotProductAttention(
          (V): Linear(in_features=256, out_features=256, bias=True)
          (K): Linear(in_features=256, out_features=256, bias=True)
          (Q): Linear(in_features=256, out_features=256, bias=True)
          (LayerNormalization): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (linear1): Linear(in_features=256, out_features=256, bias=True)
        )
        (LayerNormalization1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (LayerNormalization2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (linear_augmentation): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
        )

	No. epoch: 48/200                    
	   Train Loss: 0.23 | Train Acc: 99.977%
	    Val. Loss: 0.151 |  Val. Acc: 99.061%


	No. epoch: 49/200                    
	   Train Loss: 0.23 | Train Acc: 99.996%
	    Val. Loss: 0.151 |  Val. Acc: 98.981%


	No. epoch: 50/200                    
	   Train Loss: 0.23 | Train Acc: 99.921%
	    Val. Loss: 0.152 |  Val. Acc: 98.38%


	No. epoch: 51/200                    
	   Train Loss: 0.232 | Train Acc: 99.096%
	    Val. Loss: 0.151 |  Val. Acc: 98.966%


	No. epoch: 52/200                    
	   Train Loss: 0.23 | Train Acc: 99.895%
	    Val. Loss: 0.151 |  Val. Acc: 98.873%


	No. epoch: 53/200                    
	   Train Loss: 0.231 | Train Acc: 99.853%
	    Val. Loss: 0.153 |  Val. Acc: 98.098%


	No. epoch: 54/200                    
	   Train Loss: 0.231 | Train Acc: 99.376%
	    Val. Loss: 0.151 |  Val. Acc: 99.137%


	No. epoch: 55/200                    
	   Train Loss: 0.23 | Train Acc: 99.979%
	    Val. Loss: 0.151 |  Val. Acc: 

	No. epoch: 114/200                   
	   Train Loss: 0.23 | Train Acc: 99.996%
	    Val. Loss: 0.15 |  Val. Acc: 99.131%


	No. epoch: 115/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.15%


	No. epoch: 116/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.076%


	No. epoch: 117/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.083%


	No. epoch: 118/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.083%


	No. epoch: 119/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.137%


	No. epoch: 120/200                   
	   Train Loss: 0.23 | Train Acc: 99.999%
	    Val. Loss: 0.15 |  Val. Acc: 99.083%


	No. epoch: 121/200                   
	   Train Loss: 0.23 | Train Acc: 99.999%
	    Val. Loss: 0.15 |  Val. Acc: 99.089%


	No. e

	No. epoch: 180/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.379%


	No. epoch: 181/200                   
	   Train Loss: 0.23 | Train Acc: 99.983%
	    Val. Loss: 0.15 |  Val. Acc: 99.262%


	No. epoch: 182/200                   
	   Train Loss: 0.23 | Train Acc: 99.992%
	    Val. Loss: 0.15 |  Val. Acc: 99.36%


	No. epoch: 183/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.245%


	No. epoch: 184/200                   
	   Train Loss: 0.23 | Train Acc: 99.99%
	    Val. Loss: 0.15 |  Val. Acc: 99.245%


	No. epoch: 185/200                   
	   Train Loss: 0.23 | Train Acc: 99.999%
	    Val. Loss: 0.15 |  Val. Acc: 99.3%


	No. epoch: 186/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.347%


	No. epoch: 187/200                   
	   Train Loss: 0.23 | Train Acc: 100.0%
	    Val. Loss: 0.15 |  Val. Acc: 99.274%


	No. epo

In [10]:
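# Example: reload a saved model for inference.
# NOTE: this path does not match any `saving_path` used by the training cells in
# this notebook (those end in "_lr0.0001_B384_D0_GC_false.pt"); confirm which
# checkpoint file is intended before uncommenting.
# model = torch.load("model_bck/l1_MultiheadAttentionClassifier.pt")
# model.eval()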