In [1]:
from functools import partial

import torch
from torch.optim import Adam
import pandas as pd

from data import Vocabulary, SentiDataset
from models import WordAveragingModel, AttentionWeightedWordAveragingModel, CosineSimilarityAttention, dot_product_self_attention, MultiHeadSelfAttentionModel
from learner import SentimentLearner

# Seed PyTorch's default random number generator with a fixed value.
# Only the torch RNG is seeded in this cell.
torch.manual_seed(41)

<torch._C.Generator at 0x7f6d246fa168>

### Hyperparameters

In [2]:
# Settings shared by every experiment in this notebook.
BATCH_SIZE = 256  # forwarded to SentimentLearner as batch_size
EMBED_DIM = 256  # passed to the models as embed_dim / model_dim
DROPOUT = 0.25  # passed to the models as embed_dropout / attention_dropout
OPTIM_CLS = Adam  # optimizer class forwarded to SentimentLearner as optim_cls

### Load data

In [3]:
# Template for the per-split data files; the placeholder is the split name.
FILENAME = 'senti.{}.tsv'

# One Vocabulary instance is shared (as the tokenizer) by all three splits.
vocab = Vocabulary()

# Splits are built in the order train -> dev -> test.
train_set, valid_set, test_set = (
    SentiDataset(FILENAME.format(split), tokenizer=vocab)
    for split in ('train', 'dev', 'test')
)

# Factory that only needs the model (and learning rate) for each experiment;
# datasets, batch size and optimizer class are fixed here.
get_learner = partial(
    SentimentLearner,
    train_set=train_set,
    valid_set=valid_set,
    test_set=test_set,
    batch_size=BATCH_SIZE,
    optim_cls=OPTIM_CLS,
)

# Word averaging model

### Load model and learner

In [4]:
# Baseline model: plain word averaging over the shared vocabulary.
word_avg = WordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    embed_dropout=DROPOUT,
)
learner = get_learner(model=word_avg, lr=2e-4)

### Train model

In [5]:
# Checkpoint file for this model; the training log reports when parameters are saved to it.
word_avg_filename = 'word_avg.pt'
learner.train(epochs=20, filename=word_avg_filename)

Epoch 01: 100%|██████████| 264/264 [00:03<00:00, 87.83it/s, Loss=0.657, Acc=0.667]


	Train Loss: 0.663	Train Acc: 59.85%
	Valid Loss: 0.654	Valid Acc: 58.72%
	Model parameters saved to word_avg.pt


Epoch 02: 100%|██████████| 264/264 [00:02<00:00, 88.65it/s, Loss=0.508, Acc=0.905]


	Train Loss: 0.563	Train Acc: 75.66%
	Valid Loss: 0.583	Valid Acc: 74.20%
	Model parameters saved to word_avg.pt


Epoch 03: 100%|██████████| 264/264 [00:02<00:00, 91.51it/s, Loss=0.463, Acc=0.857]


	Train Loss: 0.458	Train Acc: 84.90%
	Valid Loss: 0.529	Valid Acc: 78.33%
	Model parameters saved to word_avg.pt


Epoch 04: 100%|██████████| 264/264 [00:02<00:00, 88.68it/s, Loss=0.382, Acc=0.857]


	Train Loss: 0.382	Train Acc: 88.05%
	Valid Loss: 0.492	Valid Acc: 79.93%
	Model parameters saved to word_avg.pt


Epoch 05: 100%|██████████| 264/264 [00:02<00:00, 89.58it/s, Loss=0.258, Acc=0.952]


	Train Loss: 0.330	Train Acc: 89.72%
	Valid Loss: 0.466	Valid Acc: 80.28%
	Model parameters saved to word_avg.pt


Epoch 06: 100%|██████████| 264/264 [00:03<00:00, 87.93it/s, Loss=0.23, Acc=0.952] 


	Train Loss: 0.293	Train Acc: 90.68%
	Valid Loss: 0.449	Valid Acc: 80.73%
	Model parameters saved to word_avg.pt


Epoch 07: 100%|██████████| 264/264 [00:03<00:00, 87.49it/s, Loss=0.239, Acc=0.857]


	Train Loss: 0.265	Train Acc: 91.52%
	Valid Loss: 0.436	Valid Acc: 80.96%
	Model parameters saved to word_avg.pt


Epoch 08: 100%|██████████| 264/264 [00:02<00:00, 94.75it/s, Loss=0.12, Acc=1]     


	Train Loss: 0.244	Train Acc: 92.13%
	Valid Loss: 0.428	Valid Acc: 81.19%
	Model parameters saved to word_avg.pt


Epoch 09: 100%|██████████| 264/264 [00:03<00:00, 82.95it/s, Loss=0.161, Acc=0.952]


	Train Loss: 0.227	Train Acc: 92.66%
	Valid Loss: 0.422	Valid Acc: 80.96%
	Model parameters saved to word_avg.pt


Epoch 10: 100%|██████████| 264/264 [00:02<00:00, 91.94it/s, Loss=0.193, Acc=0.952]


	Train Loss: 0.213	Train Acc: 93.02%
	Valid Loss: 0.419	Valid Acc: 81.42%
	Model parameters saved to word_avg.pt


Epoch 11: 100%|██████████| 264/264 [00:02<00:00, 91.31it/s, Loss=0.183, Acc=1]    


	Train Loss: 0.201	Train Acc: 93.31%
	Valid Loss: 0.417	Valid Acc: 80.96%
	Model parameters saved to word_avg.pt


Epoch 12: 100%|██████████| 264/264 [00:03<00:00, 86.34it/s, Loss=0.0741, Acc=1]   


	Train Loss: 0.191	Train Acc: 93.60%
	Valid Loss: 0.417	Valid Acc: 81.42%
	Model parameters saved to word_avg.pt


Epoch 13: 100%|██████████| 264/264 [00:02<00:00, 91.09it/s, Loss=0.249, Acc=0.905]


	Train Loss: 0.182	Train Acc: 93.83%
	Valid Loss: 0.418	Valid Acc: 81.19%


Epoch 14: 100%|██████████| 264/264 [00:03<00:00, 85.83it/s, Loss=0.393, Acc=0.905]


	Train Loss: 0.175	Train Acc: 94.04%
	Valid Loss: 0.419	Valid Acc: 81.19%


Epoch 15: 100%|██████████| 264/264 [00:03<00:00, 87.80it/s, Loss=0.172, Acc=0.857]


	Train Loss: 0.168	Train Acc: 94.27%
	Valid Loss: 0.422	Valid Acc: 81.31%


Epoch 16: 100%|██████████| 264/264 [00:03<00:00, 80.98it/s, Loss=0.0914, Acc=1]   


	Train Loss: 0.162	Train Acc: 94.43%
	Valid Loss: 0.426	Valid Acc: 81.31%


Epoch 17: 100%|██████████| 264/264 [00:02<00:00, 92.15it/s, Loss=0.131, Acc=0.952]


	Train Loss: 0.157	Train Acc: 94.57%
	Valid Loss: 0.429	Valid Acc: 81.31%


Epoch 18: 100%|██████████| 264/264 [00:02<00:00, 88.13it/s, Loss=0.0318, Acc=1]   


	Train Loss: 0.152	Train Acc: 94.70%
	Valid Loss: 0.433	Valid Acc: 81.31%


Epoch 19: 100%|██████████| 264/264 [00:03<00:00, 87.06it/s, Loss=0.066, Acc=0.952] 


	Train Loss: 0.147	Train Acc: 94.87%
	Valid Loss: 0.438	Valid Acc: 81.42%


Epoch 20: 100%|██████████| 264/264 [00:03<00:00, 85.48it/s, Loss=0.112, Acc=0.952]


	Train Loss: 0.143	Train Acc: 94.97%
	Valid Loss: 0.445	Valid Acc: 81.54%


### Load best model to evaluate

In [6]:
# Restore the parameters saved in the checkpoint: the log shows saving stopped
# before the last epoch, so the in-memory weights are not the saved ones.
learner.load_model_params(word_avg_filename)

In [7]:
# Print test loss and accuracy for the reloaded word-averaging model.
learner.print_test_results()

	 Test Loss: 0.408	 Test Acc: 81.38%


## Norm of word embeddings

In [8]:
# L2 norm of every row of the embedding matrix, labelled with the corresponding
# vocabulary entry and sorted in ascending order.
word_embedding = learner.model.word_embedding
norms = pd.Series(
    torch.linalg.norm(word_embedding, dim=1).cpu(),
    index=list(vocab.itos),
).sort_values()

In [9]:
# `norms` is sorted ascending, so the reversed tail lists the 15 largest norms, largest first.
norms.tail(15)[::-1]

worst         4.024570
suffers       3.295249
mess          3.271896
lacking       3.261830
remarkable    3.245809
lacks         3.195105
flat          3.192154
powerful      3.182127
devoid        3.166308
hilarious     3.112350
captures      3.098900
touching      3.046976
waste         2.944949
stupid        2.901314
terrific      2.898190
dtype: float32

In [10]:
# The 15 smallest embedding norms, smallest first.
norms.head(15)

<pad>           0.000000
<unk>           0.017731
jesus           0.067001
non-bondish     0.067320
fustily         0.078295
reviews         0.079019
grounding       0.081172
malapropisms    0.081883
margarita       0.082631
fuelled         0.083875
freud           0.084031
bearing         0.086743
presentation    0.087168
by-the-book     0.088188
wattage         0.088215
dtype: float32

# Attention weighted word averaging model
## w/ cosine similarity attention

### Load model and learner

In [11]:
# Attention-weighted averaging with the cosine-similarity attention layer.
cos_att = AttentionWeightedWordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    attention=CosineSimilarityAttention(EMBED_DIM),
    embed_dropout=DROPOUT,
)
learner = get_learner(model=cos_att, lr=1e-4)

### Train model

In [12]:
# Checkpoint file for the cosine-similarity attention model.
cos_att_filename = 'cos_att.pt'
learner.train(epochs=20, filename=cos_att_filename)

Epoch 01: 100%|██████████| 264/264 [00:03<00:00, 81.49it/s, Loss=0.643, Acc=0.857]


	Train Loss: 0.679	Train Acc: 67.02%
	Valid Loss: 0.664	Valid Acc: 72.13%
	Model parameters saved to cos_att.pt


Epoch 02: 100%|██████████| 264/264 [00:03<00:00, 76.41it/s, Loss=0.536, Acc=0.857]


	Train Loss: 0.606	Train Acc: 78.17%
	Valid Loss: 0.611	Valid Acc: 70.18%
	Model parameters saved to cos_att.pt


Epoch 03: 100%|██████████| 264/264 [00:03<00:00, 75.47it/s, Loss=0.515, Acc=0.81] 


	Train Loss: 0.522	Train Acc: 79.58%
	Valid Loss: 0.565	Valid Acc: 72.48%
	Model parameters saved to cos_att.pt


Epoch 04: 100%|██████████| 264/264 [00:03<00:00, 77.59it/s, Loss=0.549, Acc=0.667]


	Train Loss: 0.455	Train Acc: 82.74%
	Valid Loss: 0.530	Valid Acc: 74.89%
	Model parameters saved to cos_att.pt


Epoch 05: 100%|██████████| 264/264 [00:03<00:00, 77.17it/s, Loss=0.463, Acc=0.81] 


	Train Loss: 0.405	Train Acc: 85.38%
	Valid Loss: 0.503	Valid Acc: 75.80%
	Model parameters saved to cos_att.pt


Epoch 06: 100%|██████████| 264/264 [00:03<00:00, 75.63it/s, Loss=0.432, Acc=0.81] 


	Train Loss: 0.365	Train Acc: 87.28%
	Valid Loss: 0.484	Valid Acc: 77.64%
	Model parameters saved to cos_att.pt


Epoch 07: 100%|██████████| 264/264 [00:03<00:00, 78.78it/s, Loss=0.232, Acc=0.952]


	Train Loss: 0.333	Train Acc: 88.65%
	Valid Loss: 0.470	Valid Acc: 78.78%
	Model parameters saved to cos_att.pt


Epoch 08: 100%|██████████| 264/264 [00:03<00:00, 79.99it/s, Loss=0.491, Acc=0.619]


	Train Loss: 0.307	Train Acc: 89.57%
	Valid Loss: 0.460	Valid Acc: 79.70%
	Model parameters saved to cos_att.pt


Epoch 09: 100%|██████████| 264/264 [00:03<00:00, 78.34it/s, Loss=0.241, Acc=0.905]


	Train Loss: 0.286	Train Acc: 90.19%
	Valid Loss: 0.452	Valid Acc: 79.70%
	Model parameters saved to cos_att.pt


Epoch 10: 100%|██████████| 264/264 [00:03<00:00, 77.23it/s, Loss=0.271, Acc=0.952]


	Train Loss: 0.269	Train Acc: 90.69%
	Valid Loss: 0.447	Valid Acc: 79.70%
	Model parameters saved to cos_att.pt


Epoch 11: 100%|██████████| 264/264 [00:03<00:00, 83.04it/s, Loss=0.218, Acc=0.952]


	Train Loss: 0.254	Train Acc: 91.07%
	Valid Loss: 0.444	Valid Acc: 79.93%
	Model parameters saved to cos_att.pt


Epoch 12: 100%|██████████| 264/264 [00:03<00:00, 79.07it/s, Loss=0.42, Acc=0.714] 


	Train Loss: 0.241	Train Acc: 91.42%
	Valid Loss: 0.441	Valid Acc: 79.82%
	Model parameters saved to cos_att.pt


Epoch 13: 100%|██████████| 264/264 [00:03<00:00, 79.61it/s, Loss=0.216, Acc=0.905]


	Train Loss: 0.230	Train Acc: 91.64%
	Valid Loss: 0.439	Valid Acc: 79.82%
	Model parameters saved to cos_att.pt


Epoch 14: 100%|██████████| 264/264 [00:03<00:00, 77.42it/s, Loss=0.157, Acc=0.952]


	Train Loss: 0.221	Train Acc: 91.93%
	Valid Loss: 0.439	Valid Acc: 79.93%
	Model parameters saved to cos_att.pt


Epoch 15: 100%|██████████| 264/264 [00:03<00:00, 77.37it/s, Loss=0.0798, Acc=1]   


	Train Loss: 0.212	Train Acc: 92.14%
	Valid Loss: 0.438	Valid Acc: 80.05%
	Model parameters saved to cos_att.pt


Epoch 16: 100%|██████████| 264/264 [00:03<00:00, 76.08it/s, Loss=0.375, Acc=0.905]


	Train Loss: 0.205	Train Acc: 92.37%
	Valid Loss: 0.439	Valid Acc: 79.93%


Epoch 17: 100%|██████████| 264/264 [00:03<00:00, 77.70it/s, Loss=0.535, Acc=0.762]


	Train Loss: 0.198	Train Acc: 92.62%
	Valid Loss: 0.439	Valid Acc: 80.50%


Epoch 18: 100%|██████████| 264/264 [00:03<00:00, 82.21it/s, Loss=0.164, Acc=0.905]


	Train Loss: 0.191	Train Acc: 92.83%
	Valid Loss: 0.439	Valid Acc: 80.50%


Epoch 19: 100%|██████████| 264/264 [00:03<00:00, 76.42it/s, Loss=0.0966, Acc=1]   


	Train Loss: 0.186	Train Acc: 93.02%
	Valid Loss: 0.440	Valid Acc: 80.39%


Epoch 20: 100%|██████████| 264/264 [00:03<00:00, 77.74it/s, Loss=0.109, Acc=0.952]


	Train Loss: 0.180	Train Acc: 93.23%
	Valid Loss: 0.441	Valid Acc: 80.39%


### Load best model to evaluate

In [13]:
# Restore the saved checkpoint; training continued for several epochs after the last save.
learner.load_model_params(cos_att_filename)

In [14]:
# Print test loss and accuracy for the reloaded cosine-similarity attention model.
learner.print_test_results()

	 Test Loss: 0.434	 Test Acc: 79.24%


## Cosine similarities between vector u and word embeddings

In [15]:
att_layer = learner.model.attention
embeddings = learner.model.embedding.weight.data

# Cosine similarity between the attention layer's vector u and every word
# embedding, labelled by vocabulary entry and sorted in ascending order.
cosine_similarities = pd.Series(
    att_layer.cosine_similarity_to_u(embeddings).detach().cpu(),
    index=list(vocab.itos),
).sort_values()

In [16]:
# Series is sorted ascending: the reversed tail gives the 15 highest similarities, highest first.
cosine_similarities.tail(15)[::-1]

never              0.996969
disappoint         0.992830
ignored            0.992009
patriotic          0.991844
bowl               0.991654
talkiness          0.989554
afraid             0.989487
mcadams            0.989437
rah-rah            0.988890
superman           0.988618
broadcast          0.988453
unintentionally    0.988083
moan               0.987819
miss               0.987211
wrong              0.986977
dtype: float32

In [17]:
# The 15 lowest (most negative) similarities.
cosine_similarities.head(15)

quick         -0.999458
adults        -0.999302
have          -0.999283
there         -0.999252
jones         -0.999176
rather        -0.999142
time          -0.999133
back          -0.999114
narratively   -0.999103
making        -0.999089
understand    -0.999076
segment       -0.999049
veiling       -0.999038
begins        -0.999025
bit           -0.999008
dtype: float32

## Attention variance among frequent words in the training set

In [18]:
# NOTE: despite its name, MAX_FREQ is used as a *minimum* frequency: only words
# occurring at least this often are kept.
MAX_FREQ = 100
freq = vocab.df
# Index label of the last row meeting the threshold.
# Presumably vocab.df is ordered by descending frequency, so this is the
# largest token id still counted as "frequent" -- TODO confirm.
upper_bound = freq.loc[freq['freq'] >= MAX_FREQ].index[-1]
print(upper_bound)

669


In [19]:
from collections import defaultdict

@torch.no_grad()
def get_attention_stats(model, loader=None, device=None, max_token_id=None):
    """Collect the attention weight given to each frequent token over a dataset.

    Runs the ``embedding -> embed_dropout -> attention`` part of ``model`` on
    every batch and groups the resulting per-token attention weights by token
    id. The model is switched to eval mode for the duration of the call (and
    restored afterwards) so that dropout cannot make the statistics random.

    Args:
        model: ``torch.nn.Module`` exposing ``embedding``, ``embed_dropout``
            and ``attention`` sub-layers.
        loader: iterable of ``(tokenized, labels)`` batches, where
            ``tokenized`` holds ``'input_ids'`` and ``'attention_mask'``
            tensors. Defaults to the notebook-level ``learner.train_loader``.
        device: device the batch tensors are moved to. Defaults to the
            notebook-level ``learner.device``.
        max_token_id: largest token id to keep; ids above it are ignored.
            Defaults to the notebook-level ``upper_bound``.

    Returns:
        A ``defaultdict(list)`` mapping token id to the list of attention
        weights (Python floats) it received, one entry per occurrence.
    """
    # Fall back to the notebook globals only when the caller did not pass a value.
    loader = learner.train_loader if loader is None else loader
    device = learner.device if device is None else device
    max_token_id = upper_bound if max_token_id is None else max_token_id

    embedding_layer = model.embedding
    dropout_layer = model.embed_dropout
    attention_layer = model.attention

    attentions = defaultdict(list)

    # Remember the current mode so the caller's model is left as it was found.
    was_training = model.training
    model.eval()
    try:
        for tokenized, _ in loader:
            input_ids = tokenized['input_ids'].to(device)
            attention_mask = tokenized['attention_mask'].to(device)

            embedded = dropout_layer(embedding_layer(input_ids))
            attention = attention_layer(embedded, attention_mask)

            # Keep frequent tokens only; id 0 (padding) is always dropped.
            keep = (input_ids != 0) & (input_ids <= max_token_id)
            kept_ids = torch.masked_select(input_ids, keep).tolist()
            kept_attention = torch.masked_select(attention, keep).tolist()

            for token_id, weight in zip(kept_ids, kept_attention):
                attentions[token_id].append(weight)
    finally:
        model.train(was_training)

    return attentions

In [20]:
stats = get_attention_stats(learner.model)

# Collect one record per token and build the frame in a single call:
# DataFrame.append copied the whole frame on every iteration and no longer
# exists in pandas >= 2.0.
records = []
for k, v in stats.items():
    attentions = torch.tensor(v)
    records.append({
        'word': vocab.itos[k],
        'mean': attentions.mean().item(),
        'std': attentions.std().item(),
    })
df = pd.DataFrame(records, columns=['word', 'mean', 'std'])
print(len(df))  # indexed from 2 ~ upper_bound, should be upper_bound - 2 + 1

668


In [21]:
# Order words by how much their attention weight varies (largest std first)
# and use the word itself as the row label.
# NOTE: this rebinds `df`; re-running the cell without rebuilding `df` fails
# because the 'word' column has already been moved into the index.
df = df.sort_values('std', ascending=False).set_index('word', drop=True)
df.head(30)

Unnamed: 0_level_0,mean,std
word,Unnamed: 1_level_1,Unnamed: 2_level_1
bland,0.346257,0.26582
stupid,0.385331,0.265657
awful,0.413376,0.255618
painful,0.372556,0.252404
tedious,0.325079,0.248903
flat,0.327273,0.245703
creepy,0.332419,0.236097
worse,0.296377,0.235868
waste,0.378752,0.235385
unfunny,0.335676,0.233512


# Attention weighted word averaging model
## w/ dot product self-attention

### Load model and learner

In [22]:
# Attention-weighted averaging with dot-product self-attention.
dp_att = AttentionWeightedWordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    attention=dot_product_self_attention,
    embed_dropout=DROPOUT,
)
learner = get_learner(model=dp_att, lr=5e-5)

### Train model

In [23]:
# Checkpoint file for the dot-product self-attention model.
dp_att_filename = 'dp_att.pt'
learner.train(epochs=20, filename=dp_att_filename)

Epoch 01: 100%|██████████| 264/264 [00:03<00:00, 81.42it/s, Loss=0.676, Acc=0.619]


	Train Loss: 0.687	Train Acc: 55.88%
	Valid Loss: 0.688	Valid Acc: 51.03%
	Model parameters saved to dp_att.pt


Epoch 02: 100%|██████████| 264/264 [00:03<00:00, 78.62it/s, Loss=0.674, Acc=0.667]


	Train Loss: 0.671	Train Acc: 58.85%
	Valid Loss: 0.672	Valid Acc: 55.16%
	Model parameters saved to dp_att.pt


Epoch 03: 100%|██████████| 264/264 [00:03<00:00, 84.46it/s, Loss=0.602, Acc=0.619]


	Train Loss: 0.641	Train Acc: 67.97%
	Valid Loss: 0.633	Valid Acc: 67.55%
	Model parameters saved to dp_att.pt


Epoch 04: 100%|██████████| 264/264 [00:03<00:00, 78.94it/s, Loss=0.56, Acc=0.81]  


	Train Loss: 0.594	Train Acc: 75.65%
	Valid Loss: 0.580	Valid Acc: 74.08%
	Model parameters saved to dp_att.pt


Epoch 05: 100%|██████████| 264/264 [00:03<00:00, 82.61it/s, Loss=0.443, Acc=0.905]


	Train Loss: 0.543	Train Acc: 79.29%
	Valid Loss: 0.541	Valid Acc: 75.57%
	Model parameters saved to dp_att.pt


Epoch 06: 100%|██████████| 264/264 [00:03<00:00, 80.92it/s, Loss=0.529, Acc=0.857]


	Train Loss: 0.498	Train Acc: 81.57%
	Valid Loss: 0.513	Valid Acc: 77.06%
	Model parameters saved to dp_att.pt


Epoch 07: 100%|██████████| 264/264 [00:03<00:00, 79.02it/s, Loss=0.374, Acc=0.81] 


	Train Loss: 0.458	Train Acc: 83.58%
	Valid Loss: 0.493	Valid Acc: 78.10%
	Model parameters saved to dp_att.pt


Epoch 08: 100%|██████████| 264/264 [00:03<00:00, 77.66it/s, Loss=0.325, Acc=0.952]


	Train Loss: 0.423	Train Acc: 85.05%
	Valid Loss: 0.478	Valid Acc: 77.29%
	Model parameters saved to dp_att.pt


Epoch 09: 100%|██████████| 264/264 [00:03<00:00, 84.68it/s, Loss=0.612, Acc=0.667]


	Train Loss: 0.392	Train Acc: 86.32%
	Valid Loss: 0.467	Valid Acc: 78.33%
	Model parameters saved to dp_att.pt


Epoch 10: 100%|██████████| 264/264 [00:03<00:00, 80.22it/s, Loss=0.39, Acc=0.857] 


	Train Loss: 0.366	Train Acc: 87.39%
	Valid Loss: 0.459	Valid Acc: 78.67%
	Model parameters saved to dp_att.pt


Epoch 11: 100%|██████████| 264/264 [00:03<00:00, 80.07it/s, Loss=0.312, Acc=0.905]


	Train Loss: 0.343	Train Acc: 88.32%
	Valid Loss: 0.453	Valid Acc: 79.01%
	Model parameters saved to dp_att.pt


Epoch 12: 100%|██████████| 264/264 [00:03<00:00, 85.12it/s, Loss=0.39, Acc=0.81]  


	Train Loss: 0.324	Train Acc: 89.04%
	Valid Loss: 0.449	Valid Acc: 79.93%
	Model parameters saved to dp_att.pt


Epoch 13: 100%|██████████| 264/264 [00:03<00:00, 80.85it/s, Loss=0.277, Acc=0.905]


	Train Loss: 0.306	Train Acc: 89.65%
	Valid Loss: 0.447	Valid Acc: 80.28%
	Model parameters saved to dp_att.pt


Epoch 14: 100%|██████████| 264/264 [00:03<00:00, 79.43it/s, Loss=0.261, Acc=0.905]


	Train Loss: 0.291	Train Acc: 90.17%
	Valid Loss: 0.446	Valid Acc: 80.50%
	Model parameters saved to dp_att.pt


Epoch 15: 100%|██████████| 264/264 [00:03<00:00, 79.23it/s, Loss=0.289, Acc=0.81] 


	Train Loss: 0.277	Train Acc: 90.54%
	Valid Loss: 0.447	Valid Acc: 80.50%


Epoch 16: 100%|██████████| 264/264 [00:03<00:00, 82.67it/s, Loss=0.395, Acc=0.905]


	Train Loss: 0.265	Train Acc: 90.94%
	Valid Loss: 0.448	Valid Acc: 80.28%


Epoch 17: 100%|██████████| 264/264 [00:03<00:00, 83.59it/s, Loss=0.217, Acc=0.952]


	Train Loss: 0.254	Train Acc: 91.30%
	Valid Loss: 0.450	Valid Acc: 80.28%


Epoch 18: 100%|██████████| 264/264 [00:03<00:00, 79.70it/s, Loss=0.197, Acc=0.952]


	Train Loss: 0.244	Train Acc: 91.64%
	Valid Loss: 0.452	Valid Acc: 80.62%


Epoch 19: 100%|██████████| 264/264 [00:03<00:00, 84.77it/s, Loss=0.239, Acc=0.857]


	Train Loss: 0.235	Train Acc: 91.94%
	Valid Loss: 0.455	Valid Acc: 81.08%


Epoch 20: 100%|██████████| 264/264 [00:03<00:00, 80.11it/s, Loss=0.227, Acc=0.905]


	Train Loss: 0.227	Train Acc: 92.17%
	Valid Loss: 0.458	Valid Acc: 81.08%


### Load best model to evaluate

In [24]:
# Restore the saved checkpoint; training continued for several epochs after the last save.
learner.load_model_params(dp_att_filename)

In [25]:
# Print test loss and accuracy for the reloaded dot-product attention model.
learner.print_test_results()

	 Test Loss: 0.414	 Test Acc: 80.94%


# Attention weighted word averaging model
## w/ dot product self-attention
## adding residual connection

### Load model and learner

In [26]:
# Same dot-product self-attention model, with the residual connection enabled.
dp_att_rc = AttentionWeightedWordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    attention=dot_product_self_attention,
    res_conn=True,
    embed_dropout=DROPOUT,
)
learner = get_learner(model=dp_att_rc, lr=5e-5)

### Train model

In [27]:
# Checkpoint file for the residual-connection variant.
dp_att_rc_filename = 'dp_att_rc.pt'
learner.train(epochs=20, filename=dp_att_rc_filename)

Epoch 01: 100%|██████████| 264/264 [00:03<00:00, 80.41it/s, Loss=0.675, Acc=0.714]


	Train Loss: 0.688	Train Acc: 57.23%
	Valid Loss: 0.683	Valid Acc: 66.63%
	Model parameters saved to dp_att_rc.pt


Epoch 02: 100%|██████████| 264/264 [00:03<00:00, 74.81it/s, Loss=0.651, Acc=0.714]


	Train Loss: 0.660	Train Acc: 70.23%
	Valid Loss: 0.662	Valid Acc: 59.52%
	Model parameters saved to dp_att_rc.pt


Epoch 03: 100%|██████████| 264/264 [00:03<00:00, 76.75it/s, Loss=0.646, Acc=0.667]


	Train Loss: 0.620	Train Acc: 72.27%
	Valid Loss: 0.630	Valid Acc: 65.71%
	Model parameters saved to dp_att_rc.pt


Epoch 04: 100%|██████████| 264/264 [00:03<00:00, 82.50it/s, Loss=0.55, Acc=0.81]  


	Train Loss: 0.570	Train Acc: 77.27%
	Valid Loss: 0.585	Valid Acc: 73.39%
	Model parameters saved to dp_att_rc.pt


Epoch 05: 100%|██████████| 264/264 [00:03<00:00, 73.64it/s, Loss=0.465, Acc=0.857]


	Train Loss: 0.517	Train Acc: 81.15%
	Valid Loss: 0.544	Valid Acc: 76.83%
	Model parameters saved to dp_att_rc.pt


Epoch 06: 100%|██████████| 264/264 [00:03<00:00, 78.75it/s, Loss=0.461, Acc=0.762]


	Train Loss: 0.467	Train Acc: 83.66%
	Valid Loss: 0.513	Valid Acc: 77.75%
	Model parameters saved to dp_att_rc.pt


Epoch 07: 100%|██████████| 264/264 [00:03<00:00, 75.70it/s, Loss=0.394, Acc=0.857]


	Train Loss: 0.425	Train Acc: 85.50%
	Valid Loss: 0.489	Valid Acc: 78.21%
	Model parameters saved to dp_att_rc.pt


Epoch 08: 100%|██████████| 264/264 [00:03<00:00, 78.33it/s, Loss=0.281, Acc=0.905]


	Train Loss: 0.389	Train Acc: 86.92%
	Valid Loss: 0.472	Valid Acc: 78.90%
	Model parameters saved to dp_att_rc.pt


Epoch 09: 100%|██████████| 264/264 [00:03<00:00, 78.77it/s, Loss=0.283, Acc=0.952]


	Train Loss: 0.358	Train Acc: 88.09%
	Valid Loss: 0.459	Valid Acc: 79.70%
	Model parameters saved to dp_att_rc.pt


Epoch 10: 100%|██████████| 264/264 [00:03<00:00, 79.78it/s, Loss=0.31, Acc=0.81]  


	Train Loss: 0.333	Train Acc: 88.94%
	Valid Loss: 0.450	Valid Acc: 79.82%
	Model parameters saved to dp_att_rc.pt


Epoch 11: 100%|██████████| 264/264 [00:03<00:00, 81.88it/s, Loss=0.299, Acc=0.905]


	Train Loss: 0.311	Train Acc: 89.59%
	Valid Loss: 0.443	Valid Acc: 80.62%
	Model parameters saved to dp_att_rc.pt


Epoch 12: 100%|██████████| 264/264 [00:03<00:00, 81.70it/s, Loss=0.397, Acc=0.762]


	Train Loss: 0.293	Train Acc: 90.20%
	Valid Loss: 0.438	Valid Acc: 80.85%
	Model parameters saved to dp_att_rc.pt


Epoch 13: 100%|██████████| 264/264 [00:03<00:00, 76.65it/s, Loss=0.207, Acc=0.952]


	Train Loss: 0.276	Train Acc: 90.68%
	Valid Loss: 0.435	Valid Acc: 80.96%
	Model parameters saved to dp_att_rc.pt


Epoch 14: 100%|██████████| 264/264 [00:03<00:00, 82.94it/s, Loss=0.351, Acc=0.857]


	Train Loss: 0.262	Train Acc: 91.13%
	Valid Loss: 0.434	Valid Acc: 80.73%
	Model parameters saved to dp_att_rc.pt


Epoch 15: 100%|██████████| 264/264 [00:03<00:00, 78.87it/s, Loss=0.16, Acc=1]     


	Train Loss: 0.250	Train Acc: 91.50%
	Valid Loss: 0.434	Valid Acc: 80.85%
	Model parameters saved to dp_att_rc.pt


Epoch 16: 100%|██████████| 264/264 [00:03<00:00, 79.24it/s, Loss=0.175, Acc=0.952]


	Train Loss: 0.239	Train Acc: 91.88%
	Valid Loss: 0.434	Valid Acc: 80.85%


Epoch 17: 100%|██████████| 264/264 [00:03<00:00, 80.76it/s, Loss=0.122, Acc=1]    


	Train Loss: 0.229	Train Acc: 92.16%
	Valid Loss: 0.436	Valid Acc: 81.08%


Epoch 18: 100%|██████████| 264/264 [00:03<00:00, 78.86it/s, Loss=0.136, Acc=0.952]


	Train Loss: 0.220	Train Acc: 92.44%
	Valid Loss: 0.437	Valid Acc: 81.31%


Epoch 19: 100%|██████████| 264/264 [00:03<00:00, 80.62it/s, Loss=0.183, Acc=0.952]


	Train Loss: 0.212	Train Acc: 92.65%
	Valid Loss: 0.440	Valid Acc: 81.19%


Epoch 20: 100%|██████████| 264/264 [00:03<00:00, 79.65it/s, Loss=0.115, Acc=1]    


	Train Loss: 0.205	Train Acc: 92.85%
	Valid Loss: 0.442	Valid Acc: 81.31%


### Load best model to evaluate

In [28]:
# Restore the saved checkpoint; training continued for several epochs after the last save.
learner.load_model_params(dp_att_rc_filename)

In [29]:
# Print test loss and accuracy for the reloaded residual-connection model.
learner.print_test_results()

	 Test Loss: 0.405	 Test Acc: 81.88%


# Transformer style attention model
## w/ single attention head

### Load model and learner

In [30]:
# Transformer-style model restricted to a single attention head.
single_head = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=1,
    embed_dropout=DROPOUT,
    attention_dropout=DROPOUT,
)
learner = get_learner(model=single_head, lr=5e-6)

### Train model

In [31]:
# Checkpoint file for the single-head model.
single_head_filename = 'single_head.pt'
learner.train(epochs=20, filename=single_head_filename)

Epoch 01: 100%|██████████| 264/264 [00:04<00:00, 52.97it/s, Loss=0.778, Acc=0.333]


	Train Loss: 0.793	Train Acc: 44.24%
	Valid Loss: 0.714	Valid Acc: 49.08%
	Model parameters saved to single_head.pt


Epoch 02: 100%|██████████| 264/264 [00:04<00:00, 53.48it/s, Loss=0.596, Acc=0.81] 


	Train Loss: 0.682	Train Acc: 56.32%
	Valid Loss: 0.660	Valid Acc: 61.47%
	Model parameters saved to single_head.pt


Epoch 03: 100%|██████████| 264/264 [00:05<00:00, 52.46it/s, Loss=0.578, Acc=0.714]


	Train Loss: 0.612	Train Acc: 69.16%
	Valid Loss: 0.628	Valid Acc: 59.75%
	Model parameters saved to single_head.pt


Epoch 04: 100%|██████████| 264/264 [00:05<00:00, 51.80it/s, Loss=0.541, Acc=0.714]


	Train Loss: 0.536	Train Acc: 75.88%
	Valid Loss: 0.568	Valid Acc: 70.76%
	Model parameters saved to single_head.pt


Epoch 05: 100%|██████████| 264/264 [00:04<00:00, 54.11it/s, Loss=0.427, Acc=0.762]


	Train Loss: 0.451	Train Acc: 83.81%
	Valid Loss: 0.511	Valid Acc: 77.87%
	Model parameters saved to single_head.pt


Epoch 06: 100%|██████████| 264/264 [00:04<00:00, 55.12it/s, Loss=0.346, Acc=0.905]


	Train Loss: 0.377	Train Acc: 87.46%
	Valid Loss: 0.471	Valid Acc: 79.59%
	Model parameters saved to single_head.pt


Epoch 07: 100%|██████████| 264/264 [00:04<00:00, 53.15it/s, Loss=0.356, Acc=0.857]


	Train Loss: 0.324	Train Acc: 89.25%
	Valid Loss: 0.443	Valid Acc: 80.85%
	Model parameters saved to single_head.pt


Epoch 08: 100%|██████████| 264/264 [00:04<00:00, 54.75it/s, Loss=0.279, Acc=0.905]


	Train Loss: 0.285	Train Acc: 90.53%
	Valid Loss: 0.425	Valid Acc: 81.42%
	Model parameters saved to single_head.pt


Epoch 09: 100%|██████████| 264/264 [00:05<00:00, 51.21it/s, Loss=0.237, Acc=0.952]


	Train Loss: 0.257	Train Acc: 91.28%
	Valid Loss: 0.415	Valid Acc: 81.77%
	Model parameters saved to single_head.pt


Epoch 10: 100%|██████████| 264/264 [00:04<00:00, 53.52it/s, Loss=0.305, Acc=0.952]


	Train Loss: 0.235	Train Acc: 91.94%
	Valid Loss: 0.409	Valid Acc: 82.11%
	Model parameters saved to single_head.pt


Epoch 11: 100%|██████████| 264/264 [00:04<00:00, 55.51it/s, Loss=0.0916, Acc=1]   


	Train Loss: 0.217	Train Acc: 92.51%
	Valid Loss: 0.407	Valid Acc: 81.77%
	Model parameters saved to single_head.pt


Epoch 12: 100%|██████████| 264/264 [00:04<00:00, 52.93it/s, Loss=0.124, Acc=0.952]


	Train Loss: 0.203	Train Acc: 92.97%
	Valid Loss: 0.407	Valid Acc: 81.88%


Epoch 13: 100%|██████████| 264/264 [00:04<00:00, 53.87it/s, Loss=0.178, Acc=0.952]


	Train Loss: 0.192	Train Acc: 93.33%
	Valid Loss: 0.410	Valid Acc: 82.00%


Epoch 14: 100%|██████████| 264/264 [00:04<00:00, 53.55it/s, Loss=0.116, Acc=1]    


	Train Loss: 0.182	Train Acc: 93.60%
	Valid Loss: 0.413	Valid Acc: 81.77%


Epoch 15: 100%|██████████| 264/264 [00:04<00:00, 54.94it/s, Loss=0.404, Acc=0.905]


	Train Loss: 0.174	Train Acc: 93.99%
	Valid Loss: 0.419	Valid Acc: 81.65%


Epoch 16: 100%|██████████| 264/264 [00:04<00:00, 53.72it/s, Loss=0.112, Acc=1]    


	Train Loss: 0.166	Train Acc: 94.21%
	Valid Loss: 0.424	Valid Acc: 81.77%


Epoch 17: 100%|██████████| 264/264 [00:04<00:00, 56.07it/s, Loss=0.167, Acc=0.952]


	Train Loss: 0.160	Train Acc: 94.42%
	Valid Loss: 0.434	Valid Acc: 81.77%


Epoch 18: 100%|██████████| 264/264 [00:04<00:00, 53.85it/s, Loss=0.0544, Acc=1]    


	Train Loss: 0.155	Train Acc: 94.60%
	Valid Loss: 0.440	Valid Acc: 81.88%


Epoch 19: 100%|██████████| 264/264 [00:04<00:00, 54.88it/s, Loss=0.224, Acc=0.905] 


	Train Loss: 0.150	Train Acc: 94.82%
	Valid Loss: 0.452	Valid Acc: 81.65%


Epoch 20: 100%|██████████| 264/264 [00:04<00:00, 55.67it/s, Loss=0.108, Acc=0.952] 


	Train Loss: 0.145	Train Acc: 94.94%
	Valid Loss: 0.458	Valid Acc: 81.54%


### Load best model to evaluate

In [32]:
# Restore the saved checkpoint; training continued for several epochs after the last save.
learner.load_model_params(single_head_filename)

In [33]:
# Print test loss and accuracy for the reloaded single-head model.
learner.print_test_results()

	 Test Loss: 0.397	 Test Acc: 81.93%


# Transformer style attention model
## w/ single attention head & positional encoding

### Load model and learner

In [34]:
# Single-head model again, this time with positional encoding switched on.
single_head_pe = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=1,
    pos_encode=True,
    embed_dropout=DROPOUT,
    attention_dropout=DROPOUT,
)
learner = get_learner(model=single_head_pe, lr=5e-6)

### Train model

In [35]:
# Checkpoint file for the single-head model with positional encoding.
single_head_pe_filename = 'single_head_pe.pt'
learner.train(epochs=20, filename=single_head_pe_filename)

Epoch 01: 100%|██████████| 264/264 [00:05<00:00, 51.59it/s, Loss=0.73, Acc=0.476] 


	Train Loss: 0.684	Train Acc: 55.08%
	Valid Loss: 0.691	Valid Acc: 50.92%
	Model parameters saved to single_head_pe.pt


Epoch 02: 100%|██████████| 264/264 [00:05<00:00, 51.53it/s, Loss=0.6, Acc=0.81]   


	Train Loss: 0.667	Train Acc: 57.65%
	Valid Loss: 0.675	Valid Acc: 51.83%
	Model parameters saved to single_head_pe.pt


Epoch 03: 100%|██████████| 264/264 [00:05<00:00, 51.44it/s, Loss=0.581, Acc=0.762]


	Train Loss: 0.632	Train Acc: 63.98%
	Valid Loss: 0.637	Valid Acc: 62.84%
	Model parameters saved to single_head_pe.pt


Epoch 04: 100%|██████████| 264/264 [00:05<00:00, 51.77it/s, Loss=0.502, Acc=0.81] 


	Train Loss: 0.564	Train Acc: 76.17%
	Valid Loss: 0.578	Valid Acc: 75.69%
	Model parameters saved to single_head_pe.pt


Epoch 05: 100%|██████████| 264/264 [00:05<00:00, 51.33it/s, Loss=0.492, Acc=0.857]


	Train Loss: 0.474	Train Acc: 83.66%
	Valid Loss: 0.519	Valid Acc: 77.87%
	Model parameters saved to single_head_pe.pt


Epoch 06: 100%|██████████| 264/264 [00:05<00:00, 50.91it/s, Loss=0.486, Acc=0.857]


	Train Loss: 0.394	Train Acc: 86.99%
	Valid Loss: 0.474	Valid Acc: 79.01%
	Model parameters saved to single_head_pe.pt


Epoch 07: 100%|██████████| 264/264 [00:05<00:00, 51.78it/s, Loss=0.252, Acc=0.952]


	Train Loss: 0.334	Train Acc: 88.89%
	Valid Loss: 0.444	Valid Acc: 80.05%
	Model parameters saved to single_head_pe.pt


Epoch 08: 100%|██████████| 264/264 [00:05<00:00, 51.18it/s, Loss=0.212, Acc=0.905]


	Train Loss: 0.292	Train Acc: 89.97%
	Valid Loss: 0.425	Valid Acc: 80.50%
	Model parameters saved to single_head_pe.pt


Epoch 09: 100%|██████████| 264/264 [00:05<00:00, 51.34it/s, Loss=0.298, Acc=0.857]


	Train Loss: 0.261	Train Acc: 90.95%
	Valid Loss: 0.414	Valid Acc: 81.54%
	Model parameters saved to single_head_pe.pt


Epoch 10: 100%|██████████| 264/264 [00:05<00:00, 51.55it/s, Loss=0.186, Acc=0.952]


	Train Loss: 0.239	Train Acc: 91.70%
	Valid Loss: 0.408	Valid Acc: 81.54%
	Model parameters saved to single_head_pe.pt


Epoch 11: 100%|██████████| 264/264 [00:05<00:00, 49.89it/s, Loss=0.186, Acc=0.905]


	Train Loss: 0.221	Train Acc: 92.37%
	Valid Loss: 0.406	Valid Acc: 81.77%
	Model parameters saved to single_head_pe.pt


Epoch 12: 100%|██████████| 264/264 [00:05<00:00, 51.72it/s, Loss=0.225, Acc=0.952]


	Train Loss: 0.207	Train Acc: 92.77%
	Valid Loss: 0.405	Valid Acc: 81.31%
	Model parameters saved to single_head_pe.pt


Epoch 13: 100%|██████████| 264/264 [00:05<00:00, 50.38it/s, Loss=0.228, Acc=0.857]


	Train Loss: 0.196	Train Acc: 93.13%
	Valid Loss: 0.408	Valid Acc: 81.77%


Epoch 14: 100%|██████████| 264/264 [00:05<00:00, 52.54it/s, Loss=0.0852, Acc=0.952]


	Train Loss: 0.185	Train Acc: 93.50%
	Valid Loss: 0.412	Valid Acc: 82.11%


Epoch 15: 100%|██████████| 264/264 [00:05<00:00, 52.35it/s, Loss=0.308, Acc=0.857]


	Train Loss: 0.177	Train Acc: 93.76%
	Valid Loss: 0.417	Valid Acc: 82.11%


Epoch 16: 100%|██████████| 264/264 [00:05<00:00, 50.48it/s, Loss=0.043, Acc=1]     


	Train Loss: 0.170	Train Acc: 93.95%
	Valid Loss: 0.424	Valid Acc: 82.22%


Epoch 17: 100%|██████████| 264/264 [00:05<00:00, 49.45it/s, Loss=0.0757, Acc=0.952]


	Train Loss: 0.163	Train Acc: 94.25%
	Valid Loss: 0.431	Valid Acc: 82.11%


Epoch 18: 100%|██████████| 264/264 [00:05<00:00, 51.73it/s, Loss=0.146, Acc=0.952] 


	Train Loss: 0.157	Train Acc: 94.48%
	Valid Loss: 0.435	Valid Acc: 82.22%


Epoch 19: 100%|██████████| 264/264 [00:05<00:00, 51.59it/s, Loss=0.18, Acc=0.952]  


	Train Loss: 0.152	Train Acc: 94.68%
	Valid Loss: 0.445	Valid Acc: 81.77%


Epoch 20: 100%|██████████| 264/264 [00:05<00:00, 51.38it/s, Loss=0.0674, Acc=1]    


	Train Loss: 0.148	Train Acc: 94.80%
	Valid Loss: 0.452	Valid Acc: 81.77%


### Load best model to evaluate

In [36]:
# Restore the saved checkpoint; training continued for several epochs after the last save.
learner.load_model_params(single_head_pe_filename)

In [37]:
# Print test loss and accuracy for the reloaded positional-encoding model.
learner.print_test_results()

	 Test Loss: 0.396	 Test Acc: 81.99%


# Transformer style attention model
## w/ multiple attention heads

### Load model and learner

In [38]:
# Transformer-style model with four attention heads (no positional encoding argument).
multi_head = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=4,
    embed_dropout=DROPOUT,
    attention_dropout=DROPOUT,
)
learner = get_learner(model=multi_head, lr=5e-6)

### Train model

In [39]:
# Checkpoint file for the multi-head model.
multi_head_filename = 'multi_head.pt'
learner.train(epochs=20, filename=multi_head_filename)

Epoch 01: 100%|██████████| 264/264 [00:05<00:00, 50.74it/s, Loss=0.667, Acc=0.571]


	Train Loss: 0.690	Train Acc: 52.97%
	Valid Loss: 0.685	Valid Acc: 50.92%
	Model parameters saved to multi_head.pt


Epoch 02: 100%|██████████| 264/264 [00:05<00:00, 52.71it/s, Loss=0.551, Acc=0.905]


	Train Loss: 0.657	Train Acc: 59.84%
	Valid Loss: 0.661	Valid Acc: 54.70%
	Model parameters saved to multi_head.pt


Epoch 03: 100%|██████████| 264/264 [00:05<00:00, 51.18it/s, Loss=0.554, Acc=0.714]


	Train Loss: 0.597	Train Acc: 70.45%
	Valid Loss: 0.603	Valid Acc: 69.15%
	Model parameters saved to multi_head.pt


Epoch 04: 100%|██████████| 264/264 [00:05<00:00, 52.15it/s, Loss=0.408, Acc=0.857]


	Train Loss: 0.505	Train Acc: 81.44%
	Valid Loss: 0.538	Valid Acc: 76.38%
	Model parameters saved to multi_head.pt


Epoch 05: 100%|██████████| 264/264 [00:05<00:00, 49.47it/s, Loss=0.338, Acc=0.905]


	Train Loss: 0.416	Train Acc: 86.04%
	Valid Loss: 0.487	Valid Acc: 78.56%
	Model parameters saved to multi_head.pt


Epoch 06: 100%|██████████| 264/264 [00:05<00:00, 50.52it/s, Loss=0.254, Acc=0.905]


	Train Loss: 0.348	Train Acc: 88.60%
	Valid Loss: 0.454	Valid Acc: 79.93%
	Model parameters saved to multi_head.pt


Epoch 07: 100%|██████████| 264/264 [00:05<00:00, 51.52it/s, Loss=0.183, Acc=0.952]


	Train Loss: 0.300	Train Acc: 89.92%
	Valid Loss: 0.432	Valid Acc: 80.62%
	Model parameters saved to multi_head.pt


Epoch 08: 100%|██████████| 264/264 [00:05<00:00, 50.02it/s, Loss=0.223, Acc=1]    


	Train Loss: 0.266	Train Acc: 90.88%
	Valid Loss: 0.420	Valid Acc: 80.73%
	Model parameters saved to multi_head.pt


Epoch 09: 100%|██████████| 264/264 [00:05<00:00, 50.47it/s, Loss=0.281, Acc=0.81] 


	Train Loss: 0.241	Train Acc: 91.69%
	Valid Loss: 0.412	Valid Acc: 80.73%
	Model parameters saved to multi_head.pt


Epoch 10: 100%|██████████| 264/264 [00:05<00:00, 52.25it/s, Loss=0.313, Acc=0.905]


	Train Loss: 0.221	Train Acc: 92.34%
	Valid Loss: 0.409	Valid Acc: 80.96%
	Model parameters saved to multi_head.pt


Epoch 11: 100%|██████████| 264/264 [00:05<00:00, 50.43it/s, Loss=0.323, Acc=0.905]


	Train Loss: 0.206	Train Acc: 92.85%
	Valid Loss: 0.414	Valid Acc: 80.85%


Epoch 12: 100%|██████████| 264/264 [00:05<00:00, 51.42it/s, Loss=0.238, Acc=0.857]


	Train Loss: 0.194	Train Acc: 93.23%
	Valid Loss: 0.413	Valid Acc: 81.08%


Epoch 13: 100%|██████████| 264/264 [00:05<00:00, 49.72it/s, Loss=0.0831, Acc=0.952]


	Train Loss: 0.183	Train Acc: 93.59%
	Valid Loss: 0.421	Valid Acc: 81.08%


Epoch 14: 100%|██████████| 264/264 [00:05<00:00, 47.83it/s, Loss=0.0964, Acc=1]   


	Train Loss: 0.174	Train Acc: 93.89%
	Valid Loss: 0.427	Valid Acc: 81.08%


Epoch 15: 100%|██████████| 264/264 [00:05<00:00, 50.94it/s, Loss=0.0496, Acc=1]   


	Train Loss: 0.167	Train Acc: 94.11%
	Valid Loss: 0.432	Valid Acc: 81.31%


Epoch 16: 100%|██████████| 264/264 [00:05<00:00, 50.37it/s, Loss=0.182, Acc=0.905] 


	Train Loss: 0.160	Train Acc: 94.38%
	Valid Loss: 0.441	Valid Acc: 81.08%


Epoch 17: 100%|██████████| 264/264 [00:05<00:00, 50.25it/s, Loss=0.214, Acc=0.857] 


	Train Loss: 0.155	Train Acc: 94.58%
	Valid Loss: 0.452	Valid Acc: 81.19%


Epoch 18: 100%|██████████| 264/264 [00:05<00:00, 48.84it/s, Loss=0.0586, Acc=1]    


	Train Loss: 0.149	Train Acc: 94.76%
	Valid Loss: 0.459	Valid Acc: 81.31%


Epoch 19: 100%|██████████| 264/264 [00:05<00:00, 50.69it/s, Loss=0.0639, Acc=1]    


	Train Loss: 0.144	Train Acc: 94.95%
	Valid Loss: 0.470	Valid Acc: 81.42%


Epoch 20: 100%|██████████| 264/264 [00:05<00:00, 50.24it/s, Loss=0.261, Acc=0.905] 


	Train Loss: 0.140	Train Acc: 95.12%
	Valid Loss: 0.481	Valid Acc: 81.42%


### Load best model to evaluate

In [40]:
# Replace the final-epoch weights with the parameters saved to
# multi_head_filename during training, so the test evaluation uses the saved
# checkpoint rather than the state after epoch 20.
learner.load_model_params(multi_head_filename)

In [41]:
# Print the loss and accuracy of the reloaded multi-head model on the test set.
learner.print_test_results()

	 Test Loss: 0.402	 Test Acc: 81.44%


# Transformer style attention model
## w/ multiple attention heads & positional encoding

### Load model and learner

In [42]:
# Same multi-head self-attention configuration as the previous experiment, with
# positional encoding switched on via `pos_encode=True`.
multi_head_pe = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=4,
    pos_encode=True,
    embed_dropout=DROPOUT,
    attention_dropout=DROPOUT,
)

# `get_learner` already binds the train/valid/test sets, batch size and optimizer
# class, so only the model and its learning rate are supplied per experiment.
learner = get_learner(
    model=multi_head_pe,
    lr=5e-6,
)

### Train model

In [43]:
# Path the learner writes this model's parameters to during training.
multi_head_pe_filename = 'multi_head_pe.pt'
# Train for 20 epochs; the log below reports every time parameters are saved.
# NOTE(review): saves appear to stop once validation loss stops improving —
# confirm the checkpointing criterion in SentimentLearner.train.
learner.train(epochs=20, filename=multi_head_pe_filename)

Epoch 01: 100%|██████████| 264/264 [00:05<00:00, 46.97it/s, Loss=0.665, Acc=0.619]


	Train Loss: 0.683	Train Acc: 55.91%
	Valid Loss: 0.691	Valid Acc: 50.92%
	Model parameters saved to multi_head_pe.pt


Epoch 02: 100%|██████████| 264/264 [00:05<00:00, 48.36it/s, Loss=0.599, Acc=0.762]


	Train Loss: 0.665	Train Acc: 58.00%
	Valid Loss: 0.673	Valid Acc: 52.18%
	Model parameters saved to multi_head_pe.pt


Epoch 03: 100%|██████████| 264/264 [00:05<00:00, 49.14it/s, Loss=0.598, Acc=0.667]


	Train Loss: 0.629	Train Acc: 66.01%
	Valid Loss: 0.634	Valid Acc: 64.45%
	Model parameters saved to multi_head_pe.pt


Epoch 04: 100%|██████████| 264/264 [00:05<00:00, 46.60it/s, Loss=0.54, Acc=0.81]  


	Train Loss: 0.559	Train Acc: 77.65%
	Valid Loss: 0.575	Valid Acc: 74.77%
	Model parameters saved to multi_head_pe.pt


Epoch 05: 100%|██████████| 264/264 [00:05<00:00, 46.83it/s, Loss=0.456, Acc=0.762]


	Train Loss: 0.471	Train Acc: 83.99%
	Valid Loss: 0.517	Valid Acc: 78.21%
	Model parameters saved to multi_head_pe.pt


Epoch 06: 100%|██████████| 264/264 [00:05<00:00, 48.29it/s, Loss=0.438, Acc=0.857]


	Train Loss: 0.391	Train Acc: 87.10%
	Valid Loss: 0.472	Valid Acc: 79.82%
	Model parameters saved to multi_head_pe.pt


Epoch 07: 100%|██████████| 264/264 [00:05<00:00, 46.56it/s, Loss=0.281, Acc=0.905]


	Train Loss: 0.331	Train Acc: 88.91%
	Valid Loss: 0.444	Valid Acc: 80.50%
	Model parameters saved to multi_head_pe.pt


Epoch 08: 100%|██████████| 264/264 [00:05<00:00, 47.09it/s, Loss=0.318, Acc=0.905]


	Train Loss: 0.289	Train Acc: 90.20%
	Valid Loss: 0.423	Valid Acc: 81.88%
	Model parameters saved to multi_head_pe.pt


Epoch 09: 100%|██████████| 264/264 [00:05<00:00, 47.39it/s, Loss=0.115, Acc=1]    


	Train Loss: 0.259	Train Acc: 91.07%
	Valid Loss: 0.414	Valid Acc: 81.77%
	Model parameters saved to multi_head_pe.pt


Epoch 10: 100%|██████████| 264/264 [00:05<00:00, 46.85it/s, Loss=0.116, Acc=1]    


	Train Loss: 0.236	Train Acc: 91.79%
	Valid Loss: 0.408	Valid Acc: 82.11%
	Model parameters saved to multi_head_pe.pt


Epoch 11: 100%|██████████| 264/264 [00:05<00:00, 47.88it/s, Loss=0.171, Acc=0.905]


	Train Loss: 0.219	Train Acc: 92.36%
	Valid Loss: 0.406	Valid Acc: 81.88%
	Model parameters saved to multi_head_pe.pt


Epoch 12: 100%|██████████| 264/264 [00:05<00:00, 47.38it/s, Loss=0.174, Acc=1]    


	Train Loss: 0.205	Train Acc: 92.82%
	Valid Loss: 0.406	Valid Acc: 81.65%


Epoch 13: 100%|██████████| 264/264 [00:05<00:00, 47.61it/s, Loss=0.165, Acc=0.952]


	Train Loss: 0.193	Train Acc: 93.31%
	Valid Loss: 0.409	Valid Acc: 82.34%


Epoch 14: 100%|██████████| 264/264 [00:05<00:00, 50.08it/s, Loss=0.0742, Acc=1]   


	Train Loss: 0.183	Train Acc: 93.55%
	Valid Loss: 0.413	Valid Acc: 82.57%


Epoch 15: 100%|██████████| 264/264 [00:05<00:00, 47.55it/s, Loss=0.127, Acc=0.952]


	Train Loss: 0.175	Train Acc: 93.87%
	Valid Loss: 0.417	Valid Acc: 82.11%


Epoch 16: 100%|██████████| 264/264 [00:05<00:00, 47.79it/s, Loss=0.423, Acc=0.857] 


	Train Loss: 0.168	Train Acc: 94.07%
	Valid Loss: 0.425	Valid Acc: 82.45%


Epoch 17: 100%|██████████| 264/264 [00:05<00:00, 47.88it/s, Loss=0.0968, Acc=1]   


	Train Loss: 0.162	Train Acc: 94.32%
	Valid Loss: 0.430	Valid Acc: 82.11%


Epoch 18: 100%|██████████| 264/264 [00:05<00:00, 47.36it/s, Loss=0.127, Acc=0.952] 


	Train Loss: 0.156	Train Acc: 94.54%
	Valid Loss: 0.440	Valid Acc: 82.45%


Epoch 19: 100%|██████████| 264/264 [00:05<00:00, 48.09it/s, Loss=0.108, Acc=0.952] 


	Train Loss: 0.151	Train Acc: 94.69%
	Valid Loss: 0.444	Valid Acc: 82.45%


Epoch 20: 100%|██████████| 264/264 [00:05<00:00, 48.03it/s, Loss=0.0367, Acc=1]    


	Train Loss: 0.147	Train Acc: 94.80%
	Valid Loss: 0.454	Valid Acc: 82.00%


### Load best model to evaluate

In [44]:
# Replace the final-epoch weights with the parameters saved to
# multi_head_pe_filename during training, so the test evaluation uses the saved
# checkpoint rather than the state after epoch 20.
learner.load_model_params(multi_head_pe_filename)

In [45]:
# Print the loss and accuracy of the reloaded positional-encoding model on the
# test set.
learner.print_test_results()

	 Test Loss: 0.398	 Test Acc: 81.60%
