In [1]:
import torch
from torch import nn
from torch.optim import Adam
import pandas as pd

from data import Vocabulary, get_dataloader
from models import WordAveragingModel, AttentionWeightedWordAveragingModel, UAttention, dot_product_self_attention, MultiHeadSelfAttentionModel
from learner import SentimentLearner

# Fix the PyTorch RNG seed for this run.
torch.manual_seed(41)
# BCEWithLogitsLoss applies the sigmoid internally, so it is fed raw logits.
# This single instance is passed to every SentimentLearner below.
loss_fn = nn.BCEWithLogitsLoss()

### Hyperparameters

In [2]:
# Settings shared by every model section below; the learning rate is the only
# value chosen per section (see each SentimentLearner call).
BATCH_SIZE = 256  # batch_size passed to get_dataloader for each split
EMBED_DIM = 256  # passed as embed_dim / model_dim to the model constructors
EMBED_DROPOUT = 0.25  # passed as embed_dropout to the model constructors
OPTIM_CLS = Adam  # optimizer class (not an instance) handed to SentimentLearner
EPOCHS = 15  # passed as epochs to learner.train

### Load data

In [3]:
%%time
# Vocabulary() is constructed without arguments; in the recorded run this took
# about 31.6 s, so the cell is timed. The same instance is shared by all loaders.
vocab = Vocabulary()

CPU times: user 31.6 s, sys: 12.1 ms, total: 31.6 s
Wall time: 31.6 s


In [4]:
FILENAME = 'senti.{}.tsv'

# One loader per split file (senti.train.tsv, senti.dev.tsv, senti.test.tsv),
# created in that order with the shared vocabulary and batch size.
train_loader, valid_loader, test_loader = (
    get_dataloader(FILENAME.format(split), vocab, batch_size=BATCH_SIZE)
    for split in ('train', 'dev', 'test')
)

# Word averaging model

### Load model and learner

In [5]:
# Baseline model for this section, sized to the vocabulary and using the shared
# embedding width and dropout.
word_avg = WordAveragingModel(len(vocab), embed_dim=EMBED_DIM, embed_dropout=EMBED_DROPOUT)

In [6]:
# `learner` is rebound in every model section; from here until the next
# section it wraps word_avg. Learning rate for this model: 2.5e-4.
learner = SentimentLearner(
    model=word_avg,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=2.5e-4
)

### Train model

In [7]:
# Checkpoint path for this model. Judging by the training logs in this notebook,
# train() appears to save to `filename` only after epochs where the validation
# loss improves - confirm in learner.py.
word_avg_filename = 'word_avg.pt'
learner.train(epochs=EPOCHS, filename=word_avg_filename)

Epoch : 01	Wall time : 15.261s
	Train Loss: 0.681 | Train Acc: 56.34%
	Valid Loss: 0.664 | Valid Acc: 55.82%
	Model parameters saved to word_avg.pt
Epoch : 02	Wall time : 15.290s
	Train Loss: 0.639 | Train Acc: 66.15%
	Valid Loss: 0.606 | Valid Acc: 72.76%
	Model parameters saved to word_avg.pt
Epoch : 03	Wall time : 15.306s
	Train Loss: 0.578 | Train Acc: 76.41%
	Valid Loss: 0.544 | Valid Acc: 76.08%
	Model parameters saved to word_avg.pt
Epoch : 04	Wall time : 15.125s
	Train Loss: 0.521 | Train Acc: 80.83%
	Valid Loss: 0.498 | Valid Acc: 77.92%
	Model parameters saved to word_avg.pt
Epoch : 05	Wall time : 15.352s
	Train Loss: 0.470 | Train Acc: 83.91%
	Valid Loss: 0.471 | Valid Acc: 79.28%
	Model parameters saved to word_avg.pt
Epoch : 06	Wall time : 15.476s
	Train Loss: 0.427 | Train Acc: 86.16%
	Valid Loss: 0.450 | Valid Acc: 79.54%
	Model parameters saved to word_avg.pt
Epoch : 07	Wall time : 15.322s
	Train Loss: 0.393 | Train Acc: 87.50%
	Valid Loss: 0.435 | Valid Acc: 81.27%
	Mo

### Load best model to evaluate

In [8]:
# Restore the parameters that training wrote to word_avg.pt before scoring.
learner.load_model_params(word_avg_filename)

In [9]:
# Score the restored model on the test loader; test_acc is multiplied by 100
# for the percentage display.
test_loss, test_acc = learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.404 | Test Acc: 82.66%


## Norm of word embeddings

In [10]:
# NOTE(review): this reads `model.word_embedding` and hands it straight to
# torch.linalg.norm, so it must be a plain tensor here; the attention sections
# below read `model.embedding.weight` instead - confirm the attribute on
# WordAveragingModel.
word_embedding = learner.model.word_embedding
# Vector norm along dim=1 (one value per embedding row), labelled with the
# vocabulary strings and sorted ascending.
norms = pd.Series(torch.linalg.norm(word_embedding, dim=1).cpu(), index=vocab.itos).sort_values()

In [11]:
# `norms` is sorted ascending, so the tail holds the 15 largest embedding norms.
norms.tail(15)

too           5.555303
touching      5.571635
stupid        5.581569
enjoyable     5.595304
dull          5.607146
terrific      5.646525
remarkable    5.671368
hilarious     5.692608
best          5.705451
flat          5.868037
powerful      5.908876
solid         5.989082
mess          6.032230
bad           6.716216
worst         7.029595
dtype: float32

In [12]:
# `norms` is sorted ascending, so the head holds the 15 smallest embedding norms.
norms.head(15)

<pad>              0.000000
<unk>              0.017731
play-doh           0.099339
mikes              0.100724
boom               0.101232
conjured           0.101493
liman              0.102587
flck               0.104340
helpful            0.105775
happily-ever       0.106357
schnieder          0.107072
the                0.108152
passages           0.109647
naipaul            0.109693
post-production    0.109976
dtype: float32

# Attention weighted word averaging model
## w/ cosine similarity attention

### Load model and learner

In [13]:
# Attention-weighted averaging with a UAttention(EMBED_DIM) module as the
# attention component; UAttention exposes cosine_similarity_to_u, which the
# analysis cells further down rely on.
cos_att = AttentionWeightedWordAveragingModel(len(vocab), embed_dim=EMBED_DIM, attention=UAttention(EMBED_DIM), embed_dropout=EMBED_DROPOUT)

In [14]:
# Rebinds `learner` to the cos_att model (the previous learner is no longer
# reachable under this name). Same learning rate as the baseline: 2.5e-4.
learner = SentimentLearner(
    model=cos_att,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=2.5e-4
)

### Train model

In [15]:
# Checkpoint path for this model; in the log below, epoch 7 (validation loss
# 0.418 after 0.417) prints no "saved" line, which suggests train() only
# checkpoints on validation-loss improvement - confirm in learner.py.
cos_att_filename = 'cos_att.pt'
learner.train(epochs=EPOCHS, filename=cos_att_filename)

Epoch : 01	Wall time : 15.795s
	Train Loss: 0.681 | Train Acc: 56.72%
	Valid Loss: 0.662 | Valid Acc: 56.66%
	Model parameters saved to cos_att.pt
Epoch : 02	Wall time : 15.648s
	Train Loss: 0.634 | Train Acc: 69.50%
	Valid Loss: 0.589 | Valid Acc: 75.53%
	Model parameters saved to cos_att.pt
Epoch : 03	Wall time : 15.721s
	Train Loss: 0.569 | Train Acc: 79.09%
	Valid Loss: 0.535 | Valid Acc: 74.94%
	Model parameters saved to cos_att.pt
Epoch : 04	Wall time : 15.506s
	Train Loss: 0.507 | Train Acc: 82.84%
	Valid Loss: 0.477 | Valid Acc: 77.91%
	Model parameters saved to cos_att.pt
Epoch : 05	Wall time : 15.657s
	Train Loss: 0.455 | Train Acc: 85.34%
	Valid Loss: 0.458 | Valid Acc: 78.31%
	Model parameters saved to cos_att.pt
Epoch : 06	Wall time : 15.495s
	Train Loss: 0.412 | Train Acc: 87.13%
	Valid Loss: 0.417 | Valid Acc: 81.05%
	Model parameters saved to cos_att.pt
Epoch : 07	Wall time : 15.816s
	Train Loss: 0.378 | Train Acc: 88.28%
	Valid Loss: 0.418 | Valid Acc: 80.88%

Epoch : 

### Load best model to evaluate

In [16]:
# Restore the parameters that training wrote to cos_att.pt before scoring.
learner.load_model_params(cos_att_filename)

In [17]:
# Score the restored cos_att model on the test loader; test_acc is multiplied
# by 100 for the percentage display.
test_loss, test_acc = learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.391 | Test Acc: 83.71%


## Cosine similarities between vector u and word embeddings

In [18]:
# `learner` still wraps cos_att here, so this is the UAttention module.
att_layer = learner.model.attention
# Raw embedding weights, read through .data (no autograd tracking).
embeddings = learner.model.embedding.weight.data

# One similarity value per embedding row (the Series is indexed by vocab.itos,
# so the lengths must match), sorted ascending.
cosine_similarities = pd.Series(att_layer.cosine_similarity_to_u(embeddings).detach().cpu(), index=vocab.itos).sort_values()

In [19]:
# Sorted ascending, so the tail holds the 15 highest cosine similarities to u.
cosine_similarities.tail(15)

buñuel           0.909628
irrelevancy      0.912352
seventeen        0.912658
batman           0.916255
litmus           0.917946
single-minded    0.922519
semen            0.929406
spider           0.936180
substances       0.943009
reduced          0.943857
fluffy           0.945651
detract          0.956421
disappoint       0.957859
ignorant         0.968111
overrun          0.973900
dtype: float32

In [20]:
# Sorted ascending, so the head holds the 15 lowest (most negative) cosine
# similarities to u.
cosine_similarities.head(15)

's          -0.997821
his         -0.996826
the         -0.996709
abroad      -0.996542
maintain    -0.995989
can         -0.994987
earnhart    -0.994704
across      -0.994373
attempted   -0.994143
change      -0.993333
feel        -0.993191
readily     -0.993008
-           -0.993005
its         -0.992905
mores       -0.992865
dtype: float32

## Attention variance among frequent words in the training set

In [21]:
import bisect
from collections import defaultdict

# Frequency threshold separating "frequent" words from the rest.
MAX_FREQ = 100
# bisect_right on the reversed list counts the entries with frequency
# <= MAX_FREQ, so upper_bound = len(vocab) minus that count.
# NOTE(review): this presumes vocab.freqs is sorted in descending order, is
# aligned with the token indices, and has len(vocab) entries, so that
# "index < upper_bound" means "frequency > MAX_FREQ" - confirm in data.py.
upper_bound = len(vocab) - bisect.bisect_right(vocab.freqs[::-1], MAX_FREQ)

@torch.no_grad()
def get_attention_stats(learner, loader=None, max_index=None, ignore_index=0):
    """Collect the attention weight of every selected token occurrence.

    Each batch of `loader` is run through the learner's embedding layer and
    then its attention layer. A token occurrence is selected when its index is
    below `max_index` and differs from `ignore_index`.

    Args:
        learner: object exposing `model.embedding`, `model.attention` and
            `device`.
        loader: iterable of `(sequences, labels)` batches; labels are ignored.
            Defaults to the notebook-level `train_loader`.
        max_index: exclusive upper bound on the token indices to keep.
            Defaults to the notebook-level `upper_bound`.
        ignore_index: token index that is never collected. The default of 0
            reproduces the original behaviour (presumably the padding index -
            confirm against Vocabulary).

    Returns:
        defaultdict mapping token index -> list of attention weights (Python
        floats), one entry per selected occurrence, in batch order.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working unchanged.
    if loader is None:
        loader = train_loader
    if max_index is None:
        max_index = upper_bound

    embedding_layer = learner.model.embedding
    attention_layer = learner.model.attention
    # NOTE(review): the model's train/eval mode is left as the caller set it;
    # if either layer contains dropout, call learner.model.eval() first.

    attentions = defaultdict(list)
    for sequences, _ in loader:
        sequences = sequences.to(learner.device)

        # Same mask as torch.where(sequences < bound, sequences, 0).bool(),
        # spelled out as the two conditions it encodes.
        mask = (sequences < max_index) & (sequences != ignore_index)
        # Assumes the attention layer returns one weight per token, shaped
        # like `sequences`, so that `mask` lines up with it - TODO confirm.
        attention = attention_layer(embedding_layer(sequences))

        token_ids = torch.masked_select(sequences, mask).tolist()
        weights = torch.masked_select(attention, mask).tolist()
        for token_id, weight in zip(token_ids, weights):
            attentions[token_id].append(weight)

    return attentions

In [22]:
%%time
# `learner` is whichever learner was bound last; at this point in a top-to-bottom
# run that is the cos_att learner. stats maps token index -> list of weights.
stats = get_attention_stats(learner)

CPU times: user 14.4 s, sys: 84.5 ms, total: 14.5 s
Wall time: 14.5 s


In [23]:
# Per-word mean and standard deviation of the collected attention weights.
# The rows are gathered in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# copied the whole frame on every iteration.
records = []
for k, v in stats.items():
    attentions = torch.tensor(v, dtype=torch.float32)
    records.append({'word': vocab.itos[k], 'mean': attentions.mean().item(), 'std': attentions.std().item()})
df = pd.DataFrame(records, columns=['word', 'mean', 'std'])

In [24]:
# Rank words by how much their attention weight varies (largest standard
# deviation first), index the table by word, and show the top 30.
df = df.sort_values('std', ascending=False).set_index('word', drop=True)
df.head(30)

Unnamed: 0_level_0,mean,std
word,Unnamed: 1_level_1,Unnamed: 2_level_1
barely,0.049742,0.008348
instead,0.049808,0.008089
cheap,0.048184,0.008065
waste,0.050206,0.007693
less,0.050108,0.007332
clichés,0.049925,0.007235
feels,0.046616,0.00718
neither,0.050422,0.007165
awful,0.05013,0.007164
impossible,0.045538,0.007145


# Attention weighted word averaging model
## w/ dot product self-attention

### Load model and learner

In [25]:
# Same model class as cos_att, but the attention component is the
# dot_product_self_attention callable (passed as-is, not instantiated).
dp_att = AttentionWeightedWordAveragingModel(len(vocab), embed_dim=EMBED_DIM, attention=dot_product_self_attention, embed_dropout=EMBED_DROPOUT)

In [26]:
# Rebinds `learner` to the dp_att model. Learning rate is 5e-5 here, lower
# than the 2.5e-4 used for the two models above.
learner = SentimentLearner(
    model=dp_att,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-5
)

### Train model

In [27]:
# Checkpoint path for this model, passed to train() as `filename`.
dp_att_filename = 'dp_att.pt'
learner.train(epochs=EPOCHS, filename=dp_att_filename)

Epoch : 01	Wall time : 15.595s
	Train Loss: 0.690 | Train Acc: 55.77%
	Valid Loss: 0.691 | Valid Acc: 50.64%
	Model parameters saved to dp_att.pt
Epoch : 02	Wall time : 15.748s
	Train Loss: 0.684 | Train Acc: 55.90%
	Valid Loss: 0.674 | Valid Acc: 51.80%
	Model parameters saved to dp_att.pt
Epoch : 03	Wall time : 15.556s
	Train Loss: 0.659 | Train Acc: 60.23%
	Valid Loss: 0.627 | Valid Acc: 66.14%
	Model parameters saved to dp_att.pt
Epoch : 04	Wall time : 15.628s
	Train Loss: 0.615 | Train Acc: 67.93%
	Valid Loss: 0.567 | Valid Acc: 72.80%
	Model parameters saved to dp_att.pt
Epoch : 05	Wall time : 15.665s
	Train Loss: 0.575 | Train Acc: 71.70%
	Valid Loss: 0.543 | Valid Acc: 73.74%
	Model parameters saved to dp_att.pt
Epoch : 06	Wall time : 15.546s
	Train Loss: 0.544 | Train Acc: 73.85%
	Valid Loss: 0.537 | Valid Acc: 74.28%
	Model parameters saved to dp_att.pt
Epoch : 07	Wall time : 15.673s
	Train Loss: 0.516 | Train Acc: 75.88%
	Valid Loss: 0.507 | Valid Acc: 75.38%
	Model paramete

### Load best model to evaluate

In [28]:
# Restore the parameters that training wrote to dp_att.pt before scoring.
learner.load_model_params(dp_att_filename)

In [29]:
# Score the restored dp_att model on the test loader; test_acc is multiplied
# by 100 for the percentage display.
test_loss, test_acc = learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.490 | Test Acc: 77.97%


# Attention weighted word averaging model
## w/ dot product self-attention
## adding residual connection

### Load model and learner

In [30]:
# Identical to dp_att except for res_conn=True, the residual-connection
# variant named in the section heading.
dp_att_rc = AttentionWeightedWordAveragingModel(len(vocab), embed_dim=EMBED_DIM, attention=dot_product_self_attention, res_conn=True, embed_dropout=EMBED_DROPOUT)

In [31]:
# Rebinds `learner` to the dp_att_rc model. Same learning rate as dp_att
# (5e-5), so the two runs differ only in the residual connection.
learner = SentimentLearner(
    model=dp_att_rc,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-5
)

### Train model

In [32]:
# Checkpoint path for this model, passed to train() as `filename`.
dp_att_rc_filename = 'dp_att_rc.pt'
learner.train(epochs=EPOCHS, filename=dp_att_rc_filename)

Epoch : 01	Wall time : 15.544s
	Train Loss: 0.691 | Train Acc: 55.74%
	Valid Loss: 0.687 | Valid Acc: 51.88%
	Model parameters saved to dp_att_rc.pt
Epoch : 02	Wall time : 15.694s
	Train Loss: 0.683 | Train Acc: 57.73%
	Valid Loss: 0.674 | Valid Acc: 53.25%
	Model parameters saved to dp_att_rc.pt
Epoch : 03	Wall time : 15.608s
	Train Loss: 0.660 | Train Acc: 63.53%
	Valid Loss: 0.629 | Valid Acc: 66.52%
	Model parameters saved to dp_att_rc.pt
Epoch : 04	Wall time : 15.586s
	Train Loss: 0.619 | Train Acc: 70.72%
	Valid Loss: 0.577 | Valid Acc: 73.97%
	Model parameters saved to dp_att_rc.pt
Epoch : 05	Wall time : 15.907s
	Train Loss: 0.576 | Train Acc: 74.61%
	Valid Loss: 0.542 | Valid Acc: 74.71%
	Model parameters saved to dp_att_rc.pt
Epoch : 06	Wall time : 15.737s
	Train Loss: 0.540 | Train Acc: 76.72%
	Valid Loss: 0.514 | Valid Acc: 75.92%
	Model parameters saved to dp_att_rc.pt
Epoch : 07	Wall time : 15.653s
	Train Loss: 0.510 | Train Acc: 78.43%
	Valid Loss: 0.511 | Valid Acc: 75.6

### Load best model to evaluate

In [33]:
# Restore the parameters that training wrote to dp_att_rc.pt before scoring.
learner.load_model_params(dp_att_rc_filename)

In [34]:
# Score the restored dp_att_rc model on the test loader; test_acc is
# multiplied by 100 for the percentage display.
test_loss, test_acc = learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.445 | Test Acc: 81.37%


# Transformer style attention model
## w/ single attention head

### Load model and learner

In [35]:
# MultiHeadSelfAttentionModel restricted to one head; note that the width is
# passed as model_dim here rather than embed_dim.
single_head = MultiHeadSelfAttentionModel(len(vocab), model_dim=EMBED_DIM, num_heads=1, embed_dropout=EMBED_DROPOUT)

In [36]:
# Rebinds `learner` to the single_head model. Learning rate is 5e-6, ten
# times smaller than the 5e-5 used for the dot-product attention models.
learner = SentimentLearner(
    model=single_head,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-6
)

### Train model

In [37]:
# Checkpoint path for this model, passed to train() as `filename`.
single_head_filename = 'single_head.pt'
learner.train(epochs=EPOCHS, filename=single_head_filename)

Epoch : 01	Wall time : 17.272s
	Train Loss: 0.704 | Train Acc: 48.00%
	Valid Loss: 0.689 | Valid Acc: 50.07%
	Model parameters saved to single_head.pt
Epoch : 02	Wall time : 17.317s
	Train Loss: 0.679 | Train Acc: 55.84%
	Valid Loss: 0.678 | Valid Acc: 51.35%
	Model parameters saved to single_head.pt
Epoch : 03	Wall time : 17.184s
	Train Loss: 0.659 | Train Acc: 57.69%
	Valid Loss: 0.644 | Valid Acc: 60.46%
	Model parameters saved to single_head.pt
Epoch : 04	Wall time : 17.317s
	Train Loss: 0.619 | Train Acc: 67.02%
	Valid Loss: 0.591 | Valid Acc: 73.74%
	Model parameters saved to single_head.pt
Epoch : 05	Wall time : 17.178s
	Train Loss: 0.565 | Train Acc: 74.97%
	Valid Loss: 0.537 | Valid Acc: 77.32%
	Model parameters saved to single_head.pt
Epoch : 06	Wall time : 17.166s
	Train Loss: 0.508 | Train Acc: 79.69%
	Valid Loss: 0.493 | Valid Acc: 77.14%
	Model parameters saved to single_head.pt
Epoch : 07	Wall time : 17.062s
	Train Loss: 0.454 | Train Acc: 83.10%
	Valid Loss: 0.452 | Val

### Load best model to evaluate

In [38]:
# Restore the parameters that training wrote to single_head.pt before scoring.
learner.load_model_params(single_head_filename)

In [39]:
# Score the restored single_head model on the test loader; test_acc is
# multiplied by 100 for the percentage display.
test_loss, test_acc = learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.415 | Test Acc: 81.46%


# Transformer style attention model
## w/ single attention head & positional encoding

### Load model and learner

In [40]:
# Same as single_head plus pos_encode=True, the positional-encoding variant
# named in the section heading.
single_head_pe = MultiHeadSelfAttentionModel(len(vocab), model_dim=EMBED_DIM, num_heads=1, pos_encode=True, embed_dropout=EMBED_DROPOUT)

In [41]:
# Rebinds `learner` to the single_head_pe model. Learning rate is 1e-5, not
# the 5e-6 used for single_head, so the two runs differ in more than the
# positional encoding.
learner = SentimentLearner(
    model=single_head_pe,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=1e-5
)

### Train model

In [42]:
# Checkpoint path for this model. In the log below several epochs print no
# "saved" line; those are epochs whose validation loss did not improve, so
# train() appears to checkpoint only on improvement - confirm in learner.py.
single_head_pe_filename = 'single_head_pe.pt'
learner.train(epochs=EPOCHS, filename=single_head_pe_filename)

Epoch : 01	Wall time : 17.306s
	Train Loss: 0.690 | Train Acc: 54.63%
	Valid Loss: 0.696 | Valid Acc: 50.64%
	Model parameters saved to single_head_pe.pt
Epoch : 02	Wall time : 17.248s
	Train Loss: 0.687 | Train Acc: 55.83%
	Valid Loss: 0.704 | Valid Acc: 50.07%

Epoch : 03	Wall time : 17.111s
	Train Loss: 0.686 | Train Acc: 55.85%
	Valid Loss: 0.700 | Valid Acc: 51.21%

Epoch : 04	Wall time : 17.322s
	Train Loss: 0.686 | Train Acc: 55.80%
	Valid Loss: 0.699 | Valid Acc: 49.78%

Epoch : 05	Wall time : 17.249s
	Train Loss: 0.686 | Train Acc: 55.79%
	Valid Loss: 0.696 | Valid Acc: 51.07%
	Model parameters saved to single_head_pe.pt
Epoch : 06	Wall time : 17.136s
	Train Loss: 0.685 | Train Acc: 55.77%
	Valid Loss: 0.697 | Valid Acc: 50.92%

Epoch : 07	Wall time : 17.169s
	Train Loss: 0.685 | Train Acc: 55.71%
	Valid Loss: 0.696 | Valid Acc: 51.49%
	Model parameters saved to single_head_pe.pt
Epoch : 08	Wall time : 17.048s
	Train Loss: 0.684 | Train Acc: 55.81%
	Valid Loss: 0.695 | Valid A

### Load best model to evaluate

In [43]:
# Restore the parameters that training wrote to single_head_pe.pt before scoring.
learner.load_model_params(single_head_pe_filename)

In [44]:
# Score the restored single_head_pe model on the test loader; test_acc is
# multiplied by 100 for the percentage display.
test_loss, test_acc = learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.624 | Test Acc: 66.83%


# Transformer style attention model
## w/ multiple attention heads

### Load model and learner

In [45]:
# Same as single_head but with num_heads=4 (no positional encoding).
multi_head = MultiHeadSelfAttentionModel(len(vocab), model_dim=EMBED_DIM, num_heads=4, embed_dropout=EMBED_DROPOUT)

In [46]:
# Rebinds `learner` to the multi_head model. Same learning rate as
# single_head (5e-6), so the two runs differ only in the number of heads.
learner = SentimentLearner(
    model=multi_head,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-6
)

### Train model

In [47]:
# Checkpoint path for this model, passed to train() as `filename`.
multi_head_filename = 'multi_head.pt'
learner.train(epochs=EPOCHS, filename=multi_head_filename)

Epoch : 01	Wall time : 17.186s
	Train Loss: 0.779 | Train Acc: 44.18%
	Valid Loss: 0.704 | Valid Acc: 48.93%
	Model parameters saved to multi_head.pt
Epoch : 02	Wall time : 17.107s
	Train Loss: 0.700 | Train Acc: 48.29%
	Valid Loss: 0.680 | Valid Acc: 57.91%
	Model parameters saved to multi_head.pt
Epoch : 03	Wall time : 17.198s
	Train Loss: 0.670 | Train Acc: 60.78%
	Valid Loss: 0.659 | Valid Acc: 54.55%
	Model parameters saved to multi_head.pt
Epoch : 04	Wall time : 17.185s
	Train Loss: 0.639 | Train Acc: 64.80%
	Valid Loss: 0.621 | Valid Acc: 64.51%
	Model parameters saved to multi_head.pt
Epoch : 05	Wall time : 17.163s
	Train Loss: 0.590 | Train Acc: 75.80%
	Valid Loss: 0.562 | Valid Acc: 76.14%
	Model parameters saved to multi_head.pt
Epoch : 06	Wall time : 17.064s
	Train Loss: 0.537 | Train Acc: 80.22%
	Valid Loss: 0.515 | Valid Acc: 77.22%
	Model parameters saved to multi_head.pt
Epoch : 07	Wall time : 17.066s
	Train Loss: 0.488 | Train Acc: 82.88%
	Valid Loss: 0.491 | Valid Acc

### Load best model to evaluate

In [48]:
# Restore the parameters that training wrote to multi_head.pt before scoring.
learner.load_model_params(multi_head_filename)

In [49]:
# Score the restored multi_head model on the test loader; test_acc is
# multiplied by 100 for the percentage display.
test_loss, test_acc = learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.395 | Test Acc: 83.52%
