In [1]:
import bisect
from collections import defaultdict

import pandas as pd
import torch
from torch import nn
from torch.optim import Adam

from data import Vocabulary, SentiDataset, get_dataloader
from models import WordAveragingModel, AttentionWeightedWordAveragingModel, UAttention, dot_product_self_attention, MultiHeadSelfAttentionModel
from learner import SentimentLearner

# Seed torch's global RNG once for the whole notebook. Later torch random draws
# consume from this single stream, so the recorded numbers are only reproducible
# when the cells are run top to bottom from a fresh kernel.
torch.manual_seed(41)
# Loss shared by every learner below. BCEWithLogitsLoss applies the sigmoid
# internally, so it is meant to be fed raw logits.
loss_fn = nn.BCEWithLogitsLoss()

### Hyperparameters

In [2]:
# Passed as batch_size to every get_dataloader call.
BATCH_SIZE = 256
# Passed as embed_dim (averaging models) or model_dim (multi-head models).
EMBED_DIM = 256
# Passed as embed_dropout to every model constructor.
EMBED_DROPOUT = 0.25
# Optimizer class handed to every SentimentLearner as optim_cls.
OPTIM_CLS = Adam

### Load data

In [3]:
%%time
# Vocabulary shared by the three datasets and all models below
# (about 35 s in the recorded run). The cell magic must stay on the first line.
vocab = Vocabulary()

CPU times: user 34.5 s, sys: 3.31 ms, total: 34.6 s
Wall time: 34.6 s


In [4]:
# Template for the per-split TSV files; the validation split is stored as 'dev'.
FILENAME = 'senti.{}.tsv'

# Build the datasets in train -> dev -> test order, all sharing the same vocabulary.
train_set, valid_set, test_set = [
    SentiDataset(FILENAME.format(split), vocab)
    for split in ('train', 'dev', 'test')
]

# One loader per dataset, all using the notebook-wide batch size.
train_loader, valid_loader, test_loader = [
    get_dataloader(dataset, batch_size=BATCH_SIZE)
    for dataset in (train_set, valid_set, test_set)
]

# Word averaging model

### Load model and learner

In [5]:
# Model for the word-averaging section, sized to the vocabulary.
word_avg = WordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    embed_dropout=EMBED_DROPOUT,
)

In [6]:
# Binds the notebook-wide `learner` name to a learner wrapping `word_avg`.
# Every later section rebinds `learner`, so the cells of this section that read
# it (train / load / evaluate / embedding norms) must run before the next section.
learner = SentimentLearner(
    model=word_avg,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=2.5e-4
)

### Train model

In [7]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
word_avg_filename = 'word_avg.pt'
# Train for 15 epochs, checkpointing to the file above.
learner.train(epochs=15, filename=word_avg_filename)

Epoch : 01	Wall time : 20.357s
	Train Loss: 0.681 | Train Acc: 56.31%
	Valid Loss: 0.665 | Valid Acc: 55.16%
	Model parameters saved to word_avg.pt
Epoch : 02	Wall time : 20.557s
	Train Loss: 0.639 | Train Acc: 66.08%
	Valid Loss: 0.606 | Valid Acc: 72.71%
	Model parameters saved to word_avg.pt
Epoch : 03	Wall time : 20.140s
	Train Loss: 0.579 | Train Acc: 76.41%
	Valid Loss: 0.546 | Valid Acc: 76.49%
	Model parameters saved to word_avg.pt
Epoch : 04	Wall time : 20.379s
	Train Loss: 0.521 | Train Acc: 80.80%
	Valid Loss: 0.503 | Valid Acc: 77.98%
	Model parameters saved to word_avg.pt
Epoch : 05	Wall time : 20.117s
	Train Loss: 0.470 | Train Acc: 83.92%
	Valid Loss: 0.471 | Valid Acc: 79.24%
	Model parameters saved to word_avg.pt
Epoch : 06	Wall time : 20.334s
	Train Loss: 0.428 | Train Acc: 86.10%
	Valid Loss: 0.449 | Valid Acc: 80.16%
	Model parameters saved to word_avg.pt
Epoch : 07	Wall time : 20.493s
	Train Loss: 0.394 | Train Acc: 87.53%
	Valid Loss: 0.435 | Valid Acc: 80.96%
	Mo

### Load best model to evaluate

In [8]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(word_avg_filename)

In [9]:
# Score the reloaded word-averaging model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.405 | Test Acc: 82.70%


## Norm of word embeddings

In [10]:
# Per-word L2 norm of the trained embedding rows, labelled by token and sorted ascending.
# NOTE(review): assumes `learner` still wraps `word_avg` and that `word_embedding`
# is the 2-D embedding weight tensor (one row per vocabulary entry) — confirm in models.py.
word_embedding = learner.model.word_embedding
norms = pd.Series(
    # Convert explicitly to a NumPy array (detached, on CPU) instead of relying on
    # pandas' implicit handling of torch tensors; matches the cosine-similarity cell.
    torch.linalg.norm(word_embedding, dim=1).detach().cpu().numpy(),
    index=vocab.itos,
).sort_values()

In [11]:
# `norms` is sorted ascending, so the tail holds the 15 largest embedding norms.
norms.tail(15)

too           5.560832
touching      5.573279
enjoyable     5.582615
stupid        5.585888
dull          5.610507
terrific      5.646167
remarkable    5.674034
hilarious     5.688003
best          5.706871
flat          5.862432
powerful      5.901927
solid         5.991998
mess          6.037328
bad           6.715269
worst         7.032340
dtype: float32

In [12]:
# `norms` is sorted ascending, so the head holds the 15 smallest embedding norms.
norms.head(15)

<pad>       0.000000
<unk>       0.017731
mikes       0.098789
boom        0.102556
the         0.105607
conjured    0.106184
liman       0.106737
helpful     0.106941
nubile      0.108074
naipaul     0.108272
examine     0.108533
play-doh    0.109337
mothman     0.110031
amassed     0.110183
flesh       0.110927
dtype: float32

# Attention weighted word averaging model
## w/ cosine similarity attention

### Load model and learner

In [13]:
# Attention-weighted averaging model using a UAttention layer of width EMBED_DIM.
cos_att = AttentionWeightedWordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    attention=UAttention(EMBED_DIM),
    embed_dropout=EMBED_DROPOUT,
)

In [14]:
# Rebinds `learner` to wrap `cos_att`; loaders, loss, optimizer class and
# learning rate (2.5e-4) are the same as for the word-averaging run.
learner = SentimentLearner(
    model=cos_att,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=2.5e-4
)

### Train model

In [15]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
cos_att_filename = 'cos_att.pt'
# Train for 15 epochs, checkpointing to the file above.
learner.train(epochs=15, filename=cos_att_filename)

Epoch : 01	Wall time : 21.103s
	Train Loss: 0.681 | Train Acc: 56.69%
	Valid Loss: 0.662 | Valid Acc: 56.42%
	Model parameters saved to cos_att.pt
Epoch : 02	Wall time : 21.193s
	Train Loss: 0.634 | Train Acc: 69.38%
	Valid Loss: 0.589 | Valid Acc: 75.34%
	Model parameters saved to cos_att.pt
Epoch : 03	Wall time : 21.499s
	Train Loss: 0.570 | Train Acc: 78.95%
	Valid Loss: 0.528 | Valid Acc: 76.49%
	Model parameters saved to cos_att.pt
Epoch : 04	Wall time : 21.535s
	Train Loss: 0.508 | Train Acc: 82.79%
	Valid Loss: 0.482 | Valid Acc: 77.52%
	Model parameters saved to cos_att.pt
Epoch : 05	Wall time : 21.308s
	Train Loss: 0.455 | Train Acc: 85.31%
	Valid Loss: 0.452 | Valid Acc: 78.56%
	Model parameters saved to cos_att.pt
Epoch : 06	Wall time : 21.191s
	Train Loss: 0.413 | Train Acc: 87.06%
	Valid Loss: 0.432 | Valid Acc: 79.82%
	Model parameters saved to cos_att.pt
Epoch : 07	Wall time : 21.425s
	Train Loss: 0.378 | Train Acc: 88.26%
	Valid Loss: 0.426 | Valid Acc: 80.62%
	Model pa

### Load best model to evaluate

In [16]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(cos_att_filename)

In [17]:
# Score the reloaded cosine-attention model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.408 | Test Acc: 82.59%


## Cosine similarities between vector u and word embeddings

In [18]:
# NOTE(review): assumes `learner` still wraps `cos_att`, whose attention layer
# exposes `cosine_similarity_to_u` — run before a later section rebinds `learner`.
att_layer = learner.model.attention
# detach() gives a gradient-free view of the embedding matrix; it replaces the
# legacy `.data` attribute and reads the same values.
embeddings = learner.model.embedding.weight.detach()

# One similarity per vocabulary entry, labelled by token and sorted ascending.
# The result is converted to NumPy explicitly rather than handing a torch
# tensor to pandas.
cosine_similarities = pd.Series(
    att_layer.cosine_similarity_to_u(embeddings).detach().cpu().numpy(),
    index=vocab.itos,
).sort_values()

In [19]:
# Sorted ascending, so the tail holds the 15 words most similar to u.
cosine_similarities.tail(15)

buñuel           0.906117
batman           0.916345
single-minded    0.920338
framing          0.920856
litmus           0.921431
spider           0.922717
semen            0.926541
seventeen        0.929638
substances       0.939480
reduced          0.947320
detract          0.950214
fluffy           0.951300
ignorant         0.958962
disappoint       0.959875
overrun          0.971206
dtype: float32

In [20]:
# Sorted ascending, so the head holds the 15 words with the most negative similarity to u.
cosine_similarities.head(15)

's          -0.997794
maintain    -0.997158
abroad      -0.996859
the         -0.996841
his         -0.996395
attempted   -0.994953
can         -0.994933
across      -0.994708
readily     -0.994629
change      -0.993961
earnhart    -0.992934
clubs       -0.992809
hoffman     -0.992687
its         -0.992424
--          -0.992422
dtype: float32

## Attention variance among frequent words in the training set

In [21]:
# Frequency threshold: despite the name, words are kept when they occur MORE
# than MAX_FREQ times (see `upper_bound` below).
MAX_FREQ = 100
# bisect_right on the reversed frequency list counts the entries with
# frequency <= MAX_FREQ; subtracting from the vocabulary size leaves the number
# of leading indices whose frequency exceeds MAX_FREQ.
# NOTE(review): assumes vocab.freqs is aligned with the token indices and sorted
# in descending order, so the reversed list is ascending as bisect requires —
# confirm in data.py.
upper_bound = len(vocab) - bisect.bisect_right(vocab.freqs[::-1], MAX_FREQ)


@torch.no_grad()
def get_attention_stats(learner, loader=None):
    """Collect the attention weight of every occurrence of a frequent word.

    Args:
        learner: object exposing `model.embedding`, `model.attention` and `device`.
        loader: iterable of `(sequences, labels)` batches. Defaults to the
            notebook-level `train_loader`, which was the only option before.

    Returns:
        defaultdict mapping a token index to the list of attention weights
        (Python floats) observed for it, one entry per occurrence.

    NOTE(review): the layers are called directly and the model's train/eval
    mode is left untouched — confirm the attention layer has no
    train-only behaviour.
    """
    if loader is None:
        loader = train_loader

    embedding_layer = learner.model.embedding
    attention_layer = learner.model.attention
    attentions = defaultdict(list)

    for sequences, _ in loader:
        sequences = sequences.to(learner.device)

        # Keep positions holding a frequent token (index below the cut-off),
        # excluding index 0 (presumably <pad> — confirm in data.py).
        mask = (sequences < upper_bound) & (sequences != 0)
        attention = attention_layer(embedding_layer(sequences))

        # Flatten the selected positions so token ids and weights pair up one-to-one.
        token_ids = torch.masked_select(sequences, mask).tolist()
        weights = torch.masked_select(attention, mask).tolist()
        for token_id, weight in zip(token_ids, weights):
            attentions[token_id].append(weight)

    return attentions

In [22]:
%%time
# Uses whichever learner `learner` is currently bound to; this section follows
# the cosine-attention run, so it must execute before a later section rebinds it.
stats = get_attention_stats(learner)

CPU times: user 15.2 s, sys: 32 ms, total: 15.3 s
Wall time: 15.2 s


In [23]:
# One summary row per frequent word: mean and standard deviation of the
# attention weights it received. Rows are collected in a list and the frame is
# built once, because DataFrame.append copied the whole frame on every call
# and was removed in pandas 2.0.
rows = []
for token_id, weights in stats.items():
    weight_tensor = torch.tensor(weights)
    rows.append({
        'word': vocab.itos[token_id],
        'mean': weight_tensor.mean().item(),
        # std() is NaN for a word seen only once (sample estimator, n - 1 = 0).
        'std': weight_tensor.std().item(),
    })
df = pd.DataFrame(rows, columns=['word', 'mean', 'std'])

In [24]:
# Rank words by attention variability (largest std first) and label rows by word.
# Re-running this cell without rebuilding `df` fails: 'word' is no longer a column.
df = df.sort_values('std', ascending=False).set_index('word', drop=True)
df.head(30)

Unnamed: 0_level_0,mean,std
word,Unnamed: 1_level_1,Unnamed: 2_level_1
barely,0.049382,0.008265
instead,0.049393,0.008019
cheap,0.047593,0.007967
waste,0.0498,0.007646
less,0.04964,0.00728
clichés,0.049397,0.007139
left,0.047434,0.007084
feels,0.046004,0.007079
awful,0.049558,0.007071
neither,0.049954,0.007061


# Attention weighted word averaging model
## w/ dot product self-attention

### Load model and learner

In [25]:
# Attention-weighted averaging model using the dot_product_self_attention callable.
dp_att = AttentionWeightedWordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    attention=dot_product_self_attention,
    embed_dropout=EMBED_DROPOUT,
)

In [26]:
# Rebinds `learner` to wrap `dp_att`. The learning rate (5e-5) is lower than
# the 2.5e-4 used for the two models above; everything else is unchanged.
learner = SentimentLearner(
    model=dp_att,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-5
)

### Train model

In [27]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
dp_att_filename = 'dp_att.pt'
# Train for 15 epochs, checkpointing to the file above.
learner.train(epochs=15, filename=dp_att_filename)

Epoch : 01	Wall time : 19.229s
	Train Loss: 0.690 | Train Acc: 55.78%
	Valid Loss: 0.690 | Valid Acc: 50.92%
	Model parameters saved to dp_att.pt
Epoch : 02	Wall time : 19.918s
	Train Loss: 0.684 | Train Acc: 55.93%
	Valid Loss: 0.673 | Valid Acc: 51.95%
	Model parameters saved to dp_att.pt
Epoch : 03	Wall time : 19.813s
	Train Loss: 0.659 | Train Acc: 60.19%
	Valid Loss: 0.626 | Valid Acc: 65.60%
	Model parameters saved to dp_att.pt
Epoch : 04	Wall time : 20.113s
	Train Loss: 0.615 | Train Acc: 67.99%
	Valid Loss: 0.573 | Valid Acc: 72.25%
	Model parameters saved to dp_att.pt
Epoch : 05	Wall time : 20.303s
	Train Loss: 0.575 | Train Acc: 71.66%
	Valid Loss: 0.543 | Valid Acc: 73.85%
	Model parameters saved to dp_att.pt
Epoch : 06	Wall time : 20.363s
	Train Loss: 0.543 | Train Acc: 73.95%
	Valid Loss: 0.525 | Valid Acc: 74.66%
	Model parameters saved to dp_att.pt
Epoch : 07	Wall time : 20.750s
	Train Loss: 0.516 | Train Acc: 75.85%
	Valid Loss: 0.513 | Valid Acc: 75.11%
	Model paramete

### Load best model to evaluate

In [28]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(dp_att_filename)

In [29]:
# Score the reloaded dot-product-attention model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.484 | Test Acc: 77.70%


# Attention weighted word averaging model
## w/ dot product self-attention
## adding residual connection

### Load model and learner

In [30]:
# Same configuration as the plain dot-product attention model, plus res_conn=True.
dp_att_rc = AttentionWeightedWordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    attention=dot_product_self_attention,
    res_conn=True,
    embed_dropout=EMBED_DROPOUT,
)

In [31]:
# Rebinds `learner` to wrap `dp_att_rc`; same settings (lr=5e-5) as the
# dot-product attention run without the residual connection.
learner = SentimentLearner(
    model=dp_att_rc,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-5
)

### Train model

In [32]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
dp_att_rc_filename = 'dp_att_rc.pt'
# Train for 15 epochs, checkpointing to the file above.
learner.train(epochs=15, filename=dp_att_rc_filename)

Epoch : 01	Wall time : 20.527s
	Train Loss: 0.691 | Train Acc: 55.82%
	Valid Loss: 0.688 | Valid Acc: 50.92%
	Model parameters saved to dp_att_rc.pt
Epoch : 02	Wall time : 20.310s
	Train Loss: 0.683 | Train Acc: 57.68%
	Valid Loss: 0.671 | Valid Acc: 54.82%
	Model parameters saved to dp_att_rc.pt
Epoch : 03	Wall time : 20.543s
	Train Loss: 0.660 | Train Acc: 63.50%
	Valid Loss: 0.631 | Valid Acc: 65.83%
	Model parameters saved to dp_att_rc.pt
Epoch : 04	Wall time : 20.661s
	Train Loss: 0.619 | Train Acc: 70.64%
	Valid Loss: 0.577 | Valid Acc: 73.74%
	Model parameters saved to dp_att_rc.pt
Epoch : 05	Wall time : 20.043s
	Train Loss: 0.576 | Train Acc: 74.57%
	Valid Loss: 0.540 | Valid Acc: 74.66%
	Model parameters saved to dp_att_rc.pt
Epoch : 06	Wall time : 20.526s
	Train Loss: 0.540 | Train Acc: 76.64%
	Valid Loss: 0.520 | Valid Acc: 75.34%
	Model parameters saved to dp_att_rc.pt
Epoch : 07	Wall time : 20.515s
	Train Loss: 0.510 | Train Acc: 78.43%
	Valid Loss: 0.507 | Valid Acc: 75.6

### Load best model to evaluate

In [33]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(dp_att_rc_filename)

In [34]:
# Score the reloaded residual-connection model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.479 | Test Acc: 78.03%


# Transformer style attention model
## w/ single attention head

### Load model and learner

In [35]:
# Multi-head self-attention model restricted to a single head.
single_head = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=1,
    embed_dropout=EMBED_DROPOUT,
)

In [36]:
# Rebinds `learner` to wrap `single_head`. The learning rate (5e-6) is ten
# times lower than the 5e-5 used for the dot-product attention models.
learner = SentimentLearner(
    model=single_head,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-6
)

### Train model

In [37]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
single_head_filename = 'single_head.pt'
# Train for 20 epochs (the averaging models above used 15), checkpointing to the file above.
learner.train(epochs=20, filename=single_head_filename)

Epoch : 01	Wall time : 23.899s
	Train Loss: 0.704 | Train Acc: 48.00%
	Valid Loss: 0.688 | Valid Acc: 50.92%
	Model parameters saved to single_head.pt
Epoch : 02	Wall time : 23.832s
	Train Loss: 0.679 | Train Acc: 55.79%
	Valid Loss: 0.679 | Valid Acc: 50.92%
	Model parameters saved to single_head.pt
Epoch : 03	Wall time : 23.233s
	Train Loss: 0.659 | Train Acc: 57.61%
	Valid Loss: 0.644 | Valid Acc: 60.44%
	Model parameters saved to single_head.pt
Epoch : 04	Wall time : 23.588s
	Train Loss: 0.619 | Train Acc: 66.96%
	Valid Loss: 0.592 | Valid Acc: 73.85%
	Model parameters saved to single_head.pt
Epoch : 05	Wall time : 24.076s
	Train Loss: 0.565 | Train Acc: 74.95%
	Valid Loss: 0.539 | Valid Acc: 76.72%
	Model parameters saved to single_head.pt
Epoch : 06	Wall time : 23.817s
	Train Loss: 0.508 | Train Acc: 79.64%
	Valid Loss: 0.492 | Valid Acc: 77.18%
	Model parameters saved to single_head.pt
Epoch : 07	Wall time : 23.812s
	Train Loss: 0.454 | Train Acc: 83.10%
	Valid Loss: 0.461 | Val

### Load best model to evaluate

In [38]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(single_head_filename)

In [39]:
# Score the reloaded single-head model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.407 | Test Acc: 82.43%


# Transformer style attention model
## w/ single attention head & positional encoding

### Load model and learner

In [40]:
# Single-head configuration with pos_encode=True.
single_head_pe = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=1,
    pos_encode=True,
    embed_dropout=EMBED_DROPOUT,
)

In [41]:
# Rebinds `learner` to wrap `single_head_pe`; same settings (lr=5e-6) as the
# single-head run without positional encoding.
learner = SentimentLearner(
    model=single_head_pe,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-6
)

### Train model

In [42]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
single_head_pe_filename = 'single_head_pe.pt'
# Train for 20 epochs, checkpointing to the file above.
learner.train(epochs=20, filename=single_head_pe_filename)

Epoch : 01	Wall time : 24.271s
	Train Loss: 0.692 | Train Acc: 51.99%
	Valid Loss: 0.693 | Valid Acc: 50.92%
	Model parameters saved to single_head_pe.pt
Epoch : 02	Wall time : 24.432s
	Train Loss: 0.682 | Train Acc: 55.78%
	Valid Loss: 0.690 | Valid Acc: 50.92%
	Model parameters saved to single_head_pe.pt
Epoch : 03	Wall time : 24.246s
	Train Loss: 0.676 | Train Acc: 55.80%
	Valid Loss: 0.680 | Valid Acc: 51.03%
	Model parameters saved to single_head_pe.pt
Epoch : 04	Wall time : 24.688s
	Train Loss: 0.666 | Train Acc: 56.63%
	Valid Loss: 0.663 | Valid Acc: 55.39%
	Model parameters saved to single_head_pe.pt
Epoch : 05	Wall time : 24.238s
	Train Loss: 0.645 | Train Acc: 61.42%
	Valid Loss: 0.632 | Valid Acc: 67.32%
	Model parameters saved to single_head_pe.pt
Epoch : 06	Wall time : 24.422s
	Train Loss: 0.612 | Train Acc: 67.98%
	Valid Loss: 0.589 | Valid Acc: 72.36%
	Model parameters saved to single_head_pe.pt
Epoch : 07	Wall time : 24.326s
	Train Loss: 0.567 | Train Acc: 73.71%
	Valid

### Load best model to evaluate

In [43]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(single_head_pe_filename)

In [44]:
# Score the reloaded single-head + positional-encoding model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.418 | Test Acc: 82.10%


# Transformer style attention model
## w/ multiple attention heads

### Load model and learner

In [45]:
# Multi-head self-attention model with four heads.
multi_head = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=4,
    embed_dropout=EMBED_DROPOUT,
)

In [46]:
# Rebinds `learner` to wrap `multi_head`; same settings (lr=5e-6) as the
# single-head runs.
learner = SentimentLearner(
    model=multi_head,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-6
)

### Train model

In [47]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
multi_head_filename = 'multi_head.pt'
# Train for 20 epochs, checkpointing to the file above.
learner.train(epochs=20, filename=multi_head_filename)

Epoch : 01	Wall time : 23.690s
	Train Loss: 0.752 | Train Acc: 44.22%
	Valid Loss: 0.695 | Valid Acc: 49.08%
	Model parameters saved to multi_head.pt
Epoch : 02	Wall time : 23.422s
	Train Loss: 0.691 | Train Acc: 53.17%
	Valid Loss: 0.679 | Valid Acc: 51.61%
	Model parameters saved to multi_head.pt
Epoch : 03	Wall time : 23.419s
	Train Loss: 0.666 | Train Acc: 56.72%
	Valid Loss: 0.658 | Valid Acc: 53.44%
	Model parameters saved to multi_head.pt
Epoch : 04	Wall time : 23.237s
	Train Loss: 0.632 | Train Acc: 64.99%
	Valid Loss: 0.607 | Valid Acc: 69.61%
	Model parameters saved to multi_head.pt
Epoch : 05	Wall time : 23.723s
	Train Loss: 0.581 | Train Acc: 75.47%
	Valid Loss: 0.556 | Valid Acc: 75.80%
	Model parameters saved to multi_head.pt
Epoch : 06	Wall time : 23.481s
	Train Loss: 0.528 | Train Acc: 80.23%
	Valid Loss: 0.513 | Valid Acc: 77.29%
	Model parameters saved to multi_head.pt
Epoch : 07	Wall time : 23.563s
	Train Loss: 0.480 | Train Acc: 83.10%
	Valid Loss: 0.480 | Valid Acc

### Load best model to evaluate

In [48]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(multi_head_filename)

In [49]:
# Score the reloaded four-head model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.405 | Test Acc: 82.98%


# Transformer style attention model
## w/ multiple attention heads & positional encoding

### Load model and learner

In [50]:
# Four-head configuration with pos_encode=True.
multi_head_pe = MultiHeadSelfAttentionModel(
    len(vocab),
    model_dim=EMBED_DIM,
    num_heads=4,
    pos_encode=True,
    embed_dropout=EMBED_DROPOUT,
)

In [51]:
# Rebinds `learner` to wrap `multi_head_pe`; same settings (lr=5e-6) as the
# other multi-head self-attention runs.
learner = SentimentLearner(
    model=multi_head_pe,
    train_loader=train_loader,
    valid_loader=valid_loader,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=5e-6
)

### Train model

In [52]:
# Checkpoint path; the recorded training log reports parameters being saved to it.
multi_head_pe_filename = 'multi_head_pe.pt'
# Train for 20 epochs, checkpointing to the file above.
learner.train(epochs=20, filename=multi_head_pe_filename)

Epoch : 01	Wall time : 24.191s
	Train Loss: 0.689 | Train Acc: 53.21%
	Valid Loss: 0.694 | Valid Acc: 50.92%
	Model parameters saved to multi_head_pe.pt
Epoch : 02	Wall time : 24.103s
	Train Loss: 0.682 | Train Acc: 55.78%
	Valid Loss: 0.690 | Valid Acc: 50.92%
	Model parameters saved to multi_head_pe.pt
Epoch : 03	Wall time : 24.106s
	Train Loss: 0.677 | Train Acc: 55.80%
	Valid Loss: 0.682 | Valid Acc: 50.92%
	Model parameters saved to multi_head_pe.pt
Epoch : 04	Wall time : 23.999s
	Train Loss: 0.666 | Train Acc: 56.70%
	Valid Loss: 0.661 | Valid Acc: 56.88%
	Model parameters saved to multi_head_pe.pt
Epoch : 05	Wall time : 24.055s
	Train Loss: 0.642 | Train Acc: 62.53%
	Valid Loss: 0.626 | Valid Acc: 69.38%
	Model parameters saved to multi_head_pe.pt
Epoch : 06	Wall time : 24.101s
	Train Loss: 0.603 | Train Acc: 69.32%
	Valid Loss: 0.580 | Valid Acc: 73.74%
	Model parameters saved to multi_head_pe.pt
Epoch : 07	Wall time : 24.119s
	Train Loss: 0.555 | Train Acc: 75.31%
	Valid Loss:

### Load best model to evaluate

In [53]:
# Reload the parameters written to the checkpoint during training
# (presumably the best-validation ones, per the section heading — confirm in learner.py).
learner.load_model_params(multi_head_pe_filename)

In [54]:
# Score the reloaded four-head + positional-encoding model on the test loader.
test_loss, test_acc = learner.evaluate(test_loader)
# Accuracy is scaled by 100 here so it prints as a percentage.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.410 | Test Acc: 82.43%
