In [1]:
import bisect
from collections import defaultdict

import pandas as pd
import torch
from torch import nn
from torch.optim import Adam

from data import Vocabulary, get_dataloader
from learner import SentimentLearner
from models import AttentionWeightedWordAveragingModel, WordAveragingModel

# Seed PyTorch's global RNG up front so repeated Restart & Run All passes start from
# the same random state.
SEED = 41
torch.manual_seed(SEED)

# Binary cross-entropy computed on raw logits; this single instance is handed to both
# learners created later in the notebook.
loss_fn = nn.BCEWithLogitsLoss()

## Hyperparameters

In [2]:
# Batch size used for the train, dev and test loaders alike.
BATCH_SIZE = 256
# Embedding width and embedding dropout rate, passed to both models.
EMBED_DIM = 200
EMBED_DROPOUT = 0.5
# Optimizer class (not an instance) and learning rate, passed to both learners.
OPTIM_CLS = Adam
LR = 5e-4
# Number of epochs requested from each learner's train() call.
EPOCHS = 10

## Load data

In [3]:
%%time
# Build the vocabulary once; the same object is passed to every data loader and sizes
# both models. Timed because it is slow (about 33 s in the recorded run).
vocab = Vocabulary()

CPU times: user 33.3 s, sys: 0 ns, total: 33.3 s
Wall time: 33.4 s


In [4]:
# All split files share one naming template; '{}' is filled with the split name.
FILENAME = 'senti.{}.tsv'

# Build the loaders in train / dev / test order, all sharing the vocabulary and batch size.
train_loader, valid_loader, test_loader = [
    get_dataloader(FILENAME.format(split), vocab, batch_size=BATCH_SIZE)
    for split in ('train', 'dev', 'test')
]

# Word averaging model

## Load model and learner

In [5]:
# Word-averaging baseline, constructed with the vocabulary size and the embedding
# settings from the hyperparameter cell.
wam = WordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    embed_dropout=EMBED_DROPOUT,
)

In [6]:
# Bundle the baseline model with its data, loss and optimizer settings.
wam_learner = SentimentLearner(
    model=wam,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=LR,
    train_loader=train_loader,
    valid_loader=valid_loader,
)

## Train model

In [7]:
# Checkpoint path for the word-averaging model; the same name is used to reload it
# before test evaluation.
wam_filename = 'word_avg.pt'
# NOTE(review): the recorded logs suggest parameters are written to `filename` only on
# epochs where validation loss improves — confirm in SentimentLearner.train.
wam_learner.train(epochs=EPOCHS, filename=wam_filename)

Epoch: 01  Wall time: 16.012s
	Train Loss: 0.676 | Train Acc: 60.81%
	Valid Loss: 0.634 | Valid Acc: 67.30%
	Model parameters saved to word_avg.pt
Epoch: 02  Wall time: 15.943s
	Train Loss: 0.592 | Train Acc: 77.70%
	Valid Loss: 0.538 | Valid Acc: 75.35%
	Model parameters saved to word_avg.pt
Epoch: 03  Wall time: 16.268s
	Train Loss: 0.499 | Train Acc: 83.31%
	Valid Loss: 0.485 | Valid Acc: 78.13%
	Model parameters saved to word_avg.pt
Epoch: 04  Wall time: 15.757s
	Train Loss: 0.426 | Train Acc: 86.26%
	Valid Loss: 0.446 | Valid Acc: 80.79%
	Model parameters saved to word_avg.pt
Epoch: 05  Wall time: 15.863s
	Train Loss: 0.376 | Train Acc: 88.03%
	Valid Loss: 0.427 | Valid Acc: 81.17%
	Model parameters saved to word_avg.pt
Epoch: 06  Wall time: 16.203s
	Train Loss: 0.338 | Train Acc: 89.19%
	Valid Loss: 0.416 | Valid Acc: 81.33%
	Model parameters saved to word_avg.pt
Epoch: 07  Wall time: 16.304s
	Train Loss: 0.308 | Train Acc: 90.00%
	Valid Loss: 0.411 | Valid Acc: 82.48%
	Model par

## Load best model to evaluate

In [8]:
# Restore the parameters that training saved under wam_filename
# (presumably the best-validation checkpoint — verify against SentimentLearner).
wam_learner.load_model_params(wam_filename)

In [9]:
# Score the reloaded model on the held-out test loader.
test_loss, test_acc = wam_learner.evaluate(test_loader)
# The accuracy is scaled by 100 for display, so evaluate() apparently returns a fraction.
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.436 | Test Acc: 82.81%


## Norm of word embeddings

In [10]:
# Norm taken along dim=1, i.e. one value per row of the embedding matrix
# (assumes word_embedding is a 2-D tensor with one row per vocabulary entry — TODO confirm).
embedding_norms = torch.linalg.norm(wam_learner.model.word_embedding, dim=1)
# Label every norm with its vocabulary word and order from smallest to largest.
norms = pd.Series(embedding_norms, index=vocab.itos).sort_values()

In [11]:
# The series is sorted ascending, so the tail holds the 15 largest embedding norms.
norms.tail(15)

beautifully    5.453632
best           5.456155
dull           5.578378
touching       5.626331
stupid         5.667773
terrific       5.712662
flat           5.726692
remarkable     5.743237
enjoyable      5.775650
hilarious      5.823424
powerful       5.900732
solid          5.944018
mess           6.177738
bad            6.579552
worst          7.167939
dtype: float32

In [12]:
# ...and the head holds the 15 smallest embedding norms.
norms.head(15)

<pad>        0.000000
<unk>        0.020988
alfonso      0.161497
strategy     0.173768
jeong        0.176887
jae-eun      0.177959
2/3          0.178415
mikes        0.178877
liman        0.180664
boom         0.182392
summary      0.186733
opts         0.187388
xiaoshuai    0.187785
clubs        0.188298
plate        0.190157
dtype: float32

# Attention weighted word averaging model

## Load model and learner

In [13]:
# Attention-weighted variant, built with the same vocabulary size and embedding
# settings as the baseline so the two runs are comparable.
awwam = AttentionWeightedWordAveragingModel(
    len(vocab),
    embed_dim=EMBED_DIM,
    embed_dropout=EMBED_DROPOUT,
)

In [14]:
# Same data, loss and optimizer settings as the baseline learner; only the model differs.
awwam_learner = SentimentLearner(
    model=awwam,
    loss_fn=loss_fn,
    optim_cls=OPTIM_CLS,
    lr=LR,
    train_loader=train_loader,
    valid_loader=valid_loader,
)

## Train model

In [15]:
# Separate checkpoint file so the baseline's saved parameters are not overwritten.
awwam_filename = 'atten_weighted_word_avg.pt'
# NOTE(review): in the recorded log no save message appears on epochs where validation
# loss did not improve, so the file presumably keeps the best epoch — confirm in train().
awwam_learner.train(epochs=EPOCHS, filename=awwam_filename)

Epoch: 01  Wall time: 16.366s
	Train Loss: 0.671 | Train Acc: 59.74%
	Valid Loss: 0.616 | Valid Acc: 71.09%
	Model parameters saved to atten_weighted_word_avg.pt
Epoch: 02  Wall time: 16.262s
	Train Loss: 0.580 | Train Acc: 76.86%
	Valid Loss: 0.508 | Valid Acc: 76.98%
	Model parameters saved to atten_weighted_word_avg.pt
Epoch: 03  Wall time: 16.399s
	Train Loss: 0.483 | Train Acc: 83.03%
	Valid Loss: 0.463 | Valid Acc: 79.00%
	Model parameters saved to atten_weighted_word_avg.pt
Epoch: 04  Wall time: 16.476s
	Train Loss: 0.413 | Train Acc: 86.27%
	Valid Loss: 0.422 | Valid Acc: 81.99%
	Model parameters saved to atten_weighted_word_avg.pt
Epoch: 05  Wall time: 16.386s
	Train Loss: 0.363 | Train Acc: 88.15%
	Valid Loss: 0.434 | Valid Acc: 80.45%

Epoch: 06  Wall time: 16.481s
	Train Loss: 0.326 | Train Acc: 89.53%
	Valid Loss: 0.446 | Valid Acc: 81.61%

Epoch: 07  Wall time: 16.248s
	Train Loss: 0.296 | Train Acc: 90.36%
	Valid Loss: 0.437 | Valid Acc: 81.72%

Epoch: 08  Wall time: 16.

## Load best model to evaluate

In [16]:
# Restore the parameters that training saved under awwam_filename
# (presumably the best-validation checkpoint — verify against SentimentLearner).
awwam_learner.load_model_params(awwam_filename)

In [17]:
# Score the reloaded attention model on the held-out test loader.
# Note: this rebinds test_loss / test_acc from the baseline's evaluation cell.
test_loss, test_acc = awwam_learner.evaluate(test_loader)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

	 Test Loss: 0.417 | Test Acc: 81.15%


## Cosine similarities between vector u and word embeddings

In [18]:
# One similarity value per vocabulary entry, as exposed by the model
# (presumably cosine(u, embedding) for each word — verify in the model class).
similarity_to_u = awwam_learner.model.cosine_similarity_to_u
# Label each value with its word and order from lowest to highest similarity.
cosine_similarities = pd.Series(similarity_to_u, index=vocab.itos).sort_values()

In [19]:
# Sorted ascending, so the tail lists the 15 words with the highest similarity to u.
cosine_similarities.tail(15)

underbelly           0.725700
morning              0.726054
kaige                0.726750
irreverent           0.729360
exit                 0.731163
kjell                0.731713
foul                 0.731957
sanctimonious        0.732995
mcadams              0.741316
élan                 0.746318
lack-of-attention    0.753965
mountain             0.755889
inc.                 0.760517
detract              0.775119
buñuel               0.785297
dtype: float32

In [20]:
# The head lists the 15 words with the lowest (most negative) similarity to u.
cosine_similarities.head(15)

the      -0.993271
of       -0.992350
is       -0.991175
in       -0.989943
's       -0.988395
its      -0.987115
abroad   -0.987112
.        -0.986810
--       -0.985075
your     -0.983277
for      -0.982622
hubert   -0.982240
clubs    -0.980168
junior   -0.975402
...      -0.973906
dtype: float32

## Attention variance among frequent words in the training set

In [21]:
# Frequency cut-off: a word counts as "frequent" when its count exceeds MAX_FREQ.
MAX_FREQ = 100
# bisect_right on the reversed frequency list counts the entries <= MAX_FREQ, so the
# difference is the number of entries strictly above the cut-off. Token indices below
# upper_bound are then treated as the frequent words.
# NOTE(review): assumes vocab.freqs is sorted in descending order and aligned with the
# vocabulary indices — bisect silently gives a wrong answer otherwise; confirm in Vocabulary.
upper_bound = len(vocab) - bisect.bisect_right(vocab.freqs[::-1], MAX_FREQ)

@torch.no_grad()
def get_attention_stats(learner, loader=None, max_index=None):
    """Collect the attention value given to every occurrence of each frequent word.

    Args:
        learner: Object exposing ``model`` (called as ``model(sequences, True)`` to
            obtain per-token attention values) and ``device``.
        loader: Iterable of ``(sequences, labels)`` batches. Defaults to the
            notebook-level ``train_loader``.
        max_index: Only token indices strictly below this value are recorded.
            Defaults to the notebook-level ``upper_bound``.

    Returns:
        ``defaultdict(list)`` mapping token index -> list of attention values
        (Python floats), one entry per recorded occurrence.
    """
    if loader is None:
        loader = train_loader
    if max_index is None:
        max_index = upper_bound

    model = learner.model
    # Run the pass in eval mode so dropout cannot perturb the statistics, whatever mode
    # the model was left in by earlier cells; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()

    attentions = defaultdict(list)
    try:
        for sequences, _ in loader:
            sequences = sequences.to(learner.device)

            # Keep tokens below the frequency cut-off, excluding index 0
            # (presumably the padding token — verify against Vocabulary).
            mask = (sequences < max_index) & (sequences != 0)
            attention = model(sequences, True)

            kept_tokens = torch.masked_select(sequences, mask).tolist()
            kept_attention = torch.masked_select(attention, mask).tolist()
            for token, value in zip(kept_tokens, kept_attention):
                attentions[token].append(value)
    finally:
        model.train(was_training)

    return attentions

In [22]:
%%time
# Gather per-word attention values from the trained attention model
# (about 15 s in the recorded run).
stats = get_attention_stats(awwam_learner)

CPU times: user 15.5 s, sys: 60.1 ms, total: 15.6 s
Wall time: 15.5 s


In [23]:
# Summarise each word's collected attention values. Rows are gathered in a plain list and
# the frame is built once: DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0, and appending row by row copies the whole frame on every iteration.
records = []
for word_index, values in stats.items():
    attentions = torch.tensor(values)
    records.append({
        'word': vocab.itos[word_index],
        'mean': attentions.mean().item(),
        # Sample standard deviation; NaN for a word that was observed only once.
        'std': attentions.std().item(),
    })
df = pd.DataFrame(records, columns=['word', 'mean', 'std'])

In [24]:
# Order words from the most to the least stable attention (ascending std) and key the
# table by word.
# NOTE(review): this rebinds df, so the cell fails if run a second time ('word' is then
# the index, no longer a column); re-run the cell that builds df first.
df = df.sort_values('std', ascending=True).set_index('word', drop=True)
df.head(30)

Unnamed: 0_level_0,mean,std
word,Unnamed: 1_level_1,Unnamed: 2_level_1
insight,0.010037,0.001287
your,0.009677,0.001345
high,0.010509,0.001368
made,0.01017,0.001415
character,0.010289,0.001418
very,0.010431,0.001419
seen,0.01027,0.001423
matter,0.010248,0.001424
effort,0.010266,0.001439
takes,0.01047,0.001442
