In [None]:
# Mount Google Drive into the Colab runtime; the dataset path used later in this
# notebook lives under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## 1. Import packages

In [None]:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

In [None]:
# Global configuration shared by the cells below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use the GPU when one is visible
batch_size = 32       # examples per batch for the train/valid/test iterators
epochs = 10           # upper bound on training epochs (early stopping may end sooner)
embedding_dim = 300   # size of the GloVe vectors requested when building the vocabulary
hidden_dim = 200      # LSTM hidden size per direction
max_seq_length = 64   # passed as fix_length to the TEXT field

## 2. Build dataloader and vocab

In [None]:
data_path = "/content/drive/My Drive/SST-2-sentiment-analysis/data/"

# Every split is a header-less TSV. The first column is named 'similarity'
# (consumed as the label) and the second 's1' (consumed as the sentence).
train_df, dev_df, test_df = [
    pd.read_csv(os.path.join(data_path, split_file), sep='\t', header=None, names=['similarity', 's1'])
    for split_file in ("train.tsv", "dev.tsv", "test.tsv")
]

# define Field
def tokenize(x):
    """Split a raw sentence on whitespace."""
    return x.split()

# TEXT: lower-cased, whitespace-tokenized, with fix_length set to max_seq_length.
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=max_seq_length)
# LABEL: a single value per example, used as-is (no vocabulary lookup).
LABEL = data.Field(sequential=False, use_vocab=False)

# get_dataset constructs and returns the examples and fields required by the Dataset
def get_dataset(csv_data, text_field, label_field, test=False):
    """Turn a table of sentences and labels into torchtext examples.

    Args:
        csv_data: mapping/DataFrame with an 's1' column (sentence text) and a
            'similarity' column (label).
        text_field: field applied to each sentence.
        label_field: field applied to each label.
        test: accepted for interface compatibility; not used by the body.

    Returns:
        (examples, fields): the list of built examples and the field spec
        needed to construct a ``data.Dataset`` from them.
    """
    # 'id' is declared with a None field, so that slot is ignored by torchtext.
    fields = [('id', None), ('sentence', text_field), ('label', label_field)]
    rows = zip(csv_data['s1'], csv_data['similarity'])
    examples = [
        data.Example.fromlist([None, sentence, target], fields)
        for sentence, target in tqdm(rows)
    ]
    return examples, fields

# Get the examples and fields needed to build the Dataset
# Get the examples and fields needed to build the Dataset
# (each call shows a tqdm counter of the rows it converts).
train_examples, train_fields = get_dataset(train_df, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(dev_df, TEXT, LABEL)
test_examples, test_fields = get_dataset(test_df, TEXT, LABEL)

# Build Dataset
# NOTE: the name `train` is rebound to the training function defined in a later
# cell, so this Dataset is only reachable under that name until that cell runs.
train = data.Dataset(train_examples, train_fields)
valid = data.Dataset(valid_examples, valid_fields)
test = data.Dataset(test_examples, test_fields)

6920it [00:00, 87534.71it/s]
872it [00:00, 76830.37it/s]
1821it [00:00, 22779.90it/s]


In [None]:
# build the vocabulary
# The text vocabulary comes from the training split only and is paired with
# pretrained GloVe 6B vectors of size `embedding_dim` (the cell output shows
# them being fetched into .vector_cache/).
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_dim))
# NOTE(review): LABEL was created with use_vocab=False, so this call looks
# unnecessary — confirm before removing.
LABEL.build_vocab(train)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                          
100%|█████████▉| 399817/400000 [00:38<00:00, 10676.96it/s]

In [None]:
from torchtext.data import BucketIterator

# make splits for data
# sort_key is the sentence length and sort_within_batch=True orders the examples
# of each batch by it; pack_padded_sequence expects length-sorted batches unless
# told otherwise.
train_iter, valid_iter = BucketIterator.splits(
        (train, valid), 
        batch_size=batch_size, 
        device=device, 
        sort_key=lambda x: len(x.sentence),
        sort_within_batch=True,
        repeat=False 
)

# The test iterator is built separately with shuffle=False — presumably so both
# models are evaluated and visualized on the same batches; TODO confirm.
test_iter = data.BucketIterator(dataset=test, batch_size=batch_size, device=device, 
        sort_key=lambda x: len(x.sentence), shuffle=False, sort_within_batch=True, repeat=False)

## 3. Define 2 types of BiLSTM_Attention Models

In [None]:
################################################################################
"""
B: batchsize
L: max_seq_length
H: hidden_dim
E: embedding_dim
"""
################################################################################

class SelfAttention_1(nn.Module):
    """Attention pooling whose per-timestep score comes from a small MLP.

    Each timestep vector is scored by Linear(hidden_dim, 64) -> ReLU ->
    Linear(64, 1); the scores are softmax-normalized over the sequence axis and
    used to take a weighted sum of the timestep vectors.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        scorer_layers = [
            nn.Linear(hidden_dim, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, 1),
        ]
        self.projection = nn.Sequential(*scorer_layers)

    def forward(self, encoder_outputs):
        """Pool ``encoder_outputs`` of shape (B, L, H).

        Returns:
            (outputs, weights): the pooled vectors (B, H) and the attention
            weights (B, L), which sum to 1 along L.
        """
        # (B, L, H) -> (B, L, 1) -> (B, L)
        scores = self.projection(encoder_outputs).squeeze(-1)
        weights = F.softmax(scores, dim=1)
        # (B, L, H) * (B, L, 1), summed over L -> (B, H)
        weighted_states = encoder_outputs * weights.unsqueeze(-1)
        return weighted_states.sum(dim=1), weights

class SelfAttention_2(nn.Module):
    """Attention pooling with a tanh projection and a learned scoring vector.

    Scores are ``tanh(h @ w_omega) @ u_omega``; both parameters are initialized
    uniformly in [-0.1, 0.1].
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.w_omega = nn.Parameter(torch.empty(hidden_dim, hidden_dim))
        self.u_omega = nn.Parameter(torch.empty(hidden_dim, 1))
        for weight in (self.w_omega, self.u_omega):
            nn.init.uniform_(weight, -0.1, 0.1)

    def forward(self, encoder_outputs):
        """Pool ``encoder_outputs`` of shape (B, L, H).

        Returns:
            (context, weights): the pooled vectors (B, H) and the attention
            weights (B, L), which sum to 1 along L.
        """
        # (B, L, H) . (H, H) -> (B, L, H)
        projected = torch.tanh(encoder_outputs @ self.w_omega)
        # (B, L, H) . (H, 1) -> (B, L, 1), normalized over the sequence axis
        att_weight = F.softmax(projected @ self.u_omega, dim=1)
        # (B, L, H) * (B, L, 1), summed over L -> (B, H)
        context = (encoder_outputs * att_weight).sum(dim=1)
        return context, att_weight.squeeze(-1)

class AttnClassifier(nn.Module):
    """Bidirectional-LSTM sentence classifier with attention pooling.

    Pipeline: embedding -> dropout -> BiLSTM over the packed sequence -> sum of
    the forward and backward outputs -> attention pooling over real (non-pad)
    timesteps -> linear layer producing one logit per sentence.

    Args:
        input_dim: vocabulary size (rows of the embedding table).
        embedding_dim: width of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        attention_type: 1 selects ``SelfAttention_1``, 2 selects
            ``SelfAttention_2``.

    Raises:
        ValueError: if ``attention_type`` is neither 1 nor 2.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, attention_type):
        super().__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(self.input_dim, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, bidirectional=True)
        # define the attention
        self.attention_type = attention_type
        if self.attention_type == 1:
            self.attention = SelfAttention_1(self.hidden_dim)
        elif self.attention_type == 2:
            self.attention = SelfAttention_2(self.hidden_dim)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Invalid attention_type!", attention_type)

        self.fc = nn.Linear(self.hidden_dim, 1)

    def set_embedding(self, vectors):
        """Overwrite the embedding table in place with pretrained ``vectors``."""
        self.embedding.weight.data.copy_(vectors)

    def dropout(self, v):
        """Apply dropout with p=0.5; a no-op when the module is in eval mode."""
        return F.dropout(v, p=0.5, training=self.training)

    def forward(self, inputs, lengths):
        """Classify a batch of padded token-id sequences.

        Args:
            inputs: token ids laid out as (L, B).
            lengths: true (unpadded) length of each of the B sequences, each
                at least 1. The batch does not have to be sorted by length.

        Returns:
            (outputs, attn_weights): logits of shape (B, 1) and attention
            weights of shape (B, L'), where L' is the longest length in the
            batch. Weights on padded positions are exactly zero and each row
            sums to 1.
        """
        batch_size = inputs.size(1)
        # (L, B) -> (L, B, E)
        embedded = self.dropout(self.embedding(inputs))
        # enforce_sorted=False lets callers pass batches in any order; the
        # original order is restored by pad_packed_sequence.
        packed_emb = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)
        out, _ = self.lstm(packed_emb)
        out, _ = nn.utils.rnn.pad_packed_sequence(out)
        # (L', B, 2H) -> (L', B, H): forward + backward
        out = out[:, :, :self.hidden_dim] + out[:, :, self.hidden_dim:]
        out = out.transpose(0, 1)  # (B, L', H)

        _, attn_weights = self.attention(out)  # (B, L')

        # The attention modules softmax over every timestep, including padding.
        # Zero the padded positions and renormalize, which is equivalent to a
        # softmax restricted to the real tokens; otherwise a sentence's logit
        # would depend on how much padding its batch happens to contain.
        seq_lens = torch.as_tensor(lengths, device=out.device)
        positions = torch.arange(out.size(1), device=out.device)
        valid = (positions.unsqueeze(0) < seq_lens.unsqueeze(1)).to(attn_weights.dtype)
        attn_weights = attn_weights * valid
        attn_weights = attn_weights / attn_weights.sum(dim=1, keepdim=True).clamp_min(1e-12)

        # (B, L', H) * (B, L', 1), summed over L' -> (B, H)
        context = (out * attn_weights.unsqueeze(-1)).sum(dim=1)
        outputs = self.fc(context.view(batch_size, -1))
        # (B, 1)
        return outputs, attn_weights

In [None]:
def get_length(x, pad_idx=1):
    """Return the number of non-padding tokens in each sequence of a batch.

    Args:
        x: 2-D tensor of token ids laid out as (L, B), i.e. one sequence per
            column.
        pad_idx: id of the padding token. Defaults to 1, the value this
            notebook has always counted as padding (torchtext's default
            vocabulary places ``<unk>`` at 0 and ``<pad>`` at 1).

    Returns:
        A list of B Python ints, one length per sequence.
    """
    # Count on-device and transfer only B integers, instead of converting the
    # whole batch to nested Python lists.
    return (x != pad_idx).sum(dim=0).tolist()

def train(train_iter, model, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        train_iter: sized iterable of batches exposing ``sentence`` and
            ``label``.
        model: callable as ``model(tokens, lengths)`` returning
            ``(logits, attention)``.
        optimizer: optimizer stepping the model parameters.
        criterion: loss taking flat logits and float labels.

    Returns:
        Sum of the batch losses divided by ``len(train_iter)``.
    """
    model.train()
    running_loss = 0
    progress = tqdm(total=len(train_iter))
    seen = 0
    for seen, batch in enumerate(train_iter, start=1):
        tokens, labels = batch.sentence, batch.label
        optimizer.zero_grad()
        logits, _ = model(tokens, get_length(tokens))
        loss = criterion(logits.view(-1), labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Refresh the bar only every 10 batches to keep the output short.
        if seen % 10 == 0:
            progress.update(10)
            progress.set_description('current loss:{:.4f}'.format(running_loss / seen))
    # Account for the batches processed since the last multiple of 10.
    progress.update(seen % 10)
    progress.close()
    return running_loss / len(train_iter)

In [None]:
def binary_accuracy(preds, y):
    """Fraction of logits whose rounded sigmoid equals the target.

    Args:
        preds: raw logits, one per example.
        y: float targets (0. or 1.) of the same shape.

    Returns:
        A scalar tensor: number of matches divided by ``len(preds)``.
    """
    # sigmoid maps logits to probabilities; rounding turns them into 0/1 labels
    probabilities = torch.sigmoid(preds)
    hits = torch.eq(torch.round(probabilities), y).float()
    return hits.sum() / len(hits)

def validate(model, valid_iter):
    """Return the model's accuracy over every example in ``valid_iter``.

    The model is switched to eval mode and run under ``torch.no_grad()`` so no
    autograd graph is built during evaluation. Accuracy is computed as
    correct / total over all examples rather than as a mean of per-batch
    accuracies, so a smaller final batch is not over-weighted.

    Args:
        model: callable as ``model(tokens, lengths)`` returning
            ``(logits, attention)``.
        valid_iter: iterable of batches exposing ``sentence`` and ``label``.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the iterator yields no examples.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch in valid_iter:
            x, y = batch.sentence, batch.label
            outputs, _ = model(x, get_length(x))
            # Same decision rule as binary_accuracy: round the sigmoid of the logit.
            predicted = torch.round(torch.sigmoid(outputs.view(-1)))
            n_correct += (predicted == y.float()).sum().item()
            n_seen += y.numel()
    return n_correct / n_seen if n_seen else 0.0

## 4. Model training and testing

In [None]:
def trian_validate(epoch, model, optimizer, criterion, train_iter, valid_iter, patience):
    """Train with early stopping on dev-set accuracy.

    Runs up to ``epoch`` epochs. After each one the model is scored on
    ``valid_iter``; an epoch whose accuracy is below the best seen so far
    increments a counter, any other epoch resets it and updates the best
    score. Training stops once the counter reaches ``patience``.

    Args:
        epoch: maximum number of epochs to run.
        model, optimizer, criterion, train_iter: forwarded to ``train``.
        valid_iter: forwarded to ``validate``.
        patience: number of consecutive non-improving epochs tolerated.
    """
    patience_counter = 0
    best_score = 0.0
    # Use the `epoch` argument: previously the loop variable shadowed it and the
    # range silently came from the notebook-level `epochs` global.
    for _ in range(epoch):
        train(train_iter, model, optimizer, criterion)
        dev_accuracy = validate(model, valid_iter)

        if dev_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = dev_accuracy
            patience_counter = 0

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break

### 4.1 The first attention model

In [None]:
# Classifier using attention_type=1 (SelfAttention_1); its embedding table is
# then overwritten with the pretrained vectors attached to the vocabulary.
model_1 = AttnClassifier(len(TEXT.vocab), embedding_dim, hidden_dim, attention_type=1).to(device)
model_1.set_embedding(TEXT.vocab.vectors)
optimizer = optim.Adam(model_1.parameters(), lr=1e-3, weight_decay=0)
# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss().to(device)
# Stop after one epoch whose dev accuracy falls below the best seen so far.
patience = 1

# train the model and stop when the accuracy of dev set is not improved
trian_validate(epochs, model_1, optimizer, criterion, train_iter, valid_iter, patience)


  0%|          | 0/217 [00:00<?, ?it/s][A
current loss:0.6842:   5%|▍         | 10/217 [00:00<00:01, 107.04it/s][A
current loss:0.6842:   9%|▉         | 20/217 [00:00<00:01, 109.00it/s][A
current loss:0.6800:   9%|▉         | 20/217 [00:00<00:01, 109.00it/s][A
current loss:0.6731:  14%|█▍        | 30/217 [00:00<00:01, 109.00it/s][A
current loss:0.6731:  18%|█▊        | 40/217 [00:00<00:01, 111.18it/s][A
current loss:0.6593:  18%|█▊        | 40/217 [00:00<00:01, 111.18it/s][A
current loss:0.6423:  23%|██▎       | 50/217 [00:00<00:01, 111.18it/s][A
current loss:0.6423:  28%|██▊       | 60/217 [00:00<00:01, 111.88it/s][A
current loss:0.6326:  28%|██▊       | 60/217 [00:00<00:01, 111.88it/s][A
current loss:0.6138:  32%|███▏      | 70/217 [00:00<00:01, 111.88it/s][A
current loss:0.6138:  37%|███▋      | 80/217 [00:00<00:01, 113.90it/s][A
current loss:0.6013:  37%|███▋      | 80/217 [00:00<00:01, 113.90it/s][A
current loss:0.5898:  41%|████▏     | 90/217 [00:00<00:01, 113.90it/

-> Early stopping: patience limit reached, stopping...





In [None]:
# test the model
# (validate() switches the model to eval mode and scores it on the test iterator)
print("test accuracy: {}".format(validate(model_1, test_iter)))

test accuracy: 0.8548094368817514


### 4.2 The second attention model

In [None]:
# Classifier using attention_type=2 (SelfAttention_2); its embedding table is
# then overwritten with the pretrained vectors attached to the vocabulary.
model_2 = AttnClassifier(len(TEXT.vocab), embedding_dim, hidden_dim, attention_type=2).to(device)
model_2.set_embedding(TEXT.vocab.vectors)
# A fresh optimizer is needed because it must track model_2's parameters.
optimizer = optim.Adam(model_2.parameters(), lr=1e-3, weight_decay=0)
# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss().to(device)
# Stop after one epoch whose dev accuracy falls below the best seen so far.
patience = 1

# train the model and stop when the accuracy of dev set is not improved
trian_validate(epochs, model_2, optimizer, criterion, train_iter, valid_iter, patience)


  0%|          | 0/217 [00:00<?, ?it/s][A
  5%|▍         | 10/217 [00:00<00:02, 87.84it/s][A
current loss:0.6857:   5%|▍         | 10/217 [00:00<00:02, 87.84it/s][A
current loss:0.6857:   9%|▉         | 20/217 [00:00<00:02, 89.26it/s][A
current loss:0.6752:   9%|▉         | 20/217 [00:00<00:02, 89.26it/s][A
current loss:0.6752:  14%|█▍        | 30/217 [00:00<00:02, 91.61it/s][A
current loss:0.6666:  14%|█▍        | 30/217 [00:00<00:02, 91.61it/s][A
current loss:0.6666:  18%|█▊        | 40/217 [00:00<00:01, 90.33it/s][A
current loss:0.6479:  18%|█▊        | 40/217 [00:00<00:01, 90.33it/s][A
current loss:0.6479:  23%|██▎       | 50/217 [00:00<00:01, 90.05it/s][A
current loss:0.6313:  23%|██▎       | 50/217 [00:00<00:01, 90.05it/s][A
current loss:0.6313:  28%|██▊       | 60/217 [00:00<00:01, 92.67it/s][A
current loss:0.6244:  28%|██▊       | 60/217 [00:00<00:01, 92.67it/s][A
current loss:0.6244:  32%|███▏      | 70/217 [00:00<00:01, 92.67it/s][A
current loss:0.6174:  32%|██

-> Early stopping: patience limit reached, stopping...


In [None]:
# test the model
# (validate() switches the model to eval mode and scores it on the test iterator)
print("test accuracy: {}".format(validate(model_2, test_iter)))

test accuracy: 0.850310042239072


## 5. Attention visualization

In [None]:
from IPython.display import HTML, display

def highlight(word, attn):
    """Wrap ``word`` in a ``<span>`` whose red background scales with ``attn``.

    Args:
        word: token text. It is HTML-escaped, so vocabulary specials such as
            ``<unk>`` or ``<pad>`` are shown literally instead of being parsed
            as (invisible) HTML tags.
        attn: attention weight; values outside [0, 1] are clamped to that range.

    Returns:
        An HTML snippet. Weight 0 gives a white background (#FFFFFF) and
        weight 1 gives pure red (#FF0000).
    """
    from html import escape  # local import keeps this cell self-contained

    # Clamp so a weight marginally outside [0, 1] cannot yield an invalid colour.
    attn = min(max(float(attn), 0.0), 1.0)
    fade = int(255 * (1 - attn))
    html_color = '#%02X%02X%02X' % (255, fade, fade)
    return '<span style="background-color: {}">{}</span>'.format(html_color, escape(word))

def mk_html(seq, attns):
    """Render a token-id sequence as HTML with per-token attention shading.

    Args:
        seq: iterable of vocabulary indices, looked up in ``TEXT.vocab.itos``.
        attns: iterable of attention weights paired with ``seq``; pairing stops
            at the shorter of the two.

    Returns:
        One space-prefixed highlighted span per token, followed by two line
        breaks and a newline.
    """
    spans = (
        ' ' + highlight(TEXT.vocab.itos[token_id], weight)
        for token_id, weight in zip(seq, attns)
    )
    return ''.join(spans) + "<br><br>\n"

def visualization(model, test_iter):
    """Show the attention heat-map of the first sentence of every batch.

    For each batch in ``test_iter`` this prints the gold label and the model's
    prediction for the batch's first example, then displays that sentence with
    each token shaded by its attention weight.

    Args:
        model: callable as ``model(tokens, lengths)`` returning
            ``(logits, attention_weights)``.
        test_iter: iterable of batches exposing ``sentence`` (L, B) and
            ``label`` (B,).
    """
    # Make sure dropout is off: the weights shown must be the ones used for the
    # prediction, whatever mode the model was left in.
    model.eval()
    with torch.no_grad():
        for batch in test_iter:
            x, y = batch.sentence, batch.label
            outputs, attn_weights = model(x, get_length(x))
            # Only the first example of each batch is rendered. (The previous
            # `for i in range(batch_size)` loop always broke after i == 0 and
            # needlessly depended on the global batch size.)
            prediction = int(torch.round(torch.sigmoid(outputs[0])).item())
            label = int(y[0].float().item())
            print("Label: {}, Prediction: {}".format(label, prediction))
            text = mk_html(x.t()[0].cpu().numpy(), attn_weights[0].cpu().numpy())
            display(HTML(text))

### 5.1 The first attention model

In [None]:
# Render one highlighted test sentence per batch using model_1's attention weights.
visualization(model_1, test_iter)

Label: 1, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 1


Label: 0, Prediction: 1


Label: 0, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 1


Label: 0, Prediction: 1


Label: 1, Prediction: 1


Label: 1, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


### 5.2 The second attention model

In [None]:
# Render one highlighted test sentence per batch using model_2's attention weights.
visualization(model_2, test_iter)

Label: 1, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 1


Label: 0, Prediction: 1


Label: 0, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 0, Prediction: 1


Label: 1, Prediction: 1


Label: 1, Prediction: 0


Label: 0, Prediction: 0


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 0, Prediction: 0


Label: 1, Prediction: 1


Label: 1, Prediction: 0


## 6. Conclusion
 

*   ### *The sentences in each batch are sorted by length, so the visualized outputs are sorted as well.* 
*   ### *The visualizations show that the BiLSTM attention models pick up on the key words that determine whether the sentiment of a sentence is positive.*
*   ### *We also found that the two BiLSTM attention models attend to different words when classifying the same sentence.*



