# Use Bi-LSTM  Realize Sentiment Analysis

In [1]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, datasets
from torchtext.vocab import GloVe

## config

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

RANDOM_SEED = 2020
MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 128
EMBEDDING_DIM = 100
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

## load dataset

In [3]:
# define datatype
# method 1
# python -m spacy download en
# method 2
# step 1: manual download from https://github-production-release-asset-2e65be.s3.amazonaws.com/84940268/69ded28e-c3ef-11e7-94dc-d5b03d9597d8?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201214%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201214T064457Z&X-Amz-Expires=300&X-Amz-Signature=631b41e8491a84dfb7c492f336d728f116a04f677c33cf709dd719d5cf4c126f&X-Amz-SignedHeaders=host&actor_id=26615837&key_id=0&repo_id=84940268&response-content-disposition=attachment%3B%20filename%3Den_core_web_sm-2.0.0.tar.gz&response-content-type=application%2Foctet-stream
# step 2: remove to /home/alex/anaconda3/envs/pytorch/lib/python3.6/site-packages/spacy/data
# step 3: $ pip install en_core_web_sm-2.0.0.tar.gz
# step 4: $ spacy link en_core_web_sm en
TEXT = data.Field(tokenize='spacy', include_lengths=True)
LABEL = data.LabelField(sequential=False, dtype=torch.float32)



### get train val test data

In [4]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root='../Dataset/IMDB')

print(train_data.fileds)
print(train_data.examples[0])

train_data, val_data = train_data.split(random_state = random.seed(RANDOM_SEED))

print('Number of train data {}'.format(len(train_data)))
print('Number of val data {}'.format(len(val_data)))
print('Number of val data {}'.format(len(test_data)))



<generator object Dataset.__getattr__ at 0x7f363e500780>
<torchtext.data.example.Example object at 0x7f363e461710>
Number of train data 17500
Number of val data 7500
Number of val data 25000


### initial vocabulary with GLOVE model

In [5]:
TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors=GloVe(name='6B', dim=EMBEDDING_DIM, cache='../Dataset/GloVe'),
                 min_freq=0)

LABEL.build_vocab(train_data)
print('Unique token in Text vocabulary: {}'.format(len(TEXT.vocab))) # 250002(<unk>, <pad>)
print(TEXT.vocab.itos)
print('Unique token in LABEL vocabulary: {}'.format(len(LABEL.vocab)))
print(LABEL.vocab.itos)

Unique token in Text vocabulary: 25002
Unique token in LABEL vocabulary: 2
['neg', 'pos']


In [6]:
print('Top 20 frequency of word: \n {}'.format(TEXT.vocab.freqs.most_common(20)))
print('Embedding shape: {}'.format(TEXT.vocab.vectors.shape))

Top 20 frequency of word: 
 [('the', 203635), (',', 193389), ('.', 165948), ('and', 109975), ('a', 109323), ('of', 101316), ('to', 94212), ('is', 76447), ('in', 61446), ('I', 54428), ('it', 53487), ('that', 49711), ('"', 44944), ("'s", 43478), ('this', 42716), ('-', 37299), ('/><br', 35787), ('was', 35162), ('as', 30684), ('with', 29983)]
Embedding shape: torch.Size([25002, 100])


## geenrator dataloader

In [7]:
train_iter = data.BucketIterator(train_data, batch_size=BATCH_SIZE, device=device, shuffle=True)
val_iter, test_iter = data.BucketIterator.splits((val_data, test_data), batch_size=BATCH_SIZE, device=device,
                                          sort_within_batch=True)

for batch_data in train_iter:
    print(batch_data.text[0]) # content
    print(batch_data.text[1]) # length
    print(batch_data.label)
    break



tensor([[ 482,   66,   25,  ...,   66, 4472,   50],
        [   3,   22,  105,  ...,    9, 6127,    0],
        [   2,    9, 1565,  ...,   43,  426,   50],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
tensor([ 256,  140, 1017,  327,  146,  142,  189,  315,  242,  223,  133,  138,
         100,  100,   79,  229,  166,  299,  188,  134,  412,   81,  180,  407,
         290,  409,  352,  257,  146,  455,  216,  229,  178,  547,  177,  329,
         348,  137,   95,  179,  371,  178, 1113,  171,  900,  636,  178,  139,
         109,  509,   98,  203,  175,  135,  295,  156,  721,  157,   86,  577,
         113,  140,  145,  121,  147,  153,   93,  156,  406,  613,  157,  137,
         602,  373,  145,  210,  147,  133,  415,  205,  171,  171,  236,  783,
         689,   62,  204,  253,  404,   63,  155,   59,  153,  121,  137,   66,
         235,  39

## build model

In [8]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layer, pad_index,
                 bidirectional=False, dropout=0.5):
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim,
                                      padding_idx=pad_index)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=num_layer,
                            bidirectional=bidirectional,
                            dropout=dropout)

        if bidirectional:
            self.fc = nn.Linear(hidden_size * 2, output_size)
        else:
            self.fc = nn.Linear(hidden_size, output_size)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, text, text_length):
        """

        :param text: (seq_len, batch_size)
        :param text_length:
        :return:
        """
        # embedded => [seq_len, batch_size, embedding_dim]
        embedded = self.embedding(text)
        embedded = self.dropout(embedded)
        text_length = text_length.cpu()  # compatible torch=1.7.0
        # pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length, batch_first=False, enforce_sorted=False)

        # lstm
        # h_n => (num_direction * num_layers, batch_size, hidden_size)
        # c_n => (num_direction * num_layers, batch_size, hidden_size)
        packed_output, (h_n, c_n) = self.lstm(packed_embedded)

        # unpacked sequence
        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=False)

        # hidden => (batch_size, hidden_size*num_direction)
        hidden = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        hidden = self.dropout(hidden)

        out = self.fc(hidden)
        return out

In [9]:
# construct model
VOCAB_SIZE = len(TEXT.vocab)
HIDDEN_SIZE = 256
OUTPUT_SIZE = 1
NUM_LAYER = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_INDEX = TEXT.vocab.stoi[TEXT.pad_token]

model = BiLSTM(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE, output_size=OUTPUT_SIZE, 
               num_layer=NUM_LAYER, bidirectional=BIDIRECTIONAL, dropout=DROPOUT, pad_index=PAD_INDEX)