In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable

from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from mosestokenizer import *
from sacremoses import MosesTokenizer
from html import unescape
from tqdm import tqdm

device = torch.device('cuda') if torch.cuda.is_available else torch.device('cpu')
torch.cuda.manual_seed(42);
torch.manual_seed(42);

In [2]:
df = pd.read_csv('./lab7/TrainData/train.csv')

In [3]:
df.loc[4]['text']

"You probably all already know this by now, but 5 additional episodes never aired can be viewed on ABC.com I've watched a lot of television over the years and this is possibly my favorite show, ever. It's a crime that this beautifully written and acted show was canceled. The actors that played Laura, Whit, Carlos, Mae, Damian, Anya and omg, Steven Caseman - are all incredible and so natural in those roles. Even the kids are great. Wonderful show. So sad that it's gone. Of course I wonder about the reasons it was canceled. There is no way I'll let myself believe that Ms. Moynahan's pregnancy had anything to do with it. It was in the perfect time slot in this market. I've watched all the episodes again on ABC.com - I hope they all come out on DVD some day. Thanks for reading."

Введем следующие правила для работы с текстом:
1. Начало каждого примера - xxbos. Конец примера - xxeos.
2. Конец предложения внутри примера будет обозначаться - xxsep.
2. Загравные буквы обозначаем - xxmaj. 
3. Цитаты - xxquo.
4. Будет использоваться Moses tokenizer.
5. html.unescape() используется для работы со специальными символами.
6. xxunk - неизвестные слова в тестовой выборке данных.
7. xxpad - для обозначения отступов.

In [4]:
moses = MosesTokenizer()

def process(s, join=True):
#     tokens = MosesTokenizer(s)
    tokens = [unescape(t) for t in moses.tokenize(s)]
    n_tokens = []
    for i, token in enumerate(tokens):
        if i == 0:
            n_tokens.append('xxbos')
            n_tokens.append(token.lower().replace('.', ''))
        else:
            if token == '.':
                n_tokens.append('xxsep')
            elif token[0] == '\'' or token[0] == '"':
                n_tokens.append('xxquo')
            elif token[0].isupper():
                n_tokens.append('xxmaj')
                n_tokens.append(token.lower().replace('.', ''))
            else:
                n_tokens.append(token.lower().replace('.', ''))
    n_tokens.append('xxeos')
    
    if join:
        return " ".join(n_tokens)
    return n_tokens

In [5]:

# Process all the text
li = []
for text in tqdm(list(df['text'])):
    li.append(process(text))
y = list(df['sentiment'])
    
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(li, y, random_state=42)

100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [01:15<00:00, 331.62it/s]


Чтобы конверитровать слова в индексы будем использовать словарь word2idx. И словарь idx2word - для обратной конвертации.

In [6]:
vocab = ['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxmaj', 'xxsep', 'xxquo']
word2idx = {vocab[i]:i for i in range(7)}
vocab = set(vocab)
i = 7

# Build the vocabulary lookups
for line in tqdm(X_train):
    tokens = line.split()
    for token in tokens:
        if token not in vocab:
            vocab.add(token)
            word2idx[token] = i
            i += 1
            
# Reverse the lookup table
idx2word = {i:  word for i, word in enumerate(word2idx.keys())}

# Get the vocabulary size and indicate the classes
vocab_sz = len(vocab)
classes = ['positive', 'negative']

100%|█████████████████████████████████████████████████████████████████████████| 18750/18750 [00:00<00:00, 21365.02it/s]


Выполняем серилализицию для конвертации текста в индексы.

In [7]:
def serialize(sentence):
    return torch.LongTensor([word2idx[token] if token in vocab else word2idx['xxunk'] for token in sentence.split()])

# Serialize all data
X_train = [serialize(s) for s in tqdm(X_train)]
X_test = [serialize(s) for s in tqdm(X_test)]

100%|█████████████████████████████████████████████████████████████████████████| 18750/18750 [00:01<00:00, 13291.49it/s]
100%|███████████████████████████████████████████████████████████████████████████| 6250/6250 [00:00<00:00, 12738.46it/s]


X_train и X_test представляют собой список тензеров, каждый содержащий последовательность индексов. Каждый индекс соответствует слову в словаре данных. Например:

In [8]:
X_train[0]

tensor([ 2,  7,  4,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
        18, 23,  5,  4, 24,  4, 25,  9, 11, 26, 27, 14, 28, 29, 30, 11, 31, 32,
        33, 34, 35, 36, 37,  5,  4, 38, 14,  4, 39,  6, 11, 40, 41, 14, 33, 42,
         4, 39,  6, 43,  5,  4, 16, 14, 44, 45, 46, 47, 14, 48, 49, 14, 50, 15,
        51, 52, 53,  5,  4, 54,  6, 55, 56, 54,  5,  3])

Реализуем разделение на мини-батчи:

In [9]:
def make_batches(x, y, padding_value, bs=32, random_state=-1):
    size = len(x)
    batch_size = bs
    if random_state > -1:
        np.random.seed(random_state)
    perm = np.random.permutation(size) # Shuffles list of indices
    
    iterator = []
    
    for i in range(0, size, batch_size):
        batch_idx = perm[i:i+batch_size] # Batches a number of indices equal to bs
        x_ = [x[i] for i in batch_idx]
        y_ = [y[i] for i in batch_idx]
        
        # Sort x based on length in descending order
        x_, y_ = zip(*sorted(zip(x_, y_), key=lambda b: len(b[0]), reverse=True))
        
        # Convert to tensors, and padd the sequences
        l_ = torch.IntTensor([len(b) for b in x_])
        x_ = rnn_utils.pad_sequence(x_, batch_first=True, padding_value=padding_value).t()
        y_ = torch.LongTensor(y_)
    
        iterator.append((x_, y_, l_))
    
    return iterator

In [10]:
batch_size = 64
padding_value = word2idx['xxpad']

train_loader = make_batches(X_train, y_train, padding_value=padding_value, bs=batch_size, random_state=42)
test_loader =  make_batches(X_test, y_test, padding_value=padding_value, bs=batch_size, random_state=42)

Пример батча для обучающей выборки:

In [11]:
x, y, l = train_loader[0]

print(x)
print("x shape:", x.shape)
print(y)
print("y shape:", y.shape)
print(l)
print("l shape:", l.shape)

tensor([[    2,     2,     2,  ...,     2,     2,     2],
        [   11,    44, 15130,  ...,    39,    39,  3730],
        [  584,    45,    36,  ...,   326,   672,    18],
        ...,
        [ 1806,     1,     1,  ...,     1,     1,     1],
        [  386,     1,     1,  ...,     1,     1,     1],
        [    3,     1,     1,  ...,     1,     1,     1]])
x shape: torch.Size([949, 64])
tensor([0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
        1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1])
y shape: torch.Size([64])
tensor([949, 925, 805, 751, 676, 596, 576, 569, 557, 506, 443, 436, 435, 430,
        385, 383, 356, 353, 336, 333, 315, 307, 294, 285, 275, 270, 267, 264,
        258, 256, 254, 250, 244, 241, 233, 224, 222, 219, 211, 205, 204, 193,
        193, 192, 191, 183, 183, 180, 179, 179, 173, 171, 169, 165, 157, 156,
        156, 145, 141, 111,  92,  72,  62,  60], dty

In [12]:
embedding = nn.Embedding(vocab_sz, 300)

In [13]:
out = embedding(x)
print("Output shape:", out.shape)
print(out)

Output shape: torch.Size([949, 64, 300])
tensor([[[ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
         [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
         [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
         ...,
         [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
         [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
         [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225]],

        [[-0.7104, -0.2730,  0.8718,  ..., -0.2636,  0.9189,  0.1502],
         [ 1.2626, -2.6988,  1.1426,  ..., -0.6646,  0.6527, -0.9550],
         [ 1.2716, -0.7555,  1.3774,  ...,  0.7194,  0.6065,  0.4343],
         ...,
         [ 0.9046,  1.0088,  0.7888,  ..., -1.8062,  0.6957, -0.5605],
         [ 0.9046,  1.0088,  0.7888,  ..., -1.8062,  0.6957, -0.5605],
         [-1.7563,  1.2177, -0.6656,  ...,  0.6533, -0.4488, -0.6777]],

        [[-0.9203, -1.3010, -1.6578,  ...,  1.6093, -0.6787,  1.1509],
    

In [14]:
print("Shape of the first item in the output:", out[0].shape)
print(out[0])

Shape of the first item in the output: torch.Size([64, 300])
tensor([[ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        ...,
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225]],
       grad_fn=<SelectBackward>)


In [15]:
rnn = nn.LSTM(300, 128)

In [16]:
(hidden, cell) = torch.zeros(1, 64, 128), torch.zeros(1, 64, 128)
out = embedding(x)
print("Embedded output shape:", out.shape)

out = rnn_utils.pack_padded_sequence(out, l)
print("Packed output shape:", out.data.shape)

out, (hidden, cell) = rnn(out, (hidden, cell))

Embedded output shape: torch.Size([949, 64, 300])
Packed output shape: torch.Size([19601, 300])


In [17]:
hidden.shape

torch.Size([1, 64, 128])

In [18]:
out = hidden[-1, :, :]
print("Final timestep shape:", out.shape)
print(out)

Final timestep shape: torch.Size([64, 128])
tensor([[ 0.0630,  0.0918,  0.0179,  ...,  0.0378,  0.0810,  0.2163],
        [-0.0101,  0.0880,  0.0519,  ...,  0.0954,  0.0803,  0.2292],
        [ 0.0385,  0.2314,  0.0846,  ...,  0.0659,  0.1056,  0.0205],
        ...,
        [ 0.0256,  0.1860,  0.0535,  ...,  0.0956,  0.1263, -0.0291],
        [ 0.0131,  0.1697,  0.0570,  ...,  0.0898,  0.0796,  0.0112],
        [ 0.0392,  0.0518,  0.0605,  ...,  0.0416,  0.0479,  0.3196]],
       grad_fn=<SliceBackward>)


In [19]:
fc = nn.Linear(128, 2) # Initialize with 128 units and 2 output units
out = torch.softmax(fc(out), dim=1)
print("Shape of output:", out.shape)
print(out[:5]) # Print the first five probabilities

Shape of output: torch.Size([64, 2])
tensor([[0.4619, 0.5381],
        [0.4756, 0.5244],
        [0.4783, 0.5217],
        [0.4659, 0.5341],
        [0.4530, 0.5470]], grad_fn=<SliceBackward>)


Классификационная модель LSTM

In [20]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_sz, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_sz, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim
        
    def init_hidden(self, bs):
        return torch.zeros(1, bs, self.hidden_dim), torch.zeros(1, bs, self.hidden_dim)
        
    def forward(self, X, hidden, cell, lengths):
        out = self.embedding(X)
        out = rnn_utils.pack_padded_sequence(out, lengths)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = hidden[-1, :, :]
        out = torch.log_softmax(self.fc(out), dim=1)
        return out, (hidden, cell)

In [21]:
model = LSTMClassifier(vocab_sz=vocab_sz, embedding_dim=300, hidden_dim=128, output_dim=2)
criterion = nn.CrossEntropyLoss()

# Make a forward pass
(hidden, cell) = model.init_hidden(batch_size)
out, (hidden, cell) = model(x, hidden, cell, l)

In [22]:
def accuracy(y_pred, y_acc):
    with torch.no_grad():
        return torch.sum(torch.max(torch.exp(y_pred), dim=1)[1] == y_acc).item() / len(y_acc)
    
print("Accuracy: {:.4f}".format(accuracy(out, y)))
print("Loss: {:.4f}".format(criterion(out, y)))

Accuracy: 0.4844
Loss: 0.6970


In [23]:
model = LSTMClassifier(vocab_sz=vocab_sz, embedding_dim=300, hidden_dim=128, output_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [24]:
epochs = 4
for e in range(1, epochs + 1):
    train_loss = 0
    train_acc = 0
    
    model.train()
    for batch in tqdm(train_loader, leave=True):
        x, y, lens = batch
        x, y, lens = x.cuda(), y.cuda(), lens.cuda()

        # Initialize the hidden state
        (hidden, cell) = model.init_hidden(x.shape[1])
        hidden = hidden.to(device)
        cell = cell.to(device)
        
        # Forward pass and backprop
        out, (hidden, cell) = model(x, hidden, cell, lens)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += accuracy(out, y)

    # Scale accuracy and losses
    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    test_loss = 0
    test_acc = 0
    
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, leave=True):
            x, y, lens = batch
            x, y, lens = x.cuda(), y.cuda(), lens.cuda()

            (hidden, cell) = model.init_hidden(x.shape[1])
            hidden = hidden.to(device)
            cell = cell.to(device)
            
            out, (hidden, cell) = model(x, hidden, cell, lens)
            loss = criterion(out, y)

            test_loss += loss.item()
            test_acc += accuracy(out, y)

    test_loss /= len(test_loader)
    test_acc /= len(test_loader)
    
    print("Epoch {:4} | Train Loss {:.4f} | Train Acc {:.4f} | Test Loss {:.4f} | Test Acc {:.4f}".format(e, train_loss, train_acc, test_loss, test_acc))

100%|████████████████████████████████████████████████████████████████████████████████| 293/293 [00:32<00:00,  9.13it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [00:02<00:00, 36.74it/s]
  0%|                                                                                          | 0/293 [00:00<?, ?it/s]

Epoch    1 | Train Loss 0.6848 | Train Acc 0.5422 | Test Loss 0.6762 | Test Acc 0.5701


100%|████████████████████████████████████████████████████████████████████████████████| 293/293 [00:31<00:00,  9.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [00:02<00:00, 36.80it/s]
  0%|                                                                                          | 0/293 [00:00<?, ?it/s]

Epoch    2 | Train Loss 0.6247 | Train Acc 0.6350 | Test Loss 0.5645 | Test Acc 0.7005


100%|████████████████████████████████████████████████████████████████████████████████| 293/293 [00:31<00:00,  9.42it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [00:02<00:00, 36.31it/s]
  0%|▎                                                                                 | 1/293 [00:00<00:29,  9.95it/s]

Epoch    3 | Train Loss 0.4970 | Train Acc 0.7540 | Test Loss 0.4616 | Test Acc 0.7803


100%|████████████████████████████████████████████████████████████████████████████████| 293/293 [00:31<00:00,  9.42it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [00:02<00:00, 36.38it/s]

Epoch    4 | Train Loss 0.4006 | Train Acc 0.8203 | Test Loss 0.4257 | Test Acc 0.8066





In [25]:
model.cpu()
test = "The movie was good! I liked the characters and the soundtrack. Overall impressive."

# Process the sentence and unsqueeze to make a "batch of 1"
test = torch.LongTensor(serialize(process(test))).unsqueeze(1) 
lengths = torch.LongTensor([len(test)])
(hidden, cell) = model.init_hidden(1)

with torch.no_grad():
    out, _ = model(test, hidden, cell, lengths)
    out = torch.exp(out)
    m = torch.max(out, dim=1)
    print("Prediction: {} | Confidence: {:.4f}".format(classes[m[1].item()], m[0].item()))

Prediction: positive | Confidence: 0.8567


Двунаправленные реккурентные сети

In [26]:
x, y, l = train_loader[0]
embedding = nn.Embedding(vocab_sz, 300)
rnn = nn.LSTM(300, 128, bidirectional=True)

In [27]:
(hidden, cell) = torch.zeros(2, 64, 128), torch.zeros(2, 64, 128) # Note the 2 in the first dimension
out = embedding(x)
print("Embedded output shape:", out.shape)

out = rnn_utils.pack_padded_sequence(out, l)
print("Packed output shape:", out.data.shape)

out, (hidden, cell) = rnn(out, (hidden, cell))

Embedded output shape: torch.Size([949, 64, 300])
Packed output shape: torch.Size([19601, 300])


In [28]:
hidden.shape

torch.Size([2, 64, 128])

In [29]:
h_cat = torch.cat([hidden[-1, :, :], hidden[-2, :, :]], dim=1)
print("Concatenated shape:", h_cat.shape)

Concatenated shape: torch.Size([64, 256])


In [30]:
fc = nn.Linear(128 * 2, 2) # Note the * 2
out = torch.softmax(fc(h_cat), dim=1)
print("Final shape:", out.shape)
print(out[:5])

Final shape: torch.Size([64, 2])
tensor([[0.5321, 0.4679],
        [0.5066, 0.4934],
        [0.5573, 0.4427],
        [0.5126, 0.4874],
        [0.5096, 0.4904]], grad_fn=<SliceBackward>)


In [31]:
num_layers = 2

x, y, l = train_loader[0]
embedding = nn.Embedding(vocab_sz, 300)
rnn = nn.LSTM(300, 128, num_layers=num_layers)

In [32]:
(hidden, cell) = torch.zeros(num_layers, 64, 128), torch.zeros(num_layers, 64, 128) # We now generalize number of layers
out = embedding(x)
print("Embedded output shape:", out.shape)

out = rnn_utils.pack_padded_sequence(out, l)
print("Packed output shape:", out.data.shape)

out, (hidden, cell) = rnn(out, (hidden, cell))

Embedded output shape: torch.Size([949, 64, 300])
Packed output shape: torch.Size([19601, 300])


In [33]:
out = hidden[-1, :, :]

fc = nn.Linear(128, 2) 
out = torch.softmax(fc(out), dim=1)
print("Final shape:", out.shape)
print(out[:5]) # First five probabilities

Final shape: torch.Size([64, 2])
tensor([[0.4780, 0.5220],
        [0.4784, 0.5216],
        [0.4776, 0.5224],
        [0.4760, 0.5240],
        [0.4751, 0.5249]], grad_fn=<SliceBackward>)


In [34]:
num_layers = 2
bidirectional = True
recur_drop = 0.3

x, y, l = train_loader[0]
embedding = nn.Embedding(vocab_sz, 300)
rnn = nn.LSTM(300, 128, num_layers=num_layers, bidirectional=bidirectional, dropout=recur_drop)

In [35]:
if bidirectional: # Double the first dimension if bidirectional
    (hidden, cell) = torch.zeros(num_layers * 2, 64, 128), torch.zeros(num_layers * 2, 64, 128)
else:
    (hidden, cell) = torch.zeros(num_layers, 64, 128), torch.zeros(num_layers, 64, 128)

In [36]:
out = embedding(x)
print("Embedded output shape:", out.shape)

out = rnn_utils.pack_padded_sequence(out, l)
print("Packed output shape:", out.data.shape)

out, (hidden, cell) = rnn(out, (hidden, cell))

Embedded output shape: torch.Size([949, 64, 300])
Packed output shape: torch.Size([19601, 300])


In [37]:
if bidirectional:
    out = torch.cat([hidden[-1, :, :], hidden[-2, :, :]], dim=1)
else:
    out = hidden[-1, :, :]

fc = nn.Linear(128 * 2, 2) if bidirectional else nn.Linear(128, 2) # Branch if bidirectional
out = torch.softmax(fc(out), dim=1)
print("Final shape:", out.shape)
print(out[:5]) # First five probabilities

Final shape: torch.Size([64, 2])
tensor([[0.4912, 0.5088],
        [0.4786, 0.5214],
        [0.4960, 0.5040],
        [0.4828, 0.5172],
        [0.4891, 0.5109]], grad_fn=<SliceBackward>)


In [38]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_sz, embedding_dim, hidden_dim, output_dim, bidirectional, rnn_layers, recur_dropout=0.3):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_sz, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, bidirectional=bidirectional, num_layers=rnn_layers, dropout=recur_dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim) if bidirectional else nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim
        
    def init_hidden(self, bs):
        if self.rnn.bidirectional:
            return torch.zeros(self.rnn.num_layers * 2, bs, self.hidden_dim), torch.zeros(self.rnn.num_layers * 2, bs, self.hidden_dim)
        else:
            return torch.zeros(self.rnn.num_layers, bs, self.hidden_dim), torch.zeros(self.rnn.num_layers, bs, self.hidden_dim)
        
    def forward(self, X, hidden, cell, lengths):
        out = self.embedding(X)
        out = rnn_utils.pack_padded_sequence(out, lengths)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        h_cat = torch.cat([ hidden[-2, :, :], hidden[-1, :, :] ], dim=1) if self.rnn.bidirectional else hidden[-1, :, :]
        out = torch.log_softmax(self.fc(h_cat), dim=1)
        return out, (hidden, cell)

In [39]:
model = LSTMClassifier(vocab_sz, embedding_dim=300, hidden_dim=128, output_dim=2, 
                       bidirectional=True, rnn_layers=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
epochs = 4
for e in range(1, epochs + 1):
    train_loss = 0
    train_acc = 0
    for batch in tqdm(train_loader, leave=True):
        x, y, lens = batch
        x = x.cuda()
        y = y.cuda()
        lens = lens.cuda()

        (hidden, cell) = model.init_hidden(x.shape[1])
        hidden = hidden.to(device)
        cell = cell.to(device)
        out, (hidden, cell) = model(x, hidden, cell, lens)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += accuracy(out, y)

    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    test_loss = 0
    test_acc = 0
    with torch.no_grad():
        for batch in tqdm(test_loader, leave=True):
            x, y, lens = batch
            x = x.cuda()
            y = y.cuda()
            lens = lens.cuda()

            (hidden, cell) = model.init_hidden(x.shape[1])
            hidden = hidden.to(device)
            cell = cell.to(device)
            out, (hidden, cell) = model(x, hidden, cell, lens)
            loss = criterion(out, y)

            test_loss += loss.item()
            test_acc += accuracy(out, y)

    test_loss /= len(test_loader)
    test_acc /= len(test_loader)
    
    print("Epoch {:4} | Train Loss {:.4f} | Train Acc {:.4f} | Test Loss {:.4f} | Test Acc {:.4f}".format(e, train_loss, train_acc, test_loss, test_acc))

  4%|███▎                                                                             | 12/293 [00:03<01:29,  3.15it/s]

In [None]:
model.cpu()

test = "That movie was awful."
test = torch.LongTensor(serialize(process(test))).unsqueeze(1)
lengths = torch.LongTensor([len(test)])
(hidden, cell) = model.init_hidden(1)

with torch.no_grad():
    out, _ = model(test, hidden, cell, lengths)
    out = torch.exp(out)
    m = torch.max(out, dim=1)
    print("Prediction: {} | Confidence: {:.4f}".format(classes[m[1].item()], m[0].item()))

Использование GloVe

In [None]:
v = Vectors('../input/glove840b300dtxt/glove.840B.300d.txt')

In [None]:
out = v['hello']
print("Embedding of 'hello' dimensions:", out.shape)
print(out)

In [None]:
w = (v['woman'] - v['queen']).unsqueeze(0)

l2_dist = nn.PairwiseDistance(p=2)
out = l2_dist(m, w)

print("L2 Distance:", out.item())

In [None]:
emb = nn.Embedding(vocab_sz, 300)
batch = []
for i in range(vocab_sz):
    batch.append(v[idx2word[i]])
weights = torch.stack(batch)
emb.weight.data.copy_(weights)
emb.weight.requires_grad = False

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_sz, embedding_dim, hidden_dim, output_dim, bidirectional, rnn_layers, recur_dropout=0.3, pretrained=False, pretrained_emb=None):
        super(LSTMClassifier, self).__init__()
        if pretrained and pretrained_emb is not None:
            self.embedding = pretrained_emb
        else:
            self.embedding = nn.Embedding(vocab_sz, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, bidirectional=bidirectional, num_layers=rnn_layers, dropout=recur_dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim) if bidirectional else nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim
        
    def init_hidden(self, bs):
        if self.rnn.bidirectional:
            return torch.zeros(self.rnn.num_layers * 2, bs, self.hidden_dim), torch.zeros(self.rnn.num_layers * 2, bs, self.hidden_dim)
        else:
            return torch.zeros(self.rnn.num_layers, bs, self.hidden_dim), torch.zeros(self.rnn.num_layers, bs, self.hidden_dim)
        
    def forward(self, X, hidden, cell, lengths):
        out = self.embedding(X)
        out = rnn_utils.pack_padded_sequence(out, lengths)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        h_cat = torch.cat([ hidden[-2, :, :], hidden[-1, :, :] ], dim=1) if self.rnn.bidirectional else hidden[-1, :, :]
        out = torch.log_softmax(self.fc(h_cat), dim=1)
        return out, (hidden, cell)

In [None]:
model = LSTMClassifier(vocab_sz, embedding_dim=300, hidden_dim=128, output_dim=2, 
                       bidirectional=True, rnn_layers=2, pretrained=True, pretrained_emb=emb).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
epochs = 4
for e in range(1, epochs + 1):
    train_loss = 0
    train_acc = 0
    for batch in tqdm(train_loader, leave=True):
        x, y, lens = batch
        x = x.cuda()
        y = y.cuda()
        lens = lens.cuda()

        (hidden, cell) = model.init_hidden(x.shape[1])
        hidden = hidden.to(device)
        cell = cell.to(device)
        out, (hidden, cell) = model(x, hidden, cell, lens)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += accuracy(out, y)

    train_loss /= len(train_loader)
    train_acc /= len(train_loader)

    test_loss = 0
    test_acc = 0
    with torch.no_grad():
        for batch in tqdm(test_loader, leave=True):
            x, y, lens = batch
            x = x.cuda()
            y = y.cuda()
            lens = lens.cuda()

            (hidden, cell) = model.init_hidden(x.shape[1])
            hidden = hidden.to(device)
            cell = cell.to(device)
            out, (hidden, cell) = model(x, hidden, cell, lens)
            loss = criterion(out, y)

            test_loss += loss.item()
            test_acc += accuracy(out, y)

    test_loss /= len(test_loader)
    test_acc /= len(test_loader)
    
    print("Epochs {:4} | Train Loss {:.4f} | Train Acc {:.4f} | Test Loss {:.4f} | Test Acc {:.4f}".format(e, train_loss, train_acc, test_loss, test_acc))

In [None]:
model.cpu()

test = "I loved the movie! The cinematography was terrific and the actors were great."
test = torch.LongTensor(serialize(process(test))).unsqueeze(1)
lengths = torch.LongTensor([len(test)])
(hidden, cell) = model.init_hidden(1)

with torch.no_grad():
    out, _ = model(test, hidden, cell, lengths)
    out = torch.exp(out)
    m = torch.max(out, dim=1)
    print("Prediction: {} | Confidence: {:.4f}".format(classes[m[1].item()], m[0].item()))