In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader

import torchtext #conda install torchtext -c pytorch
from torchtext.data import get_tokenizer

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

import numpy as np 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 10
np.random.seed(SEED) 
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # cuda algorithms
os.environ['PYTHONHASHSEED'] = str(SEED)

## Text Classification

In [3]:
tokenizer = get_tokenizer('basic_english')

words = set()

ct = 0
with open('/nfs/nas-4.1/azyen/w2v_project/c_test_1_stars.txt', encoding='utf-8',  errors='ignore') as f:
    for l in f:
        if ct == 10000:
            break
        sent_arr = set(tokenizer(l.split('\t')[1]))
        words = words | sent_arr
        ct += 1

words = ["", "UNK"] + list(words)
print(len(words))

29593


In [4]:
#creating vocabulary

vocab2index = {k: v for v, k in enumerate(words)}
vocab_size = len(words)

In [5]:
# set up token indice in each sentence
def encode_sentence(tokenized_sent, vocab2index, N=500):
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized_sent])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded

In [6]:
num_labels = 3


def categorize(star):
    if star > 3:
        y = 2 # positive
    elif star < 3:
        y = 0 # negative
    else:
        y = 1 # neutral
    #print(star, label)
    return y

target_classes = ["negative", "neutral", "positive"]

X, Y = [], []
ct = 0
with open('/nfs/nas-4.1/azyen/w2v_project/c_test_1_stars.txt', encoding='utf-8',  errors='ignore') as f:
    for l in f:
        if ct == 10000:
            break
        star, tokenized_review = float(l.split('\t')[0]), tokenizer(l.split('\t')[1])
        x, y = encode_sentence(tokenized_review, vocab2index), categorize(star)
        x_tensor = torch.from_numpy(x)
        X.append(x_tensor)
        Y.append(y)
        ct += 1
        

X = torch.stack(X)
Y = torch.LongTensor(Y)

In [7]:
words[4707]

'beds-'

In [8]:
with open('/nfs/nas-4.1/azyen/w2v_project/c_test_1_stars.txt', encoding='utf-8',  errors='ignore') as f:
    for l in f:
        star, tokenized_review = float(l.split('\t')[0]), tokenizer(l.split('\t')[1])
        print(tokenized_review)
        x = encode_sentence(tokenized_review, vocab2index)
        print(x)
        break

['swiss', 'experience', 'in', 'nyc', 'great', 'stay', 'in', 'nyc', 'thanks', 'to', 'blue', 'moon', 'hotel', 'with', 'its', 'very', 'friendly', 'staff', '.', 'fine', 'rooms', ',', 'very', 'quiet', '.', 'excellent', 'location', 'in', 'downtown', '.', 'highly', 'recommandable']
[14069  3530  3062  9449 26902 24043  3062  9449  8398 16353 17504 14215
 26066  7080  4683 11213 17815 26265 15875 18616  1805    14 11213 21318
 15875  6700  7902  3062  7280 15875  4052 27743     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0 

In [9]:
X

tensor([[14069,  3530,  3062,  ...,     0,     0,     0],
        [27151, 26066,    14,  ...,     0,     0,     0],
        [16037, 28067, 21314,  ...,     0,     0,     0],
        ...,
        [27975,  5962,  1701,  ...,     0,     0,     0],
        [22326, 29151, 11757,  ...,     0,     0,     0],
        [16037, 11730,  3062,  ...,     0,     0,     0]])

In [10]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __getitem__(self,index):
        seq_lens = np.count_nonzero(self.data[index])
        return self.data[index], self.label[index], seq_lens

    def __len__(self):
        return len(self.data)

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


train_data = MyDataset(X[:8000], Y[:8000])
valid_data = MyDataset(X[8000:9000], Y[8000:9000])
test_data = MyDataset(X[9000:], Y[9000:])

train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)
test_sampler = SequentialSampler(test_data)


train_loader = DataLoader(dataset=train_data, sampler=train_sampler, batch_size=64)
valid_loader = DataLoader(dataset=test_data, sampler=valid_sampler, batch_size=64)
test_loader = DataLoader(dataset=test_data, sampler=test_sampler, batch_size=64)

In [12]:
for i, (data, label, l) in enumerate(train_loader):
    print(i, data, label, l)
    break

0 tensor([[11730, 26770, 18461,  ...,     0,     0,     0],
        [19746, 11526,    14,  ...,     0,     0,     0],
        [28600,  8174, 19746,  ...,     0,     0,     0],
        ...,
        [24214, 27166, 15487,  ...,     0,     0,     0],
        [12674, 24043, 13625,  ...,     0,     0,     0],
        [16037, 14720, 11520,  ...,     0,     0,     0]]) tensor([1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1, 0, 0, 2, 1, 2, 2, 0, 2,
        2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2]) tensor([118,  39, 421, 500,  71, 310,  39, 208, 146, 141,  67,  43, 235, 216,
        117, 283, 166,  86,  96,  74,  34, 299, 119, 109,  60, 140, 179, 149,
        260, 326,  78,  43,  12, 166, 166, 187,  77,  84,  42,  61, 108,  84,
        280,  53, 280, 180,  84,  31,  89,  68, 147,  38, 306, 192,  98, 196,
        215, 102,  53, 180, 181,  89, 115,  83])


In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class one_hot_LSTM(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(hidden_dim, 3)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        #x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s.to('cpu'), batch_first=True, enforce_sorted=False)
        pack_out, (ht, ct) = self.lstm(x_pack)
        out = self.dropout(ht[-1])
        out = self.fc1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.linear(out)
        return out

Using cuda device


In [14]:
model = one_hot_LSTM(vocab_size, 100, 128).to(device) # move it to the GPU

In [15]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y, seq_len) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X, seq_len)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [16]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y, seq_len in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X, seq_len)
            #print(X.shape, seq_len.shape)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [17]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [18]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(valid_loader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 1.063174  [    0/ 8000]
loss: 0.758304  [ 6400/ 8000]
Test Error: 
 Accuracy: 66.6%, Avg loss: 0.785831 

Epoch 2
-------------------------------
loss: 0.659258  [    0/ 8000]
loss: 0.509790  [ 6400/ 8000]
Test Error: 
 Accuracy: 68.7%, Avg loss: 0.716768 

Epoch 3
-------------------------------
loss: 0.403753  [    0/ 8000]
loss: 0.515171  [ 6400/ 8000]
Test Error: 
 Accuracy: 71.6%, Avg loss: 0.653244 

Epoch 4
-------------------------------
loss: 0.418606  [    0/ 8000]
loss: 0.314757  [ 6400/ 8000]
Test Error: 
 Accuracy: 73.2%, Avg loss: 0.668633 

Epoch 5
-------------------------------
loss: 0.364953  [    0/ 8000]
loss: 0.376593  [ 6400/ 8000]
Test Error: 
 Accuracy: 72.2%, Avg loss: 0.706388 

Epoch 6
-------------------------------
loss: 0.464545  [    0/ 8000]
loss: 0.562652  [ 6400/ 8000]
Test Error: 
 Accuracy: 74.4%, Avg loss: 0.639101 

Epoch 7
-------------------------------
loss: 0.346311  [    0/ 8000]
loss: 0.225122  [ 

In [19]:
model.eval()

y_pred, y_true = [], []

with torch.no_grad(): # https://pytorch.org/docs/stable/generated/torch.no_grad.html
    for X, y, seq_len in test_loader:
        X, y = X.to(device), y.to(device)
        preds = model(X, seq_len)
        for i, pred in enumerate(preds):
            predicted, actual = target_classes[pred.argmax(0)], target_classes[y[i]]
            #print(f'Predicted: "{predicted}", Actual: "{actual}"')
            y_pred.append(predicted); y_true.append(actual)

In [20]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true, y_pred)

0.758

## LSTM with pretrained Glove word embeddings

In [21]:
def load_glove_vectors(glove_file="/nfs/nas-6.1/azyen/backup/Data/glove.6B.50d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

In [22]:
def get_emb_matrix(pretrained, words, emb_size = 50):
    """ Creates embedding matrix from word vectors"""
    W = np.zeros((len(words), emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
    i = 2
    for word in words[2:]:
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        i += 1
    return W

In [23]:
word_vecs = load_glove_vectors()
pretrained_weights = get_emb_matrix(word_vecs, words)

In [24]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class pretrain_weight_LSTM(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pretrained_weights, bidirectional) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(pretrained_weights))
        self.embeddings.weight.requires_grad = True ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = 1,
                            bidirectional = bidirectional,
                            dropout = 0.1,
                            batch_first = True
                           )
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(hidden_dim, 3)
        
    def forward(self, x, s):
        embedded = self.embeddings(x)
        x_pack = pack_padded_sequence(embedded, s.to('cpu'), batch_first=True, enforce_sorted=False)
        pack_out, (ht, ct) = self.lstm(x_pack)
        out = self.dropout(ht[-1])
        out = self.fc1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.linear(out)
        return out

Using cuda device


In [25]:
bidirectional = False
model2 = pretrain_weight_LSTM(vocab_size, 50, 128, pretrained_weights, bidirectional).to(device) # move it to the GPU



In [26]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=1e-3)

In [27]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model2, loss_fn, optimizer)
    test(valid_loader, model2, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 1.103427  [    0/ 8000]
loss: 0.731244  [ 6400/ 8000]
Test Error: 
 Accuracy: 66.6%, Avg loss: 0.777358 

Epoch 2
-------------------------------
loss: 0.754345  [    0/ 8000]
loss: 0.578984  [ 6400/ 8000]
Test Error: 
 Accuracy: 74.7%, Avg loss: 0.682352 

Epoch 3
-------------------------------
loss: 0.506921  [    0/ 8000]
loss: 0.524786  [ 6400/ 8000]
Test Error: 
 Accuracy: 75.1%, Avg loss: 0.830594 

Epoch 4
-------------------------------
loss: 0.599978  [    0/ 8000]
loss: 0.514930  [ 6400/ 8000]
Test Error: 
 Accuracy: 76.8%, Avg loss: 0.660836 

Epoch 5
-------------------------------
loss: 0.356123  [    0/ 8000]
loss: 0.424640  [ 6400/ 8000]
Test Error: 
 Accuracy: 76.3%, Avg loss: 0.595983 

Epoch 6
-------------------------------
loss: 0.288305  [    0/ 8000]
loss: 0.321225  [ 6400/ 8000]
Test Error: 
 Accuracy: 72.2%, Avg loss: 0.665494 

Epoch 7
-------------------------------
loss: 0.623051  [    0/ 8000]
loss: 0.288191  [ 

In [28]:
model2

pretrain_weight_LSTM(
  (embeddings): Embedding(29593, 50, padding_idx=0)
  (lstm): LSTM(50, 128, batch_first=True, dropout=0.1)
  (relu): ReLU()
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=128, out_features=3, bias=True)
)

In [29]:
model.eval()

y_pred, y_true = [], []

with torch.no_grad(): # https://pytorch.org/docs/stable/generated/torch.no_grad.html
    for X, y, seq_len in test_loader:
        X, y = X.to(device), y.to(device)
        preds = model2(X, seq_len)
        for i, pred in enumerate(preds):
            predicted, actual = target_classes[pred.argmax(0)], target_classes[y[i]]
            #print(f'Predicted: "{predicted}", Actual: "{actual}"')
            y_pred.append(predicted); y_true.append(actual)

In [30]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true, y_pred)

0.791

## bidirectional lstm

In [42]:
bidirectional = True
model3 = pretrain_weight_LSTM(vocab_size, 50, 128, pretrained_weights, bidirectional).to(device) # move it to the GPU



In [43]:
model3

pretrain_weight_LSTM(
  (embeddings): Embedding(29593, 50, padding_idx=0)
  (lstm): LSTM(50, 128, batch_first=True, dropout=0.1, bidirectional=True)
  (relu): ReLU()
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=128, out_features=3, bias=True)
)

In [44]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model3.parameters(), lr=1e-3)

In [45]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model3, loss_fn, optimizer)
    test(valid_loader, model3, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 1.089162  [    0/ 8000]
loss: 0.589003  [ 6400/ 8000]
Test Error: 
 Accuracy: 67.5%, Avg loss: 1.078874 

Epoch 2
-------------------------------
loss: 0.803792  [    0/ 8000]
loss: 0.704882  [ 6400/ 8000]
Test Error: 
 Accuracy: 66.6%, Avg loss: 0.889824 

Epoch 3
-------------------------------
loss: 0.730953  [    0/ 8000]
loss: 0.494237  [ 6400/ 8000]
Test Error: 
 Accuracy: 66.6%, Avg loss: 0.708158 

Epoch 4
-------------------------------
loss: 0.526666  [    0/ 8000]
loss: 0.855531  [ 6400/ 8000]
Test Error: 
 Accuracy: 74.6%, Avg loss: 0.653518 

Epoch 5
-------------------------------
loss: 0.474813  [    0/ 8000]
loss: 0.379566  [ 6400/ 8000]
Test Error: 
 Accuracy: 70.8%, Avg loss: 0.696465 

Epoch 6
-------------------------------
loss: 0.664392  [    0/ 8000]
loss: 0.467682  [ 6400/ 8000]
Test Error: 
 Accuracy: 75.3%, Avg loss: 0.640534 

Epoch 7
-------------------------------
loss: 0.562328  [    0/ 8000]
loss: 0.304531  [ 

In [46]:
model.eval()

y_pred, y_true = [], []

with torch.no_grad(): # https://pytorch.org/docs/stable/generated/torch.no_grad.html
    for X, y, seq_len in test_loader:
        X, y = X.to(device), y.to(device)
        preds = model3(X, seq_len)
        for i, pred in enumerate(preds):
            predicted, actual = target_classes[pred.argmax(0)], target_classes[y[i]]
            #print(f'Predicted: "{predicted}", Actual: "{actual}"')
            y_pred.append(predicted); y_true.append(actual)

In [47]:
accuracy_score(y_true, y_pred)

0.792