In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
from string import punctuation
from collections import Counter, OrderedDict
import numpy as np

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch import optim

In [3]:
def preprocess(text):
    text = text.lower()
    text = "".join([ch for ch in text if ch not in punctuation])
    all_reviews = text.split("\n")
#     text = " ".join(text)
    all_words = text.split()
    
    return all_reviews, all_words


def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:        
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append([0]*(seq_length-len(review)) + review)
        
    return np.array(reviews)


In [4]:
with open("/content/gdrive/My Drive/data/reviews.txt") as f:
    reviews = f.read()
    
print(len(reviews))
    
with open("/content/gdrive/My Drive/data/labels.txt") as f:
    labels = f.read()

print(len(labels))

33678267
225000


In [5]:
all_reviews, all_words = preprocess(reviews)

In [6]:
word_counts = Counter(all_words)
# word_list = sorted(word_counts, keys = word_counts.get, reverse = True)
word_list = OrderedDict(word_counts.most_common())
vocab_to_int = {word:idx+1 for idx, word in enumerate(word_list)}
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()}
encoded_reviews = [[vocab_to_int[word] for word in review if word.strip()!='' ] for review in all_reviews]


In [7]:
all_labels = labels.split("\n")
encoded_labels = [1 if label == "positive" else 0 for label in all_labels]
assert len(encoded_reviews) == len(encoded_labels), "# of encoded reivews & encoded labels must be the same!"

In [8]:
len(encoded_reviews)

25001

In [9]:
len(encoded_labels)

25001

In [10]:
encoded_labels = np.array( [label for idx, label in enumerate(encoded_labels) if len(encoded_reviews[idx]) > 0] )
encoded_reviews = [review for review in encoded_reviews if len(review) > 0]

In [11]:
len(encoded_reviews)

25000

In [12]:
len(encoded_labels)

25000

In [13]:
for idx, review in enumerate(encoded_reviews):
    if len(review)<200:
        print(idx)

41
165
393
439
441
460
582
593
600
638
694
747
772
833
837
859
870
897
1026
1059
1093
1305
1308
1319
1342
1484
1488
1541
1542
1621
1714
1733
1794
1838
1868
2130
2137
2329
2360
2410
2418
2433
2444
2598
2666
2671
2714
2929
2992
3466
3504
3505
3542
3744
3759
3836
3895
3899
3910
3965
3969
4020
4053
4122
4150
4154
4161
4262
4274
4321
4333
4335
4434
4506
4570
4573
4615
4703
4721
4766
4784
4853
4872
4918
4958
4998
5050
5126
5128
5130
5193
5229
5297
5324
5380
5496
5500
5509
5559
5561
5572
5574
5639
5802
5857
5961
5962
5964
6115
6127
6159
6173
6210
6226
6228
6236
6238
6267
6354
6464
6495
6548
6568
6628
6825
6897
6917
6953
7242
7358
7362
7374
7451
7524
7636
7638
7640
7644
7804
7806
7815
7915
7959
7963
8020
8059
8185
8191
8262
8394
8450
8452
8553
8560
8709
8721
8724
8742
8770
8890
9266
9294
9304
9338
9360
9516
9558
9567
9599
9756
9789
9886
9898
9906
9992
10008
10047
10119
10142
10223
10323
10369
10399
10477
10542
10582
10594
10680
10711
10764
10824
10844
10998
11091
11103
11166
11182
11368
11406


In [14]:
print(len(encoded_reviews[41]))
print(encoded_reviews[41])

85
[23, 2035, 953, 1013, 2035, 3, 1348, 3, 1013, 23, 953, 1348, 13, 3, 1348, 953, 1166, 1802, 1498, 10, 1416, 953, 3, 511, 1498, 953, 3, 1802, 222, 23, 2035, 953, 13, 1013, 1348, 10, 1713, 23, 10, 13, 3, 2226, 1311, 1166, 1498, 10, 23, 13, 3, 2226, 3, 13, 23, 953, 709, 1311, 23, 2035, 953, 23, 3, 1498, 953, 1802, 23, 13, 709, 1311, 222, 953, 1802, 953, 1166, 1943, 953, 3, 1802, 222, 3, 1166, 23, 953, 1166, 10, 1498]


In [15]:
padded_reviews = pad_text(encoded_reviews, seq_length = 200)

In [16]:
print(len(padded_reviews[41]))
print(padded_reviews[41])

200
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0   23 2035  953 1013 2035    3 1348    3 1013   23  953
 1348   13    3 1348  953 1166 1802 1498   10 1416  953    3  511 1498
  953    3 1802  222   23 2035  953   13 1013 1348   10 1713   23   10
   13    3 2226 1311 1166 1498   10   23   13    3 2226    3   13   23
  953  709 1311   23 2035  953   23    3 1498  953 1802   23   13  709
 1311  222  953 1802  953 1166 1943  953    3 1802  222    3 1166   23
  

In [17]:
padded_reviews.shape

(25000, 200)

In [18]:
train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2
total = padded_reviews.shape[0]
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))


train_x, train_y = padded_reviews[:train_cutoff], encoded_labels[:train_cutoff]
valid_x, valid_y = padded_reviews[train_cutoff : valid_cutoff], encoded_labels[train_cutoff : valid_cutoff]
test_x, test_y = padded_reviews[valid_cutoff:], encoded_labels[valid_cutoff:]


In [19]:
train_data = TensorDataset(torch.Tensor(train_x).long(), torch.Tensor(train_y).long())
valid_data = TensorDataset(torch.Tensor(valid_x).long(), torch.Tensor(valid_y).long())
test_data  = TensorDataset(torch.Tensor(test_x).long(), torch.Tensor(test_y).long())

In [20]:
batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [29]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                                             # INPUT   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        
        return h


In [34]:
n_vocab = len(vocab_to_int)
n_embed = 400
n_hidden = 512
n_output = 1   # 1 ("positive") or 0 ("negative")
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

device = "cuda" if torch.cuda.is_available() else "cpu"

net = net.to(device)
criterion = criterion.to(device)

In [35]:
print_every = 100
step = 0
n_epochs = 4  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available() else 'cpu'


for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        
        # making requires_grad = False for the latest set of h
        h = tuple([each.data for each in h])   
        
        
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            v_h = net.init_hidden(batch_size)
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()


Epoch: 1/4 Step: 100 Training Loss: 0.6918 Validation Loss: 0.6891
Epoch: 1/4 Step: 200 Training Loss: 0.7094 Validation Loss: 0.7006
Epoch: 1/4 Step: 300 Training Loss: 0.6958 Validation Loss: 0.6926
Epoch: 1/4 Step: 400 Training Loss: 0.6892 Validation Loss: 0.6876
Epoch: 2/4 Step: 500 Training Loss: 0.6964 Validation Loss: 0.6987
Epoch: 2/4 Step: 600 Training Loss: 0.6929 Validation Loss: 0.6897
Epoch: 2/4 Step: 700 Training Loss: 0.6848 Validation Loss: 0.6846
Epoch: 2/4 Step: 800 Training Loss: 0.6936 Validation Loss: 0.6903
Epoch: 3/4 Step: 900 Training Loss: 0.6865 Validation Loss: 0.6903
Epoch: 3/4 Step: 1000 Training Loss: 0.6926 Validation Loss: 0.6902
Epoch: 3/4 Step: 1100 Training Loss: 0.6917 Validation Loss: 0.6900
Epoch: 3/4 Step: 1200 Training Loss: 0.6822 Validation Loss: 0.6797
Epoch: 4/4 Step: 1300 Training Loss: 0.6836 Validation Loss: 0.6924
Epoch: 4/4 Step: 1400 Training Loss: 0.6836 Validation Loss: 0.6759
Epoch: 4/4 Step: 1500 Training Loss: 0.6835 Validation Lo

In [36]:
torch.cuda.is_available()

True

In [37]:
test_loader

<torch.utils.data.dataloader.DataLoader at 0x7fcfd7b429e8>

In [38]:
len(test_data)

2500