In [1]:
import pandas as pd
from string import punctuation
import numpy as np
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import json
import nltk

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

#https://learning.oreilly.com/library/view/hands-on-natural-language/9781789802740/B12365_05_Final_JC_ePub.xhtml

In [2]:
with open("data_sentiment/sentiment.txt") as f:
    reviews = f.read()
    
data = pd.DataFrame([review.split('\t') for review in reviews.split('\n')])

data.columns = ['Review','Sentiment']

In [3]:
data.head()
data.shape
data = data.sample(frac=1)
data.shape

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


(3000, 2)

(3000, 2)

## what is str.maketrans

Parameter	Description

- x	Required. If only one parameter is specified, this has to be a dictionary describing how to perform the replace. If two or more parameters are specified, this parameter has to be a string specifying the characters you want to replace.
- y	Optional. A string with the same length as parameter x. Each character in the first parameter will be replaced with the corresponding character in this string.
- z	Optional. A string describing which characters to remove from the original string.



In [None]:
%%script false --no-raise-error
## Example
txt = "Good night Sam!";
x = "Sam";
y = "Joe";
z = "odnght";


mytable = str.maketrans('', '', z);

print( mytable)

print(txt.translate(mytable));

mytable = str.maketrans(x, y, z);
print(txt.translate(mytable));



In [4]:
def split_words_reviews(data):
    text = list(data['Review'].values)
    clean_text = []
    
    # for each sentence, remove punctutations, lower, remove training space, and add 
    for t in text:
        clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
        
        
    # now each sentence is convtered to ['it', 'really', 'created', 'a', 'unique', 'feeling', 'though']
    # tokenized is a list of list of tokens
    tokenized = [word_tokenize(x) for x in clean_text]
    
    # all_text is mainly used for creating set
    all_text = []
    for tokens in tokenized:
        for t in tokens:
            all_text.append(t)
    return tokenized, set(all_text)

reviews, vocab = split_words_reviews(data)

reviews[0]

# this create a dictionary of 'elf': 3 and 3: 'elf'

['if',
 'i',
 'take',
 'a',
 'picture',
 'the',
 'battery',
 'drops',
 'a',
 'bar',
 'and',
 'starts',
 'beeping',
 'letting',
 'me',
 'know',
 'its',
 'dieing']

In [5]:
def create_dictionaries(words):
    word_to_int_dict = {w:i+1 for i, w in enumerate(words)}
    int_to_word_dict = {i:w for w, i in word_to_int_dict.items()}
    return word_to_int_dict, int_to_word_dict

word_to_int_dict, int_to_word_dict = create_dictionaries(vocab)

int_to_word_dict[0] = ''
word_to_int_dict[''] = 0

## average lenght and max lenght of a sentence

np.max([len(i) for i in reviews])
np.mean([len(i) for i in reviews])

70

11.783666666666667

In [6]:
## Pad it:   If text lenght is less than 50, then prepend it with '' to make it 50

def pad_text(tokenized_reviews, seq_length):
    
    reviews = []
    
    for review in tokenized_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append(['']*(seq_length-len(review)) + review)
        
    return np.array(reviews)

padded_sentences = pad_text(reviews, seq_length = 50)

padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'if',
       'i', 'take', 'a', 'picture', 'the', 'battery', 'drops', 'a', 'bar',
       'and', 'starts', 'beeping', 'letting', 'me', 'know', 'its',
       'dieing'], dtype='<U33')

In [8]:
## convert word to int and then to np.array

encoded_sentences = np.array([[word_to_int_dict[word] for word in review] for review in padded_sentences])

# print first encoded sentences
encoded_sentences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 3315,
       3702, 2224,  350, 4446, 1331, 2338, 2059,  350, 4206, 4443, 4379,
       3871, 1656, 4845, 4461, 3605, 1800])

In [None]:
%%script false --no-raise-error
# search 'Parameters for LSTM to train' in oneNote
class MSentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):
        super().__init__()
        
        self.n_vocab = n_vocab  
        self.n_layers = n_layers 
        self.n_hidden = n_hidden 
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        


mnet = MSentimentLSTM(n_vocab=5401, n_embed=50, n_hidden=100, n_output=1, n_layers=1)
a = list(mnet.parameters())
len(a)

for i in mnet.named_parameters():
    print(i[0])

$$
\begin{array}{ll} \\
i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t = f_t * c_{(t-1)} + i_t * g_t \\
h_t = o_t * \tanh(c_t) \\
\end{array}
$$

For input data size 1 vector_size, 1 hidden size,
- 4w input size + 4 bias and 4 previous_hidden_W + 4_bias = total 16 
For input data size 1 vector_size, 2 hidden size,
- 8w input size + 8 bias and 8 previous_hidden_W + 8_bias = total 32 

For input data size 2 vector_size, 1 hidden size,
- 8w input size + 4 bias and 4 previous_hidden_W + 8_bias = total 20



where  `h_t` is the hidden state at time `t`,  `c_t` is the cell
state at time `t`,  `x_t` is the input at time `t`,  `h_{(t-1)}`
is the hidden state of the layer at time `t-1` or the initial hidden
state at time `0`, and  `i_t`,  `f_t`,  `g_t`,
 `o_t` are the input, forget, cell, and output gates, respectively.
 `\sigma` is the sigmoid function, and  `*` is the Hadamard product.


In a multilayer LSTM, the input  `x^{(l)}_t` of the  `l` -th layer
( `l >= 2`) is the hidden state  `h^{(l-1)}_t` of the previous layer multiplied by
dropout  `\delta^{(l-1)}_t` where each  `\delta^{(l-1)}_t` is a Bernoulli random
variable which is  `0` with probability :attr:`dropout`.

In [None]:
%%script false --no-raise-error
lstm_l = nn.LSTM(input_size=2, hidden_size=1, num_layers=1, batch_first = True)
a = list(lstm_l.parameters())
len(a)

    
print()

for i in lstm_l.named_parameters():
    if 'weight_ih' in i[0]:
        print('3-gates+1NC for input data: i_gate, f_gate, o_gate, nc_creation:',i[0], i[1].shape)
    if 'weight_hh' in i[0]:
        print('3-gates+1NC weights for previous hidden data: i_gate, f_gate, o_gate, nc_creation:',i[0], i[1].shape)
    
    if 'bias_ih' in i[0]:
        print('bias for input data:', i[0], i[1].shape)
    if 'bias_hh' in i[0]:
        print('bias for previous hidden data:', i[0], i[1].shape)


print()

for i in lstm_l.named_parameters():
    print(i[0], i[1].shape)

print()

for i in lstm_l.named_parameters():
    print(i[0], i[1])

In [20]:
# get labels in an array
# then number of rows in train and validation
labels = np.array([int(x) for x in data['Sentiment'].values])

train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2

total = len(encoded_sentences)
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

total, int(total * train_ratio), (1 - valid_ratio), int(total * (1 - valid_ratio))

(3000, 2400, 0.9, 2700)

In [21]:
## now split data
train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(), torch.Tensor(labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff : valid_cutoff]).long(), torch.Tensor(labels[train_cutoff : valid_cutoff]).long()
test_x, test_y = torch.Tensor(encoded_sentences[valid_cutoff:]).long(), torch.Tensor(labels[valid_cutoff:])

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 1

#DataLoader allows us to batch process our datasets with the batch_size parameter, 
# allowing different batch sizes to be easily passed to our model.
#In this instance, we will keep it simple and set batch_size = 1, which means our model will be trained on 
# individual sentences, rather than using larger batches of data. 

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

## Get the embedding data

In [22]:
## loop unrolling
for inputs, labels in train_loader:
    break

print(f'input data shape={inputs.shape}\nlabel shape={labels.shape}')

inputs # one sentence
labels

input data shape=torch.Size([1, 50])
label shape=torch.Size([1])


tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         3431, 2349]])

tensor([1])

In [28]:
# Input: 1 sentence of 60 words with pads. torch.Size([1, 60])
# Output: 1 sentence 60 words and  50 dims for each word
n_embed = 40
print('length of one sentences ', inputs.shape)
len(word_to_int_dict)
embedding = nn.Embedding(len(word_to_int_dict), n_embed) # number of embedding dimension torch.Size([1, 60, 50])
embedded_words = embedding(inputs)

inputs.shape #torch.Size([1, 60])
embedded_words.shape # torch.Size([1, 60, 40])

length of one sentences  torch.Size([1, 50])


5401

torch.Size([1, 50])

torch.Size([1, 50, 40])

### About Parameters

In [95]:
lstm = nn.LSTM(input_size=4, hidden_size=10, num_layers=3, batch_first = True)
len(list(lstm.parameters()))
#lstm.state_dict()

for i in lstm.named_parameters():
    if 'weight_ih' in i[0]:
        print('Weights for Input data')
        print('Weights for 3-gates+1_New_Context for input data:\n\t\t i_gate, f_gate, o_gate, nc_creation:',i[0], i[1].shape)
        print()
    if 'weight_hh' in i[0]:
        print('Weights for previous hidden data')
        print('3-gates+1_New_Context weights for previous hidden data:\n\t\t i_gate, f_gate, o_gate, nc_creation:',i[0], i[1].shape)
        print()
        
    if 'bias_ih' in i[0]:
        print('bias for input data:', i[0], i[1].shape)
    if 'bias_hh' in i[0]:
        print('bias for previous hidden data:', i[0], i[1].shape)


print()

for i in lstm.named_parameters():
    print(i[0], i[1].shape)

print()

for i in lstm.named_parameters():
    print(i[0], i[1])

12

Weights for Input data
Weights for 3-gates+1_New_Context for input data:
		 i_gate, f_gate, o_gate, nc_creation: weight_ih_l0 torch.Size([40, 4])

Weights for previous hidden data
3-gates+1_New_Context weights for previous hidden data:
		 i_gate, f_gate, o_gate, nc_creation: weight_hh_l0 torch.Size([40, 10])

bias for input data: bias_ih_l0 torch.Size([40])
bias for previous hidden data: bias_hh_l0 torch.Size([40])
Weights for Input data
Weights for 3-gates+1_New_Context for input data:
		 i_gate, f_gate, o_gate, nc_creation: weight_ih_l1 torch.Size([40, 10])

Weights for previous hidden data
3-gates+1_New_Context weights for previous hidden data:
		 i_gate, f_gate, o_gate, nc_creation: weight_hh_l1 torch.Size([40, 10])

bias for input data: bias_ih_l1 torch.Size([40])
bias for previous hidden data: bias_hh_l1 torch.Size([40])
Weights for Input data
Weights for 3-gates+1_New_Context for input data:
		 i_gate, f_gate, o_gate, nc_creation: weight_ih_l2 torch.Size([40, 10])

Weights for p

In [58]:
weights = next(lstm.parameters()).data
h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
     weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))


NameError: name 'self' is not defined

In [105]:
t = next(lstm.parameters()).data
type(t)
t.shape

torch.Tensor

torch.Size([40, 4])

### About LSTM output

In [38]:
my_hidden_size = 100
lstm = nn.LSTM(input_size=40, #the is the size of the embedding per word, not the length of sentence. 
               hidden_size = my_hidden_size, 
               num_layers = 1, batch_first = True)

lstm_out, h = lstm(embedded_words)# we could provide initialized value of hidden state and cell state

# output contains hidden_size vector generated for each input word of the sentence
# input was: torch.Size([1, 60, 50])
# output is: torch.Size([1, 60, 100])
print('lstm_output= ', lstm_out.shape) #(seq_len, batch, num_directions * hidden_size)


# these are the output of hidden vector and cell vector
# There is one output for each layer
# The output is restricted to last time step or for the last word
# h_n and c_n of each layer of the last time step t=seq_len


#h[0].shape # hidden # torch.Size([1, 1, 100])
#h[1].shape # cell   # torch.Size([1, 1, 100])


print('hidden_state = ', h[0].shape) # (num_layers * num_directions, batch, hidden_size): torch.Size([1, 1, 100])
print('cell_state = ', h[1].shape) # (num_layers * num_directions, batch, hidden_size): torch.Size([1, 1, 100])


sum(sum(lstm_out[0,-1] == h[0][-1])) # the last output lstm_ matches with hidden output of the last layer

lstm_output=  torch.Size([1, 50, 100])
hidden_state =  torch.Size([1, 1, 100])
cell_state =  torch.Size([1, 1, 100])


tensor(100)

In [39]:

lstm_out.shape
lstm_out.contiguous().view(-1, my_hidden_size).shape

torch.Size([1, 50, 100])

torch.Size([50, 100])

In [46]:
#linear layer
my_output = 1
fc = nn.Linear(my_hidden_size, my_output)
fc_output = fc(lstm_out)
fc_output.shape

torch.Size([1, 50, 1])

In [50]:
sigmoid = nn.Sigmoid()
sigmoid_out = sigmoid(fc_output) 
sigmoid_out.shape
my_batch_size = 1
sigmoid_out.view(my_batch_size, -1).shape

torch.Size([1, 50, 1])

torch.Size([1, 50])

In [53]:
sigmoid_out[:, -1]

tensor([[0.5187]], grad_fn=<SelectBackward>)

In [108]:
#https://learning.oreilly.com/library/view/hands-on-natural-language/9781789802740/B12365_05_Final_JC_ePub.xhtml

class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):
        super().__init__()
        
        self.n_vocab = n_vocab  # num of input to first layer
        self.n_layers = n_layers  # #number of LSTM layers
        self.n_hidden = n_hidden  # size of vector of hidden output# 'LSTM Hidden units' in onenote
        
        self.embedding = nn.Embedding(n_vocab, n_embed) # number of embedding dimension
        
                            
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                          
        embedded_words = self.embedding(input_words) # vect size of unique words len. convert to embed size
        lstm_out, h = self.lstm(embedded_words) # 
        lstm_out = self.dropout(lstm_out) # shape is torch.Size([1, 50, 100])
        # following line will convert (1 sentence, 50 embeded vector, 100 hidden output for each embed word)
        # to (50 embeded vector, 100 hidden output for each embed word)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # torch.Size([50, 100])
        fc_out = self.fc(lstm_out)    # torch.Size([1, 50, 1])               
        sigmoid_out = self.sigmoid(fc_out)   # torch.Size([1, 50, 1])             
        sigmoid_out = sigmoid_out.view(batch_size, -1)   # torch.Size([1, 50]) here batch_size is 1
        
        sigmoid_last = sigmoid_out[:, -1] # out of n_embed values, pick the last one
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):
        
        device = "cpu"
        weights = next(self.parameters()).data # total length will 4_set_of_parameters x layers
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

In [109]:

n_vocab = len(word_to_int_dict)
n_embed = 50 # the final number of dimension in embedded vector
n_hidden = 100
n_output = 1
n_layers = 2

sentiment_lstm = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
criterion = nn.BCELoss()
optimizer = optim.Adam(sentiment_lstm.parameters(), lr = 0.001)

In [113]:
#Most importantly, we define our loss function as binary cross entropy (as we are dealing 
# with predicting a single binary class) and we define our optimizer to be Adam with 
# a learning rate of 0.001. We also define our model to run for a short number of epochs 
# (to save time) and set clip = 5 to define our gradient clipping:

print_every = 2400
step = 0
n_epochs = 1
clip = 5  


In [114]:
for epoch in range(n_epochs):
    print('Epoch ==================== ',epoch)
    #hidden_initialized = sentiment_lstm.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1  
        sentiment_lstm.zero_grad()
        output, h = sentiment_lstm(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        _= nn.utils.clip_grad_norm(sentiment_lstm.parameters(), clip);
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval();
            valid_losses = []

            for v_inputs, v_labels in valid_loader:
                       
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("\t\t\tEpoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()



  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  # This is added back by InteractiveShellApp.init_path()


NameError: name 'net' is not defined

In [None]:
# torch.save(net.state_dict(), 'model.pkl')
#net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
#net.load_state_dict(torch.load('model.pkl'))

In [None]:
net.eval()
test_losses = []
num_correct = 0

for inputs, labels in test_loader:

    test_output, test_h = net(inputs)
    loss = criterion(test_output, labels)
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

In [None]:
def preprocess_review(review):
    review = review.translate(str.maketrans('', '', punctuation)).lower().rstrip()
    tokenized = word_tokenize(review)
    if len(tokenized) >= 50:
        review = tokenized[:50]
    else:
        review= ['0']*(50-len(tokenized)) + tokenized
    
    final = []
    
    for token in review:
        try:
            final.append(word_to_int_dict[token])
            
        except:
            final.append(word_to_int_dict[''])
        
    return final

In [None]:
def predict(review):
    net.eval()
    words = np.array([preprocess_review(review)])
    padded_words = torch.from_numpy(words)
    pred_loader = DataLoader(padded_words, batch_size = 1, shuffle = True)
    for x in pred_loader:
        output = net(x)[0].item()
    
    msg = "This is a positive review." if output >= 0.5 else "This is a negative review."
    print(msg)
    print('Prediction = ' + str(output))

In [None]:
predict("The film was good")

In [None]:
predict("It was not good")