# 0- Import packages

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors

import gensim

import matplotlib.pyplot as plt1
import matplotlib.pyplot as plt2
import numpy as np

import time


# 1- Import data

In [2]:
torch.backends.cudnn.deterministic = True
LABEL = data.LabelField(dtype = torch.float)
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
fields = [(None, None),(None, None), (None, None), (None, None),(None, None),(None, None), ('label', LABEL),(None, None),(None, None), (None, None),(None, None),('text',TEXT)]
train_data = data.TabularDataset(path = 'data/data_filtered2_balanced_all.csv', format = 'csv', fields = fields, skip_header = True)
print(f'Number of data examples: {len(train_data)}')

train_data, test_data = train_data.split(0.8)
train_data, valid_data = train_data.split(0.8)
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of data examples: 98474
Number of training examples: 63023
Number of validation examples: 15756
Number of testing examples: 19695


# 2- Build vocabulary - Word2Vec

In [3]:
#filename="w2v_reviews_50.text"
filename="w2v_reviews_300.text"

MAX_VOCAB_SIZE = 105_000 #most common words
MIN_FREQ = 2 #minimum frequency not to be unknown
w2v = gensim.models.KeyedVectors.load_word2vec_format(filename)

vectors = Vectors(name=filename, cache = './')

LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE, 
                 min_freq = MIN_FREQ,
                 vectors = vectors,
                 unk_init = torch.Tensor.normal_)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

print(f"Most common words: {TEXT.vocab.freqs.most_common(20)}") #most common words
print(f"Vocabulary:{TEXT.vocab.itos[:10]}") #to see vocabulary
print(f"Labels: {LABEL.vocab.stoi}") #to see labels

Unique tokens in TEXT vocabulary: 49025
Unique tokens in LABEL vocabulary: 2
Most common words: [('.', 294506), (',', 233536), ('the', 217799), ('I', 198213), ('and', 154038), ('a', 144797), ('to', 130387), (' ', 123224), ('it', 108253), ('of', 106295), ('is', 92058), ('in', 67837), ('this', 66616), ('for', 64686), ('that', 58620), ('was', 44397), ('you', 44233), ('my', 44215), ('with', 42964), ('not', 42676)]
Vocabulary:['<unk>', '<pad>', '.', ',', 'the', 'I', 'and', 'a', 'to', ' ']
Labels: defaultdict(<function _default_unk_index at 0x00000233A9DF57B8>, {'1': 0, '0': 1})


# 3- Build iterators

In [4]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

''''train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)'''

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: len(x.text), #sort by s attribute (quote)
    sort_within_batch=True,
    batch_size=BATCH_SIZE,
    device=device)

# 4- Build the model

In [5]:
#1- Define the model

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden.squeeze(0))
    
#2- Create an instance of the RNN class

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 100
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.9
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

#3-  Trainable parameters

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters') #much more than in version 1

#4- Copy pre trained word embeddings into "embedding" layer of our model

pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape) #check correct size [vocab size, embedding dim]

#5- Replace the initial weights of the "embedding" layer with the "pre-trained embeddings"

model.embedding.weight.data.copy_(pretrained_embeddings)
    
#6 - <unk> and <pad>

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data) #check that first two rows set to zero

The model has 15,270,901 trainable parameters
torch.Size([49025, 300])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


# 5 - Train the model

In [6]:
#0- Set up the plotting figure
%matplotlib inline
%config InlineBackend.figure_format ='svg'
%load_ext autoreload
%autoreload 2
%matplotlib notebook

#1- Set up the optimizer
optimizer = optim.Adam(model.parameters(),weight_decay = 0.0001)
#optimizer = optim.SGD(model.parameters(),lr=1e-3)

#2- Sigmoid and boundary cross entropy
criterion = nn.BCEWithLogitsLoss()

#3- Move them to GPU
model = model.to(device)
criterion = criterion.to(device)

#4- Accuracy function
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

#5- Train function
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#6- Evaluate function
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#7- Epoch running time function

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#8- Train the model
N_EPOCHS = 200

cont = -1
train_loss_vec = np.zeros(N_EPOCHS)
valid_loss_vec = np.zeros(N_EPOCHS)
train_acc_vec = np.zeros(N_EPOCHS)
valid_acc_vec = np.zeros(N_EPOCHS)

best_valid_loss = float('inf')

fig1 = plt1.figure()
ax1 = fig1.add_subplot(111)
fig1.show()
fig1.canvas.draw()

fig2 = plt2.figure()
ax2 = fig2.add_subplot(111)
fig2.show()
fig2.canvas.draw()

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    cont = cont + 1
    train_loss_vec[cont]=train_loss
    valid_loss_vec[cont]=valid_loss
    train_acc_vec[cont]=train_acc
    valid_acc_vec[cont]=valid_acc
    
    ax1.clear()
    ax1.plot(range(1,epoch+2),train_acc_vec[0:cont+1],'-o', label=r'Training accuracy') 
    ax1.plot(range(1,epoch+2),valid_acc_vec[0:cont+1],'-o', label=r'Validation accuracy')
    fig1.legend(loc='best')
    fig1.canvas.draw()
    
    ax2.clear()
    ax2.plot(range(1,epoch+2),train_loss_vec[0:cont+1],'-o', label=r'Training loss') 
    ax2.plot(range(1,epoch+2),valid_loss_vec[0:cont+1],'-o', label=r'Validation loss')
    fig2.legend(loc='best')
    fig2.canvas.draw()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch: 01 | Epoch Time: 3m 49s
	Train Loss: 0.683 | Train Acc: 54.91%
	 Val. Loss: 0.667 |  Val. Acc: 60.27%




Epoch: 02 | Epoch Time: 3m 51s
	Train Loss: 0.652 | Train Acc: 62.51%
	 Val. Loss: 0.650 |  Val. Acc: 62.66%
Epoch: 03 | Epoch Time: 3m 50s
	Train Loss: 0.655 | Train Acc: 61.91%
	 Val. Loss: 0.632 |  Val. Acc: 65.24%
Epoch: 04 | Epoch Time: 3m 51s
	Train Loss: 0.649 | Train Acc: 62.87%
	 Val. Loss: 0.640 |  Val. Acc: 64.04%
Epoch: 05 | Epoch Time: 3m 51s
	Train Loss: 0.647 | Train Acc: 63.65%
	 Val. Loss: 0.634 |  Val. Acc: 65.77%
Epoch: 06 | Epoch Time: 3m 51s
	Train Loss: 0.634 | Train Acc: 65.30%
	 Val. Loss: 0.615 |  Val. Acc: 66.59%
Epoch: 07 | Epoch Time: 3m 51s
	Train Loss: 0.625 | Train Acc: 66.48%
	 Val. Loss: 0.673 |  Val. Acc: 63.98%
Epoch: 08 | Epoch Time: 3m 50s
	Train Loss: 0.644 | Train Acc: 63.98%
	 Val. Loss: 0.661 |  Val. Acc: 62.80%
Epoch: 09 | Epoch Time: 3m 51s
	Train Loss: 0.634 | Train Acc: 65.48%
	 Val. Loss: 0.614 |  Val. Acc: 67.24%
Epoch: 10 | Epoch Time: 3m 51s
	Train Loss: 0.622 | Train Acc: 66.90%
	 Val. Loss: 0.612 |  Val. Acc: 67.93%
Epoch: 11 | Epoch T

KeyboardInterrupt: 

# 6- Final results

In [7]:
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.559 | Test Acc: 71.97%


# 7- User input

In [29]:
import spacy
nlp = spacy.load('en')

#1- Load model
model.load_state_dict(torch.load('tut2-model.pt'))

#2- Sentiment prediction function
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

#3- Test some sentences
sentence1="Linus is hungry. He loved cars"
out1 = predict_sentiment(model, sentence1) #should be close to 0
print(f'Sentence: {sentence1} | Estimation: {out1}')

sentence2="I love these trail mix bars. They are delicious and healthy and the price is great. To be able to get them delivered each month is a wonderful thing!" 
out2=predict_sentiment(model,sentence2) #should be close to 1
print(f'Sentence: {sentence2} | Estimation: {out2}')

sentence3="I found this exact same product at Whole Foods for 3.99 per bag, and you can also ask them to order a box for you at a ten percent discount.  So I" 
out3=predict_sentiment(model,sentence3) #should be close to 1
print(f'Sentence: {sentence3} | Estimation: {out3}')



Sentence: Linus is hungry. He loved cars | Estimation: 0.4781437814235687
Sentence: I love these trail mix bars. They are delicious and healthy and the price is great. To be able to get them delivered each month is a wonderful thing! | Estimation: 0.310120552778244
Sentence: I found this exact same product at Whole Foods for 3.99 per bag, and you can also ask them to order a box for you at a ten percent discount.  So I | Estimation: 0.4479773938655853


In [2]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.backends.cudnn.enabled)

0.4.1
True
True
