In [11]:
import torch
import numpy as np
from utils import * # data loader

In [12]:
# Load the IMDb data. With final=False the second split is validation data;
# with final=True the loader returns the train and test sets instead.
train_split, val_split, vocab, numcls = load_imdb(final=False)
x_train, y_train = train_split
x_val, y_val = val_split
i2w, w2i = vocab

# Sanity check: the number of reviews and the number of labels should match.
for label, split in (('x_train:', x_train), ('y_train:', y_train)):
    print(label, len(split))

x_train: 20000
y_train: 20000


The return values are as follows:

● x_train A Python list of lists of integers. Each inner list is one review and each
integer represents a word. The reviews are sorted from short to long.

● y_train The corresponding class labels: 0 for positive, 1 for negative.

● x_val Test/validation data. Laid out the same as x_train.

● y_val Test/validation labels

● i2w A list of strings mapping the integers in the sequences to their original words.
i2w[141] returns the string containing word 141.

● w2i A dictionary mapping the words to their indices. w2i['film'] returns the index
for the word "film".

In [13]:
# Peek at the first training example (the shortest one, since x_train is sorted
# from short to long).
first_review, first_label = x_train[0], y_train[0]
print(first_review)  # list of integers, each integer represents a word
print(first_label)   # class label: 0 means positive, 1 means negative

[14, 19, 9, 379, 22, 11, 50, 52, 53, 290]
1


In [14]:
# Show the vocabulary size and a short preview instead of printing every entry:
# dumping the complete list and dict floods the cell output.
print(len(i2w), i2w[:10])            # i2w: index to word (list)
print(dict(list(w2i.items())[:10]))  # w2i: word to index (dict)



In [15]:
# Look up the indices of 'pad', 'start' and 'end'.
# 'pad' is the entry used to fill the sentences of a batch up to the same length.
# NOTE(review): the index printed for 'pad' (6818) is high for a dedicated padding
# symbol; it may be the ordinary word "pad". Confirm the key against load_imdb.
for token in ('pad', 'start', 'end'):
    print(w2i[token])

6818
376
132


In [16]:
# Decode the ten shortest reviews back into words; each line ends with the
# number of tokens in that review.
for review in x_train[:10]:
    print(*(i2w[token] for token in review), len(review))

this movie is terrible but it has some good effects 10
i wouldn t rent this one even on dollar rental night 11
ming the merciless does a little bardwork and a movie most foul 12
long boring blasphemous never have i been so glad to see ending credits roll 14
no comment stupid movie acting average or worse screenplay no sense at all skip it 15
smallville episode justice is the best episode of smallville it s my favorite episode of smallville 16
this is the definitive movie version of hamlet branagh cuts nothing but there are no wasted moments 17
a rating of does not begin to express how dull depressing and relentlessly bad this movie is 17
i don t know why i like this movie so well but i never get tired of watching it 19
great movie especially the music etta james at last this speaks volumes when you have finally found that special someone 20


In [17]:
def padding(x, y, w2i, batch_size = 16):
    """Split the data into consecutive batches and pad each batch to a rectangle.

    Every batch holds up to ``batch_size`` consecutive sentences of ``x``.
    Sentences shorter than the longest sentence in their batch are extended on
    the right with ``w2i['pad']``. The input lists are not modified, so the
    function can be called again (e.g. with another batch size) on the same data.

    Args:
        x: list of sentences, each a sequence of integer word indices.
        y: list of integer class labels, aligned with ``x``.
        w2i: word-to-index mapping; must contain the key ``'pad'``.
        batch_size: maximum number of sentences per batch; the last batch is
            smaller when ``len(x)`` is not a multiple of ``batch_size``.

    Returns:
        A tuple ``(batches_x, batches_y)`` of two lists of ``torch.long``
        tensors. ``batches_x[k]`` has shape (sentences_in_batch, longest_sentence)
        and ``batches_y[k]`` has shape (sentences_in_batch,).
    """
    # NOTE(review): confirm that 'pad' is the loader's dedicated padding symbol
    # and not just the ordinary word "pad".
    pad_index = w2i['pad']

    batches_x = []
    batches_y = []

    # step over x with steps of batch_size
    for start in range(0, len(x), batch_size):
        end = start + batch_size
        batch_x = x[start:end]
        batch_y = y[start:end]

        # The target length is a property of the batch, so compute it once.
        longest_sentence = max(len(sentence) for sentence in batch_x)

        # Build new lists rather than extending the originals in place, which
        # would leave the padding inside the caller's data.
        padded = [
            list(sentence) + [pad_index] * (longest_sentence - len(sentence))
            for sentence in batch_x
        ]

        batches_x.append(torch.tensor(padded, dtype = torch.long))
        batches_y.append(torch.tensor(batch_y, dtype = torch.long))

    return batches_x, batches_y
                

In [18]:
# Group the training reviews and labels into padded mini-batches.
batch_size = 16
batches_x, batches_y = padding(x=x_train, y=y_train, w2i=w2i, batch_size=batch_size)

In [19]:
class MLP(torch.nn.Module):
    """Sequence classifier: embedding -> linear + ReLU -> max over time -> linear.

    Args:
        w2i: word-to-index mapping; only its length is used, as the number of
            rows of the embedding table.
        embedding_dim: size of each word embedding.
        hidden_size: number of units in the hidden layer.
        num_classes: number of output scores per sequence (default 2, the
            value that was previously hard-coded).
    """

    def __init__(self, w2i, embedding_dim = 300, hidden_size = 300, num_classes = 2):
        super().__init__()
        num_embeddings = len(w2i)
        self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.hidden = torch.nn.Linear(embedding_dim, hidden_size)
        self.output = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Compute class scores for a batch of index sequences.

        Args:
            x: integer tensor of word indices with shape (batch, sequence).

        Returns:
            Tensor of unnormalised class scores with shape (batch, num_classes).
        """
        emb = self.embedding(x)                            # (batch, sequence, embedding_dim)
        h = torch.nn.functional.relu(self.hidden(emb))     # (batch, sequence, hidden_size)
        # Pool over the sequence dimension so every sequence yields one vector.
        # No mask is applied, so padded positions take part in the maximum too.
        o, _ = torch.max(h, dim=1)                         # (batch, hidden_size)
        return self.output(o)
    

In [27]:
def train(batches_x, batches_y, model, epochs = 5, optimizer = 'Adam', lr=0.001):
    """Train ``model`` with cross-entropy loss on pre-batched data.

    The batch index is printed every 100 batches, and after each epoch the mean
    batch loss and mean batch accuracy are printed.

    Args:
        batches_x: list of input batches, each passed to ``model`` as is.
        batches_y: list of target tensors; ``batches_y[i]`` holds the class
            labels for ``batches_x[i]``.
        model: the ``torch.nn.Module`` to train; it is updated in place.
        epochs: number of passes over all batches.
        optimizer: name of the optimizer, either ``'Adam'`` or ``'SGD'``.
        lr: learning rate handed to the optimizer.

    Raises:
        ValueError: if ``optimizer`` is not one of the supported names.
    """
    # Keep the optimizer object under its own name rather than rebinding the
    # string argument.
    if optimizer == 'Adam':
        optim = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer == 'SGD':
        optim = torch.optim.SGD(model.parameters(), lr=lr)
    else:
        # Fail early with a clear message, not later on `.step()`.
        raise ValueError("optimizer must be 'Adam' or 'SGD', got %r" % (optimizer,))

    model.train()

    for epoch in range(epochs):

        losses = []
        accuracies = []
        for i, batch in enumerate(batches_x):
            if i % 100 == 0: print(i)
            targets = batches_y[i]

            # Gradients accumulate across backward() calls, so clear them first;
            # otherwise each step would use the sum of all previous gradients.
            optim.zero_grad()
            scores = model(batch)
            loss = torch.nn.functional.cross_entropy(scores, targets)
            loss.backward()
            optim.step()
            losses.append(loss.item())

            # get index of the max value, i.e. the predicted class
            predicted_y = scores.argmax(dim=1)

            # calculate accuracy: number of correct predictions / number of predictions
            n_correct = (predicted_y == targets).sum().item()
            accuracies.append(n_correct / len(predicted_y))

        print('Epoch: ', epoch, 'Loss: ', np.mean(losses), 'Accuracy: ', np.mean(accuracies))

In [28]:
# Train a fresh MLP with its default layer sizes for 10 epochs using Adam.
model = MLP(w2i)
train(
    batches_x,
    batches_y,
    model,
    epochs=10,
    optimizer='Adam',
    lr=0.001,
)

0
100
200
300
400
500
600


KeyboardInterrupt: 