In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
class Vocab(object):
    
    def __init__(self, filename):
        self.idx_to_word = {}
        self.word_to_idx = {}
        self.filename = filename
#         self.num_words = num_words
        self.unk_vec = None
        self.dim = None
        
        USE_CUDA = torch.cuda.is_available()
        self.device = torch.device("cuda" if USE_CUDA else "cpu")
        with open(filename) as f:
            idx = 0
            for line in f:
                line = line.split()
                self.idx_to_word[idx] = line[0]
                self.word_to_idx[line[0]] = idx
                if not self.dim:
                    self.dim = len(line[1:])
                idx += 1
        
        self.embedding_matrix = torch.zeros(len(self.idx_to_word)+2, self.dim, device=self.device)
        
        with open(filename) as f:
            idx = 1;
            for line in f:
                line = line.split()
                self.embedding_matrix[idx] = torch.tensor(list(map(float, line[1:])), device=self.device)
                idx += 1
            self.unk_vec = torch.sum(self.embedding_matrix, 0)/(len(self.idx_to_word))
            self.embedding_matrix[len(self.idx_to_word)+1] = self.unk_vec
        
    def embedding(self, input_seq):
        MAX_LEN = input_seq.size()[0]
        batch_size = input_seq.size()[1]
        embedded = torch.zeros(MAX_LEN, batch_size, self.dim, device=self.device)
        for i in range(MAX_LEN):
            for j in range(batch_size):
                embedded[i,j,:] = self.embedding_matrix[input_seq[i, j]]
        return embedded
    
    def encode(self, sentence):
        encoded = torch.zeros(len(sentence), dtype=torch.long, device=self.device)
        idx=0
        for word in sentence:
            if word in self.word_to_idx:
                encoded[idx] = self.word_to_idx[word]
            else:
                encoded[idx] = len(self.word_to_idx)+1
            idx += 1
        
        return encoded
    
    def decode(self, sentence):
        decoded = torch.zeros(*sentence.size(), device=self.device)
        idx = 0
        for word in sentence:
            decoded[idx] = self.idx_to_word[word]
        
        return decoded
    

In [3]:
def batcher(list_sentence, MAX_LEN, batch_size, test_ratio=0.2):
    
    USE_CUDA = torch.cuda.is_available()
    device = torch.device("cuda" if USE_CUDA else "cpu")
    total = len(list_sentence)
    train_len = total*(1-test_ratio)
    test_len = total - train_len
    train_sentence = list_sentence[:int(train_len)]
    test_sentence = list_sentence[int(train_len):]
    train = []
    test = []
    idx = int(0)
    
    while True:
        pos = int(idx)
        if pos == train_len:
            break;
        if (idx + batch_size <= train_len):
            next_pos = int(idx+batch_size)
        else:
            next_pos = train_len
        t_batch = torch.zeros(MAX_LEN, int(next_pos - pos), dtype=torch.long, device=device)
        tmp_batch = train_sentence[int(pos):int(next_pos)]
        target = torch.zeros(int(next_pos - pos), dtype=torch.long, device=device)
        length = []
        mask = torch.zeros(MAX_LEN, int(next_pos-pos), dtype=torch.long, device=device)
        for batch_n in range(int(next_pos - pos)):
            l = MAX_LEN if (len(tmp_batch[batch_n])-1) > MAX_LEN else (len(tmp_batch[batch_n])-1)
            length.append(l)
            target[batch_n] = tmp_batch[batch_n][-1]
            t_batch[:l, batch_n] = tmp_batch[batch_n][:l]
            mask[:l, batch_n] = torch.ones(l)
        train.append((t_batch, target, mask, length))
        idx = next_pos
        
    idx = int(0)
    
    while True:
        pos = idx
        if pos == test_len:
            break;
        if (idx + batch_size <= test_len):
            next_pos = idx+batch_size
        else:
            next_pos = test_len
        t_batch = torch.zeros(MAX_LEN, int(next_pos - pos), dtype=torch.long, device=device)
        tmp_batch = test_sentence[int(pos):int(next_pos)]
        target = torch.zeros(int(next_pos - pos), dtype=torch.long, device=device)
        length = []
        mask = torch.zeros(MAX_LEN, int(next_pos-pos), dtype=torch.long, device=device)
        for batch_n in range(int(next_pos - pos)):
            l = MAX_LEN if (len(tmp_batch[batch_n])-1) > MAX_LEN else (len(tmp_batch[batch_n])-1)
            length.append(l)
            target[batch_n] = tmp_batch[batch_n][-1]
            t_batch[:l, batch_n] = tmp_batch[batch_n][:l]
            mask[:l, batch_n] = torch.ones(l)
        test.append((t_batch, target, mask, length))
        idx = next_pos
        
    return train, test


In [4]:
import re

In [5]:
data_file = 'sentiment/imdb_labelled.txt'
embedding_file = 'glove.6B.50d.txt'

X = []
vocab = Vocab(embedding_file)


In [6]:
import io
X = []
with io.open(data_file, encoding='utf-8', errors='ignore') as f:
    idx = 0
    for line in f:
        line = line.lower()
        if idx%18 == 0:
            print(line)
        line = re.sub(r"[()\"#/@;:<>{}`+=~|.!?,]", "", line)
        if idx%18 == 0:
            print(line)
        new_line = line.split()
        new_line = [word.strip() for word in new_line if word != '']
        tokenized_line = vocab.encode(new_line[:-1])
#         print(new_line[-1])
        X.append(torch.cat((tokenized_line, torch.tensor([float(new_line[-1])], dtype=torch.long, device=device)), dim=0))
        idx += 1
# train, test = batcher(X, 15, 64)

a very, very, very slow-moving, aimless movie about a distressed, drifting young man.  	0

a very very very slow-moving aimless movie about a distressed drifting young man  	0

it's practically perfect in all of them  a true masterpiece in a sea of faux "masterpieces.  	1

it's practically perfect in all of them  a true masterpiece in a sea of faux masterpieces  	1

this game rocks.  	1

this game rocks  	1

long, whiny and pointless.  	0

long whiny and pointless  	0

it crackles with an unpredictable, youthful energy - but honestly, i found it hard to follow and concentrate on it meanders so badly.  	0

it crackles with an unpredictable youthful energy - but honestly i found it hard to follow and concentrate on it meanders so badly  	0

definitely worth checking out.  	1

definitely worth checking out  	1

an hour and a half i wish i could bring back.  	0

an hour and a half i wish i could bring back  	0

i liked this movie way too much.  	1

i liked this movie way too much  	1

th

In [7]:
train, test = batcher(X, 15, 64)

In [8]:
X[0]

tensor([    7,   191,   191,   191, 81023, 57837,  1005,    59,     7, 16108,
        17674,   461,   300,     0], device='cuda:0')

In [9]:
print(train[0][0][:,:3], train[0][1][:3], train[0][2][:,:3], train[0][3][:3])

tensor([[     7,     36,   4997],
        [   191,   1085, 307248],
        [   191,     38,     17],
        [   191,     15,    521],
        [ 81023,     56,   1161],
        [ 57837,    402,    298],
        [  1005,     11,      5],
        [    59,      0,  11114],
        [     7,   2983,   3534],
        [ 16108,   2153,  13352],
        [ 17674,     46,      0],
        [   461,      0,   1005],
        [   300,   2052,   4254],
        [     0,    573,     11],
        [     0,    343,    302]], device='cuda:0') tensor([0, 0, 0], device='cuda:0') tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [0, 1, 1],
        [0, 1, 1]], device='cuda:0') [13, 15, 15]


In [18]:
from classes import ELMo, biLM

In [19]:
model_ELMo = ELMo(50, vocab.embedding, n_layers=2)

In [20]:
class Task(nn.Module):
    def __init__(self, input_size, output_size):
        super(Task, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.layer = nn.LSTM(input_size, output_size)
        
    def forward(self, input_seq, input_lengths):
        
        batch_size = input_seq.size()[1]
        outputs = torch.zeros(batch_size, self.output_size)
        for i in range(batch_size):
#             hidden = (torch.zeros(), None)
            output, _ = self.layer(input_seq[:input_lengths[i], i, :].unsqueeze(0))
            outputs[i] = output[-1,0,:]
        return outputs
    

In [21]:
model_task = Task(50, 2)


In [22]:
model_ELMo = model_ELMo.to(device)
model_task = model_task.to(device)

In [23]:
clip = 50.0
learning_rate = 0.002
n_iteration = 30
print_every = 1

model_ELMo.train()
model_task.train()

ELMo_opt = optim.Adam(model_ELMo.parameters(), lr=learning_rate)
task_opt = optim.Adam(model_task.parameters(), lr=learning_rate)


In [24]:
list_loss = []
accuracy = []

def make_confusion_matrix(true, pred):
    K = len(np.unique(true)) # Number of classes 
    result = np.zeros((K, K))

    for i in range(len(true)):
        result[true[i]][pred[i]] += 1

    return result
train_pred = []
train_true = []
test_pred = []
test_true = []

In [25]:
n_iteration = 10
for iteration in range(n_iteration):
    for batch in train:
        ELMo_opt.zero_grad()
        task_opt.zero_grad()
        
        train_X, train_Y, mask, lengths = batch
        train_X = train_X.to(device)
        train_Y = train_Y.to(device)
        mask = mask.to(device)
        lengths = torch.tensor(lengths)
        lengths = lengths.to(device)
        
        ELMo_embedding, _ = model_ELMo(train_X, lengths, mask)
        predictions = model_task(ELMo_embedding, lengths)
        predictions = predictions.to(device)
        lossF = nn.CrossEntropyLoss()
        loss = lossF(predictions, train_Y)
        loss = loss.to(device)
        pred_class = torch.argmax(predictions, dim=1)
        correct_guess = sum([1 if pred == act else 0 for pred, act in zip(pred_class, train_Y)])
        accuracy.append((correct_guess, lengths.size()[0]))
        loss.backward(retain_graph=True)
        _ = nn.utils.clip_grad_norm_(model_ELMo.parameters(), clip)
        _ = nn.utils.clip_grad_norm_(model_task.parameters(), clip)
        train_pred += pred_class.tolist()
        train_true += train_Y.tolist()
        ELMo_opt.step()
        task_opt.step()
        list_loss.append(loss.item())
    total_correct = sum([tp[0] for tp in accuracy[-(len(train)):]])
    total_preds = sum([tp[1] for tp in accuracy[-(len(train)):]])
    cur_acc = total_correct/total_preds
    loss_over_data = sum(list_loss[-(len(train)):])/len(train)
    print("Loss for iteration = ", iteration, "is ", loss_over_data, "and accuracy is = ", cur_acc)

Loss for iteration =  0 is  0.695381687238 and accuracy is =  0.52125
Loss for iteration =  1 is  0.690268745789 and accuracy is =  0.5425
Loss for iteration =  2 is  0.687534213066 and accuracy is =  0.5625
Loss for iteration =  3 is  0.684150232719 and accuracy is =  0.575
Loss for iteration =  4 is  0.678868206648 and accuracy is =  0.5875
Loss for iteration =  5 is  0.669945693933 and accuracy is =  0.6275
Loss for iteration =  6 is  0.659065150298 and accuracy is =  0.635
Loss for iteration =  7 is  0.631867903929 and accuracy is =  0.68125
Loss for iteration =  8 is  0.604190033216 and accuracy is =  0.70875
Loss for iteration =  9 is  0.569348660799 and accuracy is =  0.74125


In [28]:
train_mat = make_confusion_matrix(train_true[-800:], train_pred[-800:])

In [29]:
for i in range(2):
    print(train_mat[i][0], train_mat[i][1])

304.0 110.0
97.0 289.0


In [30]:
test_accuracy = []
for batch_n in range(len(test)):
    test_X, test_Y, mask, lengths = test[batch_n]
    test_X = test_X.to(device)
    test_Y = test_Y.to(device)
    mask = mask.to(device)
    lengths = torch.tensor(lengths, device=device)
    
    ELMo_embedding, _ = model_ELMo(test_X, lengths, mask)
    predictions = model_task(ELMo_embedding, lengths)
    predictions = predictions.to(device)
    pred_class = torch.argmax(predictions, dim=1)
    test_true += test_Y.tolist()
    test_pred += pred_class.tolist()
    correct_guess = sum([1 if pred == act else 0 for pred, act in zip(pred_class, test_Y)])
    test_accuracy.append((correct_guess, lengths.size()[0]))
total_correct = sum([tp[0] for tp in test_accuracy[-(len(test)):]])
total_preds = sum([tp[1] for tp in test_accuracy[-(len(test)):]])

print("test accuracy = ", total_correct/total_preds)

test accuracy =  0.57
