In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
from classes import biLM, ELMo
from data_utils import Vocab, batcher, data_loader, make_confusion_matrix, vocab

In [3]:
data_file = 'sentiment/data.txt'
embedding_file = 'glove.6B.50d.txt'

In [4]:
import re
data = data_loader(data_file, vocab, print_every=500)

words, sentiment = ['later', 'i', 'found', 'myself', 'lost', 'in', 'the', 'power', 'of', 'the', 'film'] 1
X, Y = tensor([ 168,   41,  238, 3261,  402,    6,    0,  268,    3,    0,  319],
       device='cuda:0') tensor(1, device='cuda:0')
words, sentiment = ['all', 'in', 'all', 'its', 'an', 'insult', 'to', "one's", 'intelligence', 'and', 'a', 'huge', 'waste', 'of', 'money'] 0
X, Y = tensor([    64,      6,     64,     47,     29,  12965,      4, 400001,   1226,
             5,      7,   1324,   3631,      3,    308], device='cuda:0') tensor(0, device='cuda:0')
words, sentiment = ['well', 'im', 'satisfied'] 1
X, Y = tensor([  143, 14663,  5456], device='cuda:0') tensor(1, device='cuda:0')
words, sentiment = ['you', 'can', 'not', 'answer', 'calls', 'with', 'the', 'unit', 'never', 'worked', 'once'] 0
X, Y = tensor([  81,   86,   36, 2168,  971,   17,    0, 1207,  332,  762,  442],
       device='cuda:0') tensor(0, device='cuda:0')
words, sentiment = ['waitress', 'was', 'sweet', 'and', 'fu

In [23]:
class Task(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, embedding, use_ELMo=True, n_layers=1, bidirectional=True, dropout=0.5):
        super(Task, self).__init__()
        USE_CUDA = torch.cuda.is_available()
        self.device = torch.device("cuda" if USE_CUDA else "cpu")
        self.embedding = embedding
        self.use_ELMo = use_ELMo
        self.drop1 = nn.Dropout(p=dropout)
        self.output_dim = output_dim
        if bidirectional:
            self.num_dir = 2
        else:
            self.num_dir = 1
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, dropout=(0 if n_layers==1 else dropout), bidirectional=bidirectional)
        self.drop2 = nn.Dropout(p=dropout)
        self.dense = nn.Linear(hidden_dim*self.num_dir, output_dim)
        self.soft = nn.Softmax(dim=-1)
        
    def forward(self, input_seq, input_lengths, mask):
        input_seq = input_seq.t()
        mask = mask.t()
        if self.use_ELMo:
#             flg = True
#             if flg:
#                 flg = False
#                 print(input_seq.size(), input_lengths.size(), mask.size())
            embedded, _ = self.embedding(input_seq, input_lengths, mask)
        else:
            embedded = self.embedding(input_seq)
        outputs = torch.zeros(embedded.size()[1], self.output_dim, device=self.device)
        for batch in range(embedded.size()[1]):
            dropped1 = self.drop1(embedded[:,batch,:])
            out, _ = self.lstm(torch.unsqueeze(dropped1[:input_lengths[batch], :], 1))
            dropped2 = self.drop2(out[-1,0,:])
            densed = self.dense(dropped2)
            softed = self.soft(densed)
            outputs[batch, :] = softed
        return outputs

In [6]:
MAX_LEN = 15
batch_size = 256
test_split_ratio = 0.25
input_dim = 50
hidden_dim = 25
output_dim = 2

In [7]:
train, test = batcher(data, MAX_LEN, batch_size, test_ratio=test_split_ratio)

In [24]:
model = Task(input_dim, hidden_dim, output_dim, vocab.embedding, use_ELMo=False, dropout=0.4)

In [8]:
learning_rate = 0.002
clip = 50.0
n_epochs = 32

In [25]:
model_optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [14]:
def evaluate(model, test):
    with torch.no_grad():
        model = model.to(device)
        model.eval()
        
        test_true = []
        test_pred = []

        for batch in test:
            test_X, test_Y, mask, lengths = batch
            test_X = test_X.to(device)
            test_Y = test_Y.to(device)
            mask = mask.to(device)
            lengths = torch.tensor(lengths, device=device)

            pred_prob = model(test_X.t(), lengths, mask.t())
            pred_cls = torch.argmax(pred_prob, -1)
            
            test_pred += pred_cls.tolist()
            test_true += test_Y.tolist()
            
        test_mat = make_confusion_matrix(test_true, test_pred)
        
        total_pred = np.sum(test_mat)
        true_pred = sum([test_mat[i][i] for i in range(test_mat.shape[0])])
        
        print("test acc =", true_pred / total_pred)
        
    return test_mat, test_pred, test_true

In [15]:
def train_model(model, model_optimizer, train, test, n_epochs, clip, device):
    from random import shuffle
    
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
  # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = nn.DataParallel(model)

    model.to(device)
    
    train_true = []
    train_pred = []
    train_loss = []
    
    for epoch in range(n_epochs):
        curr_true = []
        curr_pred = []
        model.train()
        
        for batch in train:
            
            model_optimizer.zero_grad()
            
            train_X, train_Y, mask, lengths = batch
            train_X = train_X.to(device)
            train_Y = train_Y.to(device)
            mask = mask.to(device)
            lengths = torch.tensor(lengths, device=device)
            
            pred_prob = model(train_X.t(), lengths, mask.t())
            
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(pred_prob, train_Y)
            loss = loss.to(device)
            train_loss.append(loss.item() * lengths.size()[0])
            
            loss.backward(retain_graph=True)
            
            _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
            model_optimizer.step()
            
            pred_cls = torch.argmax(pred_prob, -1)
            
            curr_pred += pred_cls.tolist()
            curr_true += train_Y.tolist()
        
        train_true.append(curr_true)
        train_pred.append(curr_true)
        train_mat = make_confusion_matrix(curr_true, curr_pred)
        
        total_pred = np.sum(train_mat)
        true_pred = sum([train_mat[i][i] for i in range(train_mat.shape[0])])
        
        cur_loss = sum(train_loss[-len(train):]) / total_pred
        
        print("epoch =",epoch, "loss = ", cur_loss.item(), "train acc =", true_pred / total_pred)
        
        evaluate(model, test[:4])
        shuffle(test)
        shuffle(train)
    return train_loss

In [26]:
train_loss_without_ELMo = train_model(model, model_optimizer, train, test, n_epochs, clip, device)

Let's use 4 GPUs!
epoch = 0 loss =  0.6941838192939759 train acc = 0.5111111111111111
test acc = 0.5106666666666667
epoch = 1 loss =  0.690430541197459 train acc = 0.532
test acc = 0.512
epoch = 2 loss =  0.6904797972043355 train acc = 0.5262222222222223
test acc = 0.5093333333333333
epoch = 3 loss =  0.6874054523044162 train acc = 0.548
test acc = 0.508
epoch = 4 loss =  0.6874714207119412 train acc = 0.5471111111111111
test acc = 0.5186666666666667
epoch = 5 loss =  0.6819676832093133 train acc = 0.5737777777777778
test acc = 0.516
epoch = 6 loss =  0.6815665980445014 train acc = 0.5773333333333334
test acc = 0.5266666666666666
epoch = 7 loss =  0.6759660246107313 train acc = 0.5835555555555556
test acc = 0.5373333333333333
epoch = 8 loss =  0.6703267672326829 train acc = 0.5857777777777777
test acc = 0.5693333333333334
epoch = 9 loss =  0.6669372226397197 train acc = 0.6008888888888889
test acc = 0.56
epoch = 10 loss =  0.6603947127130296 train acc = 0.6186666666666667
test acc = 0.

In [27]:
test_mat_w, test_pred_w, test_true_w = evaluate(model, test)

test acc = 0.6493333333333333


In [16]:
model_with_ELMo = Task(input_dim,
                       hidden_dim,
                       output_dim,
                       ELMo(input_dim, vocab.embedding, n_layers=2, dropout=0.4),
                       use_ELMo=True,
                       dropout=0.4)

In [17]:
model_opt = optim.Adam(model_with_ELMo.parameters(), lr=learning_rate)

In [18]:
train_loss = train_model(model_with_ELMo, model_opt, train, test, n_epochs, clip, device)

Let's use 4 GPUs!
epoch = 0 loss =  0.6932003099123637 train acc = 0.5017777777777778
test acc = 0.46
epoch = 1 loss =  0.6926024600134956 train acc = 0.5213333333333333
test acc = 0.4613333333333333
epoch = 2 loss =  0.6918900816175673 train acc = 0.5204444444444445
test acc = 0.464
epoch = 3 loss =  0.6901715452406142 train acc = 0.5453333333333333
test acc = 0.5173333333333333
epoch = 4 loss =  0.6839425075319078 train acc = 0.5782222222222222
test acc = 0.5373333333333333
epoch = 5 loss =  0.6759990796513028 train acc = 0.604
test acc = 0.5506666666666666
epoch = 6 loss =  0.6707493426005046 train acc = 0.6004444444444444
test acc = 0.532
epoch = 7 loss =  0.6687986068195767 train acc = 0.5937777777777777
test acc = 0.584
epoch = 8 loss =  0.6602253426445855 train acc = 0.6102222222222222
test acc = 0.576
epoch = 9 loss =  0.6604879176351759 train acc = 0.6035555555555555
test acc = 0.5733333333333334
epoch = 10 loss =  0.6479963329633077 train acc = 0.6342222222222222
test acc = 0

In [19]:
test_mat, test_pred, test_true = evaluate(model_with_ELMo, test)

test acc = 0.7013333333333334
