In [1]:
from collections import OrderedDict
from data_iterator import TextIterator
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import pickle as pkl
import numpy
import torch
import torch.nn as nn
import torch.nn.functional as F

num_words = 100140
batch_size=32
valid_batch_size=32
dim_word = 300
char_nout = 100
dim_char_emb = 15
learning_rate = 0.01
dim_hidden = 600

In [2]:
temp = Variable(torch.Tensor([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]]))

y = [1]
Variable(torch.Tensor(y))

Variable containing:
 1
[torch.FloatTensor of size 1]

In [3]:
def str2list(s):
    alphabet = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    l = len(s)
    ans = []
    for i in range(0, l):
        a = alphabet.find(s[i])
        if a >= 0:
            ans.append(a)
        else:
            ans.append(0)
            #print(s[i])
    return ans

In [4]:
def prepare_data(seqs_x, seqs_y, labels, worddicts_r, maxlen=None):
    # x: a list of sentences
    lengths_x = [len(s) for s in seqs_x]
    lengths_y = [len(s) for s in seqs_y]

    if maxlen is not None:
        new_seqs_x = []
        new_seqs_y = []
        new_lengths_x = []
        new_lengths_y = []
        new_labels = []
        for l_x, s_x, l_y, s_y, l in zip(lengths_x, seqs_x, lengths_y, seqs_y, labels):
            if l_x < maxlen and l_y < maxlen:
                new_seqs_x.append(s_x)
                new_lengths_x.append(l_x)
                new_seqs_y.append(s_y)
                new_lengths_y.append(l_y)
                new_labels.append(l)
        lengths_x = new_lengths_x
        seqs_x = new_seqs_x
        lengths_y = new_lengths_y
        seqs_y = new_seqs_y
        labels = new_labels

        if len(lengths_x) < 1 or len(lengths_y) < 1:
            return None

    max_char_len_x = 0
    max_char_len_y = 0
    seqs_x_char = []
    l_seqs_x_char = []
    seqs_y_char = []
    l_seqs_y_char = []

    for idx, [s_x, s_y, s_l] in enumerate(zip(seqs_x, seqs_y, labels)):
        temp_seqs_x_char = []
        temp_l_seqs_x_char = []
        temp_seqs_y_char = []
        temp_l_seqs_y_char = []
        for w_x in s_x:
            word = worddicts_r[w_x]
            word_list = str2list(word)
            l_word_list = len(word_list)
            temp_seqs_x_char.append(word_list)
            temp_l_seqs_x_char.append(l_word_list)
            if l_word_list >= max_char_len_x:
                max_char_len_x = l_word_list
        for w_y in s_y:
            word = worddicts_r[w_y]
            word_list = str2list(word)
            l_word_list = len(word_list)
            temp_seqs_y_char.append(word_list)
            temp_l_seqs_y_char.append(l_word_list)
            if l_word_list >= max_char_len_y:
                max_char_len_y = l_word_list

        seqs_x_char.append(temp_seqs_x_char)
        l_seqs_x_char.append(temp_l_seqs_x_char)
        seqs_y_char.append(temp_seqs_y_char)
        l_seqs_y_char.append(temp_l_seqs_y_char)

    n_samples = len(seqs_x)
    maxlen_x = numpy.max(lengths_x)
    maxlen_y = numpy.max(lengths_y)

    x = numpy.zeros((maxlen_x, n_samples)).astype('int64')
    y = numpy.zeros((maxlen_y, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32')
    y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32')
    l = numpy.zeros((n_samples,)).astype('int64')
    char_x = numpy.zeros((maxlen_x, n_samples, max_char_len_x)).astype('int64')
    char_x_mask = numpy.zeros((maxlen_x, n_samples, max_char_len_x)).astype('float32')
    char_y = numpy.zeros((maxlen_y, n_samples, max_char_len_y)).astype('int64')
    char_y_mask = numpy.zeros((maxlen_y, n_samples, max_char_len_y)).astype('float32')

    for idx, [s_x, s_y, ll] in enumerate(zip(seqs_x, seqs_y, labels)):
        x[:lengths_x[idx], idx] = s_x
        x_mask[:lengths_x[idx], idx] = 1.
        y[:lengths_y[idx], idx] = s_y
        y_mask[:lengths_y[idx], idx] = 1.
        l[idx] = ll

        for j in range(0, lengths_x[idx]):
            char_x[j, idx, :l_seqs_x_char[idx][j]] = seqs_x_char[idx][j]
            char_x_mask[j, idx, :l_seqs_x_char[idx][j]] = 1.
        for j in range(0, lengths_y[idx]):
            char_y[j, idx, :l_seqs_y_char[idx][j]] = seqs_y_char[idx][j]
            char_y_mask[j, idx, :l_seqs_y_char[idx][j]] = 1.

    return x, x_mask, char_x, char_x_mask, y, y_mask, char_y, char_y_mask, l


In [5]:
class LSTM(nn.Module):
    def __init__(self, nin, dim):
        super(LSTM, self).__init__()
        
        # input weights
        self.W = Parameter(torch.Tensor(nin, 4*dim))
        
        # for the previous hidden activation
        self.U = Parameter(torch.Tensor(dim, 4*dim))

        self.b = Parameter(torch.Tensor(4*dim,))
        
        
        self.init_params(nin, dim)

    def init_params(self, nin, dim):
        self.W.data = torch.from_numpy(numpy.concatenate([self.norm_weight(nin, dim),
                               self.norm_weight(nin, dim),
                               self.norm_weight(nin, dim),
                               self.norm_weight(nin, dim)], axis=1))

        self.U.data = torch.from_numpy(numpy.concatenate([self.ortho_weight(dim),
                               self.ortho_weight(dim),
                               self.ortho_weight(dim),
                               self.ortho_weight(dim)], axis=1))
        
        self.b.data = torch.from_numpy(numpy.zeros((4 * dim,)).astype('float32'))
                                                        
    def slice_d(self, _x, n, dim):
        if len(_x.size()) == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        elif len(_x.size()) == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]


    # one time step of the lstm
    def step(self, m_, x_, h_, c_, dim):
        preact = torch.mm(h_, self.U)
        preact += x_


        i = F.sigmoid(self.slice_d(preact, 0, dim))
        f = F.sigmoid(self.slice_d(preact, 1, dim))
        o = F.sigmoid(self.slice_d(preact, 2, dim))
        c = F.tanh(self.slice_d(preact, 3, dim))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * F.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c, i, f, o, preact
    

    
    # This function implements the lstm fprop
    def forward(self, state_below, mask):
        nsteps = state_below.size(0)
        dim = self.U.size(0)

        n_samples = state_below.size(1)
        init_state = Variable(torch.zeros(n_samples, dim))
        init_memory = Variable(torch.zeros(n_samples, dim))
        state_below = torch.bmm(state_below, self.W.unsqueeze(0).expand(state_below.size(0), *self.W.size())) + self.b
        
        h_hist = []
        i_hist = []
        for current_step in range(nsteps):
            if(current_step==0):
                h_last, c_last, i_last,_,_,_ = self.step(mask[current_step], state_below[current_step], init_state, init_memory, dim)
            else :
                h_last, c_last, i_last,_,_,_ = self.step(mask[current_step], state_below[current_step], h_last, c_last, dim)

           
            h_hist.append(h_last[None, :, :])
            i_hist.append(i_last[None, :, :])

        return torch.cat(h_hist), torch.cat(i_hist)

    # use the slice to calculate all the different gates
    def norm_weight(self, nin, nout=None, scale=0.01, ortho=True):
        if nout is None:
            nout = nin
        if nout == nin and ortho:
            W = self.ortho_weight(nin)
        else:
            W = scale * numpy.random.randn(nin, nout)
        return W.astype('float32')

    def ortho_weight(self, ndim):
        W = numpy.random.randn(ndim, ndim)
        u, s, v = numpy.linalg.svd(W)
        return u.astype('float32')


In [9]:

class NLI(nn.Module):
    def __init__(self, dim_word, char_nout, dim_char_emb, word_embeddings_file, worddict, num_words, dim_hidden):
        super(NLI, self).__init__()
        self.dim_word = dim_word
        self.char_nout = char_nout
        self.dim_char_emb = dim_char_emb 
        self.char_k_cols = dim_char_emb
        self.char_k_rows=[1,3,5]
        self.hidden_size = dim_hidden
        self.word_embeddings = self.create_word_embeddings(word_embeddings_file, worddict, num_words, dim_word)
        
        dim_emb = dim_word + 3*char_nout
        self.alphabet = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
        self.filter1 = None
        self.filter2 = None
        self.filter3 = None
        
        self.LSTM1 = LSTM(dim_emb, dim_hidden)
        self.LSTM2 = LSTM(dim_emb+2*dim_hidden, dim_hidden)
        self.LSTM3 = LSTM(dim_emb+2*dim_hidden, dim_hidden)
        
        self.Linear1 = nn.Linear(24*dim_hidden, dim_hidden)
        self.Linear2 = nn.Linear(25*dim_hidden, dim_hidden)
        self.Linear3 = nn.Linear(dim_hidden, 3)

        self.init_weights(dim_hidden)

    def create_word_embeddings(self, file_name, worddicts, num_words, dim_word):
        word_embeddings = Variable(torch.from_numpy(self.norm_weight(num_words, dim_word)))
        
        with open(file_name, 'r') as f:
            for line in f:
                tmp = line.split()
                word = tmp[0]
                vector = tmp[1:]
                len_vec = len(vector)
                
                if(len_vec>300):
                    diff = len_vec-300
                    word = word.join(vector[:diff])
                    vector = vector[diff:]
                    
                    
                if word in worddicts and worddicts[word] < num_words:
                    vector = [float(x) for x in vector]
                    word_embeddings[worddicts[word], :] = torch.FloatTensor(vector[0:300])
            
        return word_embeddings
        
    def forward(self, premise, char_premise, premise_mask, char_premise_mask, hypothesis, char_hypothesis, hypothesis_mask, char_hypothesis_mask,l,y):
        #premise = number of words * number of samples. Also hypothesis = number of words * number of samples
        n_timesteps_premise = premise.size(0)
        n_timesteps_hypothesis = hypothesis.size(0)
        n_samples = premise.size(1)
        
        premise_char_vector = self.compute_character_embeddings(char_premise, n_timesteps_premise, n_samples, char_premise_mask)
        hypothesis_char_vector = self.compute_character_embeddings(char_hypothesis, n_timesteps_hypothesis, n_samples, char_hypothesis_mask)

        premise_word_emb = self.word_embeddings[premise.view(-1)].view(n_timesteps_premise, n_samples, self.dim_word)
        hypothesis_word_emb = self.word_embeddings[hypothesis.view(-1)].view(n_timesteps_hypothesis, n_samples, self.dim_word)
        
        hypothesis_emb = torch.cat([hypothesis_word_emb, hypothesis_char_vector], 2)
        premise_emb = torch.cat([premise_word_emb, premise_char_vector], 2)
        
    
        premise_seq, premise_rev_seq = self.sequence_encoder(premise_emb, premise_mask)
        hypothesis_seq, hypothesis_rev_seq = self.sequence_encoder(hypothesis_emb, hypothesis_mask)
        
        premise_comp = self.make_composite_vector(premise_seq, premise_rev_seq, premise_mask)
        hypothesis_comp = self.make_composite_vector(hypothesis_seq, hypothesis_rev_seq, hypothesis_mask)
    
        logit_0 = torch.cat([premise_comp, hypothesis_comp, torch.abs(premise_comp - hypothesis_comp), premise_comp * hypothesis_comp],1)
        logit = F.relu(self.Linear1(logit_0))
        logit = torch.cat([logit_0, logit], 1)
        logit = F.relu(self.Linear2(logit))
        logit = self.Linear3(logit)
        probs = F.softmax(logit)
        
        '''

        '''
        return probs
    
    def sequence_encoder(self, emb, mask):
        reverse_emb = self.reverseTensor(emb)
        reverse_mask = self.reverseTensor(mask)
        
        #  LSTM1
        seq1 = self.LSTM1(emb, mask)
        seq_reverse1 = self.LSTM1(reverse_emb, reverse_mask)
        
        inp_seq2 = torch.cat([seq1[0], self.reverseTensor(seq_reverse1[0])], len(seq1[0].size()) - 1)
        inp_seq2 = torch.cat([inp_seq2,emb],2)
        reverse_inp_seq2 = self.reverseTensor(inp_seq2)
        
        #  LSTM2
        seq2 = self.LSTM2(inp_seq2, mask)
        seq_reverse2 = self.LSTM2(reverse_inp_seq2, reverse_mask)
        
        inp_seq3 = torch.cat([seq2[0], self.reverseTensor(seq_reverse2[0])], len(seq2[0].size()) - 1)
        inp_seq3 = torch.cat([inp_seq3,emb],2)
        reverse_inp_seq3 = self.reverseTensor(inp_seq3)

        #  LSTM3
        seq3 = self.LSTM3(inp_seq3, mask)
        seq_reverse3 = self.LSTM3(reverse_inp_seq3, reverse_mask)
        
        return seq3,seq_reverse3
        
    def make_composite_vector(self, seq, seq_rev, mask):
        output = torch.cat([seq[0], self.reverseTensor(seq_rev[0])], len(seq[0].size()) - 1)
        
        gate = torch.cat([seq[1], self.reverseTensor(seq_rev[1])], len(seq[1].size()) - 1)
        gate = gate.norm(2, 2)

        mean = (output * mask[:, :, None]).sum(0) / mask.sum(0)[:, None]
        maxi = (output * mask[:, :, None]).max(0)[0]
        gate_2 = (output * gate[:, :, None] * mask[:, :, None]).sum(0) / (gate[:, :, None] * mask[:, :, None]).sum(0)
        rep = torch.cat([mean, maxi, gate_2],1)
        return rep

    def reverseTensor(self, tensor):
        idx = [i for i in range(tensor.size(0)-1, -1, -1)]
        idx = Variable(torch.LongTensor(idx))
        inverted_tensor = tensor.index_select(0, idx)
        return inverted_tensor 
        
    def compute_character_embeddings(self, chars_word, n_timesteps, num_samples, char_mask):
        emb_char = self.Charemb[chars_word.view(-1)].view(n_timesteps, num_samples, chars_word.size(2), self.dim_char_emb)
        emb_char = emb_char * char_mask[:, :, :, None]
        emb_char_inp = emb_char.view(n_timesteps * num_samples, 1, chars_word.size(2), self.dim_char_emb)

        char_level_emb1 = self.apply_filter_and_get_char_embedding(self.filter1, emb_char_inp, num_samples, n_timesteps)
        char_level_emb2 = self.apply_filter_and_get_char_embedding(self.filter2, emb_char_inp, num_samples, n_timesteps)
        char_level_emb3 = self.apply_filter_and_get_char_embedding(self.filter3, emb_char_inp, num_samples, n_timesteps)
        
        emb_chars = [char_level_emb1, char_level_emb2, char_level_emb3]
        emb_char = torch.cat(emb_chars,2)
        return emb_char
    
    def apply_filter_and_get_char_embedding(self, filter_type, emb_char_inp, n_samples, n_timesteps):
        emb_char = F.conv2d(emb_char_inp, filter_type)
        emb_char = F.relu(emb_char)
        emb_char = emb_char.view(n_timesteps * n_samples, self.char_nout, emb_char.size(2))
        emb_char = emb_char.max(2)[0]
        emb_char = emb_char.view(n_timesteps, n_samples, self.char_nout)
        return emb_char
        
        
    def init_weights(self, dim):
        initrange = 0.1
        self.Charemb = Variable(torch.from_numpy(self.norm_weight(len(self.alphabet) + 1, self.dim_char_emb)), requires_grad = True)
        self.filter1 = self.getFilter(self.char_k_rows[0])
        self.filter2 = self.getFilter(self.char_k_rows[1])
        self.filter3 = self.getFilter(self.char_k_rows[2])
        
        
        self.Linear1.weight.data.uniform_(-initrange, initrange)
        self.Linear1.bias.data.fill_(0)
        self.Linear2.weight.data.uniform_(-initrange, initrange)
        self.Linear2.bias.data.fill_(0)
        self.Linear3.weight.data.uniform_(-initrange, initrange)
        self.Linear3.bias.data.fill_(0)


       
    def getFilter(self, char_k_row):
        w_shp = (self.char_nout, 1, char_k_row, self.char_k_cols)
        w_bound = numpy.sqrt(3 * char_k_row * self.char_k_cols)
        return Variable(torch.from_numpy(numpy.random.uniform(low=-1.0/w_bound, high=1.0/w_bound, size=w_shp).astype('float32')), requires_grad = True)



    def norm_weight(self, nin, nout=None, scale=0.01, ortho=True):
        if nout is None:
            nout = nin
        if nout == nin and ortho:
            W = self.ortho_weight(nin)
        else:
            W = scale * numpy.random.randn(nin, nout)
        return W.astype('float32')

    def ortho_weight(self, ndim):
        W = numpy.random.randn(ndim, ndim)
        u, s, v = numpy.linalg.svd(W)
        return u.astype('float32')



In [10]:
train = TextIterator('word_sequence/premise_multinli_1.0_train.txt',
                     'word_sequence/hypothesis_multinli_1.0_train.txt',
                     'word_sequence/label_multinli_1.0_train.txt',
                     'word_sequence/vocab_cased.pkl',
                      n_words=num_words,
                      batch_size=batch_size)
valid = TextIterator('word_sequence/premise_multinli_1.0_dev_matched.txt',
                     'word_sequence/hypothesis_multinli_1.0_dev_matched.txt',
                     'word_sequence/label_multinli_1.0_dev_matched.txt',
                     'word_sequence/vocab_cased.pkl',
                      n_words=num_words,
                      batch_size=valid_batch_size,
                      shuffle=False)

test = TextIterator('word_sequence/premise_multinli_1.0_dev_mismatched.txt',
                    'word_sequence/hypothesis_multinli_1.0_dev_mismatched.txt',
                    'word_sequence/label_multinli_1.0_dev_mismatched.txt',
                    'word_sequence/vocab_cased.pkl',
                    n_words=num_words,
                    batch_size=valid_batch_size,
                    shuffle=False)

with open('word_sequence/vocab_cased.pkl', 'rb') as f:
    worddicts = pkl.load(f)
worddicts_r = dict()

for kk, vv in worddicts.items():
    worddicts_r[vv] = kk


In [11]:

model = NLI(dim_word, char_nout, dim_char_emb, 'data/glove.840B.300d.txt', worddicts, num_words, dim_hidden)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

print('training')
loss = nn.CrossEntropyLoss() 

for x1, x2, y in train:

    premise, premise_mask, char_premise, char_premise_mask, hypothesis, hypothesis_mask, char_hypothesis, char_hypothesis_mask, l = prepare_data(x1,x2,y,worddicts_r)
    premise = Variable(torch.from_numpy(premise))
    premise_mask = Variable(torch.from_numpy(premise_mask))
    char_premise = Variable(torch.from_numpy(char_premise))
    char_premise_mask = Variable(torch.from_numpy(char_premise_mask))
    hypothesis = Variable(torch.from_numpy(hypothesis))
    hypothesis_mask = Variable(torch.from_numpy(hypothesis_mask))
    char_hypothesis = Variable(torch.from_numpy(char_hypothesis))
    char_hypothesis_mask = Variable(torch.from_numpy(char_hypothesis_mask))
    l = Variable(torch.from_numpy(l))
    y = [int(data) for data in y]
    y = Variable(torch.LongTensor(y).squeeze())
    optimizer.zero_grad()

    outputs = model(premise, char_premise, premise_mask, char_premise_mask, hypothesis, char_hypothesis, hypothesis_mask, char_hypothesis_mask,l,y)
    lossy = loss(outputs, y)
    print(lossy)
    lossy.backward()
    #torch.nn.utils.clip_grad_norm(model.parameters(), 5.0)
    optimizer.step()





training
Variable containing:
 1.1003
[torch.FloatTensor of size 1]

Variable containing:
 1.2077
[torch.FloatTensor of size 1]

Variable containing:
 1.0514
[torch.FloatTensor of size 1]

Variable containing:
 1.2702
[torch.FloatTensor of size 1]

Variable containing:
 1.0514
[torch.FloatTensor of size 1]

Variable containing:
 1.2702
[torch.FloatTensor of size 1]

Variable containing:
 1.2389
[torch.FloatTensor of size 1]

Variable containing:
 1.1452
[torch.FloatTensor of size 1]

Variable containing:
 1.2077
[torch.FloatTensor of size 1]

Variable containing:
 1.1452
[torch.FloatTensor of size 1]

Variable containing:
 1.2077
[torch.FloatTensor of size 1]

Variable containing:
 1.2077
[torch.FloatTensor of size 1]

Variable containing:
 1.2702
[torch.FloatTensor of size 1]

Variable containing:
 1.2389
[torch.FloatTensor of size 1]

Variable containing:
 1.2389
[torch.FloatTensor of size 1]

Variable containing:
 1.3639
[torch.FloatTensor of size 1]



KeyboardInterrupt: 