In [330]:
# Required imports
import torch
import numpy as np
import pandas as pd
import pickle
from torch.nn import Linear, Embedding, RNN, GRU, LSTM
from torch.nn import Sigmoid, LogSoftmax
from torch.optim import SGD, Adam
from torch.nn import BCELoss, NLLLoss, CrossEntropyLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [3]:
data = pd.read_pickle('../data/4_europarl_en_sp.pkl')

print(data.shape[0])
data.head()

11618


Unnamed: 0,english,spanish,text,label
600702,Allow us to state that the opening of the inqu...,Permita que le digamos que las investigaciones...,"[allow, us, to, state, that, the, opening, of,...","[permita, que, le, digamos, que, las, investig..."
3733,"Since June of last year, the OLAF regulation h...",Desde junio del año pasado el Reglamento de la...,"[since, june, of, last, year,, the, olaf, regu...","[desde, junio, del, año, pasado, el, reglament..."
1180999,I am fully aware of the views expressed here o...,Soy plenamente consciente de las opiniones exp...,"[i, am, fully, aware, of, the, views, expresse...","[soy, plenamente, consciente, de, las, opinion..."
109251,I will have clarification of that situation la...,Hoy mismo recibiré información sobre dicha sit...,"[i, will, have, clarification, of, that, situa...","[hoy, mismo, recibiré, información, sobre, dic..."
1401751,"Let us not forget, the Roma were the first to ...",No olvidemos que la población gitana fue la pr...,"[let, us, not, forget,, the, roma, were, the, ...","[no, olvidemos, que, la, población, gitana, fu..."


In [4]:
data['text'] = data['text'].map(lambda x: ['<SOS>'] + x + ['<EOS>'])
data['label'] = data['label'].map(lambda x: ['<SOS>'] + x + ['<EOS>'])
data.head()

Unnamed: 0,english,spanish,text,label
600702,Allow us to state that the opening of the inqu...,Permita que le digamos que las investigaciones...,"[<SOS>, allow, us, to, state, that, the, openi...","[<SOS>, permita, que, le, digamos, que, las, i..."
3733,"Since June of last year, the OLAF regulation h...",Desde junio del año pasado el Reglamento de la...,"[<SOS>, since, june, of, last, year,, the, ola...","[<SOS>, desde, junio, del, año, pasado, el, re..."
1180999,I am fully aware of the views expressed here o...,Soy plenamente consciente de las opiniones exp...,"[<SOS>, i, am, fully, aware, of, the, views, e...","[<SOS>, soy, plenamente, consciente, de, las, ..."
109251,I will have clarification of that situation la...,Hoy mismo recibiré información sobre dicha sit...,"[<SOS>, i, will, have, clarification, of, that...","[<SOS>, hoy, mismo, recibiré, información, sob..."
1401751,"Let us not forget, the Roma were the first to ...",No olvidemos que la población gitana fue la pr...,"[<SOS>, let, us, not, forget,, the, roma, were...","[<SOS>, no, olvidemos, que, la, población, git..."


In [5]:
input_words = set(itertools.chain.from_iterable(data['text']))
output_words = set(itertools.chain.from_iterable(data['label']))

input2idx = {word: idx for idx, word in enumerate(input_words)}
idx2input = {idx: word for word, idx in input2idx.items()}

output2idx = {word: idx for idx, word in enumerate(output_words)}
idx2putput = {idx: word for word, idx in output2idx.items()}

input_size = len(input_words)
output_size = len(output_words)

In [6]:
output_size

21842

In [7]:
translation_indices = {
                        'input2idx': input2idx, 
                        'idx2input': idx2input, 
                        'output2idx': output2idx, 
                        'idx2putput': idx2putput
                      }

with open('../data/translation_indices.pkl', 'wb') as f:
    pickle.dump(translation_indices, f)

In [8]:
input_seqs = data['text'].map(lambda x: [input2idx[i] for i in x]).tolist()
output_seqs = data['label'].map(lambda x: [output2idx[i] for i in x]).tolist()

data = list(zip(input_seqs, output_seqs))

train_data, test_data = train_test_split(data)

In [407]:
class encoder(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size):
        super(encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = Embedding(num_embeddings=vocab_size, 
                                   embedding_dim=embedding_dim)
        self.rnn = LSTM(input_size=embedding_dim, 
                       hidden_size=hidden_dim)
        self.batch_size = batch_size
        self.softmax = LogSoftmax()
        self.hidden = self.init_hidden()
                
    def forward(self, x):
        e = self.embedding(x)
        e = e.view(len(x), self.batch_size, -1)
        out, self.hidden = self.rnn(e, self.hidden)
        return out, self.hidden
                  
    def init_hidden(self):
        h0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        c0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)
    
class decoder(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, batch_size):
        super(decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = Embedding(num_embeddings=vocab_size, 
                                   embedding_dim=embedding_dim)
        self.rnn = LSTM(input_size=embedding_dim, 
                       hidden_size=hidden_dim)
        self.linear = Linear(hidden_dim, output_dim)
        self.batch_size = batch_size
        self.softmax = LogSoftmax(dim=1)
        self.hidden = self.init_hidden()
                
    def forward(self, input, hidden):
        self.hidden = hidden
        e = self.embedding(input)
        e = e.view(len(input), self.batch_size, -1)
        out, self.hidden = self.rnn(e, self.hidden)
        output = self.linear(out[0])
        so = self.softmax(output)
        return so, self.hidden
                  
    def init_hidden(self):
        h0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        c0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)
    
class seq2seq(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        self.enc = encoder
        self.dec = decoder
                
    def forward(self, input_seq, output_seq, p_tf=0):
        outputs = []
        
        self.enc.hidden = self.enc.init_hidden()
        self.dec.hidden = self.dec.init_hidden()        
        
        enc_output, enc_hidden = enc.forward(torch.LongTensor(input_seq))
        dec_hidden = enc_hidden
        
        for i in range(output_seq.shape[0]):
            dec_input = torch.LongTensor([output_seq[i]])
            dec_output, dec_hidden = self.dec.forward(dec_input, dec_hidden) 
            outputs.append(dec_output)
        #return torch.stack(outputs).squeeze(1)
        return outputs


In [408]:
enc_vocab_size = input_size
enc_embedding_dim = 100
enc_hidden_dim = 50

dec_vocab_size = output_size
dec_embedding_dim = 50
dec_hidden_dim = 50
dec_output_dim = output_size

enc = encoder(enc_vocab_size, enc_embedding_dim, enc_hidden_dim, batch_size=1)
dec = decoder(dec_vocab_size, dec_embedding_dim, dec_hidden_dim, dec_output_dim, batch_size=1)
s2s = seq2seq(enc, dec)


optim = SGD(params=s2s.parameters(), lr=0.01)
criterion = NLLLoss()


In [477]:
epochs = 1
for epoch in range(epochs):
    s2s.train()
    total_loss = 0
    s2s.train()
    y_test_pred = []
    y_test_true = []
    y_train_pred = []
    y_train_true = []
    for it, example in enumerate(train_data):
        if (it % 100 == 0) and (it != 0):
            print("Epoch|it: {}|{}, Total Loss: {:.2f}".format(epoch, it, total_loss / it))
            print(loss.data.numpy())
            print(preds)
            print(torch.exp(torch.stack(res_raw).squeeze(1)[:,9101]))
        input_seq, output_seq = example
        optim.zero_grad()

        input_seq = torch.LongTensor(input_seq)
        
        
        output_seq = torch.LongTensor(output_seq)    
        res_raw = s2s.forward(input_seq, output_seq[:-1])
        res = torch.stack(res_raw).squeeze(1)
        loss = criterion(res, output_seq[1:])
        loss.backward()
        total_loss += loss.data.numpy()

        optim.step()

        
        preds = list(torch.argmax(res, dim=1).data.numpy())
        trues = list(output_seq.data.numpy())
        y_train_true.extend(trues)
        y_train_pred.extend(preds)
        
    a_train = accuracy_score(y_train_true, y_train_pred)
    
    y_test_pred = []
    y_test_true = []

Epoch|it: 0|100, Total Loss: 9.82
9.757656
[9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101]
tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001],
       grad_fn=<ExpBackward>)
Epoch|it: 0|200, Total Loss: 9.79
8.636137
[9101]
tensor([0.0002], grad_fn=<ExpBackward>)
Epoch|it: 0|300, Total Loss: 9.78
9.729606
[9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101]
tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001,
        0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
        0.0002, 0.0002, 0.0002, 0.0002], grad_fn=<ExpBackward>)
Epoch|it: 0|400, Total Loss: 9.77
9.747605
[9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101]
tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
        0.0002, 0.0002, 0.0002, 0.0003, 0.000

KeyboardInterrupt: 

In [481]:
criterion(res, output_seq[1:])

tensor(9.7085, grad_fn=<NllLossBackward>)

In [482]:
output_seq[1:]

tensor([ 6969,   914,  9921,  7474,  8462,  5761,  6727, 17023,  9921, 13449,
        10371,  6081, 11620,  6314,  3911, 15170,  7101, 21341,  9101])

In [484]:
criterion?

In [459]:
[4.3980e-05, 4.3541e-05, 4.5462e-05, 5.0658e-05, 4.7762e-05, 5.2381e-05,
        5.5144e-05, 5.4351e-05, 4.9266e-05, 5.4104e-05, 5.1706e-05]

[4.398e-05,
 4.3541e-05,
 4.5462e-05,
 5.0658e-05,
 4.7762e-05,
 5.2381e-05,
 5.5144e-05,
 5.4351e-05,
 4.9266e-05,
 5.4104e-05,
 5.1706e-05]

In [322]:
input_seq = torch.LongTensor(input_seqs[3])
c, h = s2s.enc.forward(input_seq)

output, hidden = s2s.dec.forward(torch.LongTensor([15310]), h)

pred = torch.argmax(output)
for i in range(30):
    pred = torch.argmax(output)
    output, hidden = s2s.dec.forward(torch.LongTensor([pred]), hidden)
    print(idx2putput[int(pred.numpy())])

<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>
<EOS>




In [175]:
epochs = 1
for epoch in range(epochs):
    s2s.train()
    total_loss = 0
    s2s.train()
    y_test_pred = []
    y_test_true = []
    y_train_pred = []
    y_train_true = []
    for it, example in enumerate(train_data):
        if (it % 500 == 0) and (it != 0):
            print("Epoch|it: {}|{}, Total Loss: {:.2f}".format(epoch, it, total_loss / it))
        input_seq, output_seq = example
        enc_optim.zero_grad()
        dec_optim.zero_grad()

        input_seq = torch.LongTensor(input_seq)
        output_seq = torch.LongTensor(output_seq)    
        res = s2s.forward(input_seq, output_seq)
        loss = criterion(res, torch.LongTensor(output_seq))
        loss.backward()
        total_loss += loss.data.numpy()

        enc_optim.step()
        dec_optim.step()
        
        preds = list(torch.argmax(res, dim=1).data.numpy())
        trues = list(output_seq.data.numpy())
        y_train_true.extend(trues)
        y_train_pred.extend(preds)
        
    a_train = accuracy_score(y_train_true, y_train_pred)
    
    s2s.eval()
    y_test_pred = []
    y_test_true = []
    """
    for example in test_data:

        input_seq, output_seq = example

        input_seq = torch.LongTensor(input_seq)
        output_seq = torch.LongTensor(output_seq)    
        res = s2s.forward(input_seq, output_seq)
        preds = list(torch.argmax(res, dim=1).data.numpy())
        trues = list(output_seq.data.numpy())

        y_test_true.extend(trues)
        y_test_pred.extend(preds)

    a_test = accuracy_score(y_test_true, y_test_pred)
    
    print("Epoch {} Loss: {:.2f}, Train/Test Accuracy: {:.3}/{:.3f}".format(epoch, total_loss / it, a_train, a_test))
    """



Epoch|it: 0|500, Total Loss: 9.87
Epoch|it: 0|1000, Total Loss: 9.72
Epoch|it: 0|1500, Total Loss: 9.49
Epoch|it: 0|2000, Total Loss: 9.16


KeyboardInterrupt: 

In [182]:
for i in list(input_seq.numpy()):
    print(idx2input[i])

for i in list(output_seq.numpy()):
    print(idx2putput[i])    
    
break_level = 10

model = s2s

preds = []

model.enc.hidden = model.enc.init_hidden()
model.dec.hidden = model.dec.init_hidden()        

enc_output, enc_hidden = model.enc(input_seq)
context = (enc_output[-1].unsqueeze(1), enc_output[-1].unsqueeze(1))

dec_sos = torch.LongTensor([sos_int])

dec_output, hidden = model.dec.forward(dec_sos, context)
pred = torch.argmax(dec_output[-1])
preds.append(int(pred.data.numpy()))

it = 0

while pred != eos_int:
    it += 1
    if it > break_level:
        break
    dec_output, hidden = model.dec.forward(torch.LongTensor([pred]), hidden)
    pred = torch.argmax(dec_output[-1])
    preds.append(int(pred.data.numpy()))

print(preds)

<SOS>
the
eu's
hypocrisy
is
abominable:
it
is
criminalising
undocumented
immigrants.
<EOS>
<SOS>
la
hipocresía
de
la
ue
es
abominable:
criminaliza
al
inmigrante
sin
papeles.
<EOS>
[15310, 15310, 15310, 15310, 15310, 15310, 15310, 15310, 15310, 15310, 15310]




In [145]:
output, hidden = model.dec.forward(torch.LongTensor([15310]), hidden)
torch.argmax(output)



tensor(15310)

In [113]:
sample = torch.LongTensor(test_data[0][0])
sos_int = output2idx['<SOS>']
eos_int = output2idx['<EOS>']
preds = predict(s2s, sample, sos_int, eos_int)
for p in preds:
    print(p)

15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310
15310




In [114]:
eos_int

9101

In [89]:
epochs = 10
s2s.train()
for epoch in range(epochs):
    total_loss = 0
    s2s.train()
    y_test_pred = []
    y_test_true = []
    y_train_pred = []
    y_train_true = []
    for it, example in enumerate(train_data):
        if (it % 500 == 0) and (it != 0):
            print("Epoch|it: {}|{}, Total Loss: {:.2f}".format(epoch, it, total_loss / it))
        input_seq, output_seq = example
        enc_optim.zero_grad()
        dec_optim.zero_grad()

        input_seq = torch.LongTensor(input_seq)
        output_seq = torch.LongTensor(output_seq)    
        res = s2s.forward(input_seq, output_seq)
        loss = criterion(res, torch.LongTensor(output_seq))
        loss.backward()
        total_loss += loss.data.numpy()

        enc_optim.step()
        dec_optim.step()
        
        preds = list(torch.argmax(res, dim=1).data.numpy())
        trues = list(output_seq.data.numpy())
        y_train_true.extend(trues)
        y_train_pred.extend(preds)
        
    a_train = accuracy_score(y_train_true, y_train_pred)
    
    s2s.eval()
    y_test_pred = []
    y_test_true = []

    for example in test_data:

        input_seq, output_seq = example

        input_seq = torch.LongTensor(input_seq)
        output_seq = torch.LongTensor(output_seq)    
        res = s2s.forward(input_seq, output_seq)
        preds = list(torch.argmax(res, dim=1).data.numpy())
        trues = list(output_seq.data.numpy())

        y_test_true.extend(trues)
        y_test_pred.extend(preds)

    a_test = accuracy_score(y_test_true, y_test_pred)
    
    print("Epoch {} Loss: {:.2f}, Train/Test Accuracy: {:.3}/{:.3f}".format(epoch, total_loss / it, a_train, a_test))



KeyboardInterrupt: 

In [333]:
from modules.seq2seq import encoder, decoder, seq2seq
torch.save(s2s, '../data/seq2seq.pt')

  "type " + obj.__name__ + ". It won't be checked "


PicklingError: Can't pickle <class '__main__.seq2seq'>: it's not the same object as __main__.seq2seq