In [1]:
from load_utils import prepare_data
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from tqdm.notebook import tnrange, tqdm_notebook

In [2]:
# Load the 'test' dataset together with the GloVe vectors file via the project
# helper `prepare_data`. The four results are unpacked as q, a, pairs, vector.
# NOTE(review): judging by the recorded log ("In questions" / "In answers") and
# the later use of `q.word2index`, q and a are presumably the question and
# answer vocabularies and `pairs` the (question, answer) sentence pairs —
# confirm against load_utils.
q, a, pairs, vector = prepare_data('test', 'glove.42B.300d/glove.42B.300d.txt', small=True)

Reading test -------
Read 4041 sentence pairs
Counting words
Counted words:
In questions: 5087 words
In answers: 394 words


In [3]:
def epoch_train(model, optimizer, batch_size, pairs, device=None):
    """Run one training epoch over `pairs` in shuffled mini-batches.

    Args:
        model: Callable invoked as ``model(encoder_in, decoder_in)`` that
            returns a ``(outputs, loss)`` tuple. It must also provide
            ``train()``; when `device` is omitted it must provide
            ``parameters()`` as well (presumably a ``torch.nn.Module`` —
            confirm).
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        batch_size: Number of pairs per mini-batch; the last batch may be
            smaller. Must be a positive integer.
        pairs: Indexable collection of (encoder sentence, decoder sentence)
            pairs, forwarded to ``to_batch_sequence``.
        device: Device on which the batch tensors are created. When omitted,
            the device of the model's first parameter is used, falling back
            to CPU for a model without parameters.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        # A non-positive batch size would never advance through the data.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if device is None:
        # The function previously relied on an undefined global `device`;
        # take it from the model so inputs land next to the weights.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    # Set the model in train mode
    model.train()

    # Total number of rows available for training
    n_records = len(pairs)

    # Shuffle the row indexes
    perm = np.random.permutation(n_records)

    for st in range(0, n_records, batch_size):
        # Clamp the final batch to the end of the data.
        ed = min(st + batch_size, n_records)

        encoder_in, decoder_in = to_batch_sequence(pairs, st, ed, perm, device)

        # Forward pass: only the loss is needed for the update.
        _, loss = model(encoder_in, decoder_in)

        # Clear gradients (pytorch accumulates gradients by default)
        optimizer.zero_grad()

        # Backpropagation & weight adjustment
        loss.backward()
        optimizer.step()

# Runs whenever this cell is executed, i.e. right after the function
# definition above is (re)defined — not after a training epoch.
print("Optimization ended successfully")

Optimization ended successfully


In [5]:
# Random ordering of the row indexes of `pairs`. No seed is set in the visible
# cells, so the ordering differs on every run.
perm = np.random.permutation(len(pairs))
perm 

array([2523, 2369, 1514, ..., 3609,  571, 2569])

In [88]:
def _lookup_indices(sentence, vocab):
    """Map each whitespace-separated token of `sentence` through `vocab.word2index`."""
    indices = []
    for token in sentence.split():
        index = vocab.word2index.get(token)
        # Tokens missing from the vocabulary become 0, the same value used for padding.
        indices.append(0 if index is None else index)
    return indices


def _pad_to_tensor(rows, device):
    """Right-pad the index lists in `rows` with zeros and return them as one float tensor."""
    width = max((len(row) for row in rows), default=0)
    batch = torch.zeros(len(rows), width, device=device, dtype=torch.float)
    for i, row in enumerate(rows):
        if row:
            # One slice assignment per row instead of one write per element.
            batch[i, :len(row)] = torch.tensor(row, device=device, dtype=torch.float)
    return batch


def to_batch_sequence(pairs, st, ed, perm, device, encoder_vocab=None, decoder_vocab=None):
    """Build zero-padded encoder/decoder index tensors for one mini-batch.

    The batch consists of ``pairs[perm[i]]`` for ``i`` in ``range(st, ed)``.
    Element 0 of each pair is the encoder sentence and element 1 the decoder
    sentence. Sentences are split on whitespace and every token is looked up
    in the vocabulary's ``word2index`` mapping; unknown tokens and padding
    positions are both 0.

    Args:
        pairs: Indexable collection of (encoder sentence, decoder sentence)
            string pairs.
        st: First position (inclusive) within `perm`.
        ed: Last position (exclusive) within `perm`.
        perm: Sequence of indexes into `pairs`.
        device: Device on which the tensors are created.
        encoder_vocab: Object with a ``word2index`` mapping for encoder
            sentences. Defaults to the notebook-level ``q``.
        decoder_vocab: Object with a ``word2index`` mapping for decoder
            sentences. Defaults to `encoder_vocab`, which keeps the original
            behaviour of indexing both sides with ``q``.
            NOTE(review): answers presumably should be indexed with the
            answer vocabulary (``a``) — confirm and pass it explicitly.

    Returns:
        Tuple ``(encoder_in, decoder_in)`` of float tensors with
        ``ed - st`` rows each, as wide as the longest sentence on that side.
        NOTE(review): indexes are stored as float, as before; a model that
        feeds them to an embedding layer needs integer indexes — confirm.
    """
    if encoder_vocab is None:
        encoder_vocab = q
    if decoder_vocab is None:
        decoder_vocab = encoder_vocab

    batch_pairs = [pairs[perm[i]] for i in range(st, ed)]

    encoder_rows = [_lookup_indices(pair[0], encoder_vocab) for pair in batch_pairs]
    decoder_rows = [_lookup_indices(pair[1], decoder_vocab) for pair in batch_pairs]

    # The row count is the batch size (ed - st). Allocating `ed` rows, as the
    # previous version did, appended all-zero rows to every batch after the first.
    return _pad_to_tensor(encoder_rows, device), _pad_to_tensor(decoder_rows, device)

In [9]:
# Pick the pair that the permutation places first.
pair_batch = pairs[perm[0]]

In [10]:
# Element 0 of the pair: the sentence to_batch_sequence uses as encoder input.
pair_batch[0]

''

In [11]:
# Element 1 of the pair: the sentence to_batch_sequence uses as decoder input.
pair_batch[1]

'i don t know .'

In [89]:
# Build one batch from permutation positions 0..31, with tensors on the CPU.
encoder_in, decoder_in = to_batch_sequence(pairs, 0, 32, perm, device='cpu')

In [90]:
# Display the encoder batch; zeros are padding or words missing from the vocabulary.
encoder_in

tensor([[   0.,    0.,    0.,  ...,    0.,    0.,    0.],
        [  12.,  138.,   14.,  ...,    0.,    0.,    0.],
        [  56.,   49.,   35.,  ...,    0.,    0.,    0.],
        ...,
        [ 101., 1624., 3261.,  ...,    0.,    0.,    0.],
        [ 124.,   16.,   46.,  ...,    0.,    0.,    0.],
        [   0.,    0.,    0.,  ...,    0.,    0.,    0.]])

In [63]:
# Length of every row in the encoder batch.
# NOTE(review): the recorded output has differing lengths, so `encoder_in`
# was presumably still a list of index lists (not the padded tensor) when
# this cell last ran — the execution counts are out of order.
list(map(len, encoder_in))

[0,
 12,
 28,
 10,
 26,
 12,
 4,
 12,
 2,
 8,
 12,
 5,
 20,
 33,
 9,
 12,
 22,
 9,
 6,
 5,
 9,
 17,
 12,
 0,
 5,
 6,
 5,
 0,
 14,
 19,
 21,
 0]

In [68]:
# Scratch 32 x 33 float matrix of zeros on the CPU, used to try out the padding logic.
zero = torch.zeros((32, 33), dtype=torch.float, device='cpu')
zero

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [87]:
# Copy every plain-int entry of `encoder_in` into the matching slot of the
# pre-allocated `zero` matrix; any other entry (e.g. None) is skipped, so that
# position keeps its current value.
# NOTE(review): this only writes anything while `encoder_in` holds Python ints
# (a list of index lists), not once it has been replaced by a tensor — confirm
# the intended cell order.
for i, seq in enumerate(encoder_in):
    for t, word in enumerate(seq):
        if type(word) is not int:
            continue
        zero[i, t] = word

zero

tensor([[   0.,    0.,    0.,  ...,    0.,    0.,    0.],
        [  12.,  138.,   14.,  ...,    0.,    0.,    0.],
        [  56.,   49.,   35.,  ...,    0.,    0.,    0.],
        ...,
        [ 101., 1624., 3261.,  ...,    0.,    0.,    0.],
        [ 124.,   16.,   46.,  ...,    0.,    0.,    0.],
        [   0.,    0.,    0.,  ...,    0.,    0.,    0.]])

In [82]:
# Debugging aid: print the Python type of every entry in `encoder_in`.
# NOTE(review): the recorded output (int / NoneType) implies `encoder_in` was a
# list of `word2index.get(...)` results when this ran; NoneType entries would
# then be words missing from the vocabulary — confirm, since the execution
# counts are out of order.
for i, seq in enumerate(encoder_in):
    for t, word in enumerate(seq):
        print(type(word))

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'NoneType'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'NoneType'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'NoneType'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'NoneType'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
