In [84]:
import numpy as np
import pandas as pd
import re

In [85]:
# Keep only lowercase ASCII letters, spaces and '+'; every other character
# (punctuation, digits, the trailing newline, ...) is replaced by one space,
# so a removed character between two spaces leaves a run of spaces behind.
pattern = re.compile(r'[^a-z +]')
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default encoding.
with open('data/multi30k/train.en', 'r', encoding='utf-8') as f:
    # Iterating the file handle yields the same lines as readlines().
    sentences = pd.Series([pattern.sub(' ', line.lower()) for line in f])
sentences.head()

0    two young  white males are outside near many b...
1    several men in hard hats are operating a giant...
2     a little girl climbing into a wooden playhouse  
3    a man in a blue shirt is standing on a ladder ...
4            two men are at the stove preparing food  
dtype: object

In [86]:
# Gather every distinct token obtained by splitting the sentences on single
# spaces. Consecutive spaces yield '' as a token, which is kept as a word.
word_set = set()
for sent in sentences:
    word_set.update(token.strip() for token in sent.split(' '))
all_words = sorted(word_set)

# Forward table: position in the sorted vocabulary -> word, with the extra
# symbol 'Ø' appended under the next free ID.
id2word_dict = {idx: word for idx, word in enumerate(all_words)}
id2word_dict[len(all_words)] = 'Ø'

# Reverse table: word -> ID.
word2id_dict = {word: idx for idx, word in id2word_dict.items()}

In [97]:
def get_word_IDs(sentence):
    """Convert a sentence into an array of vocabulary IDs.

    The sentence is split on single spaces (so consecutive spaces produce
    empty-string tokens) and each token is looked up in the module-level
    ``word2id_dict``. Tokens missing from the vocabulary are mapped to the
    ID of the 'Ø' symbol.

    Parameters
    ----------
    sentence : str
        Space-delimited text to encode.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``int32`` array with one ID per token.

    Raises
    ------
    KeyError
        If 'Ø' itself is not present in ``word2id_dict``.
    """
    # Resolve the fallback ID once instead of inside the loop.
    fallback_id = word2id_dict['Ø']
    # dict.get replaces the former bare ``except:``, which also swallowed
    # unrelated errors such as KeyboardInterrupt.
    IDs = [word2id_dict.get(word, fallback_id) for word in sentence.split(' ')]
    return np.asarray(IDs, dtype=np.int32)

In [98]:
# Encode every cleaned sentence as its array of word IDs; the arrays have
# different lengths, so the resulting Series holds them as objects.
Sentences_as_ints = pd.Series(list(map(get_word_IDs, sentences)))
Sentences_as_ints.head()

0    [9027, 9666, 0, 9481, 5028, 319, 5775, 5503, 5...
1    [7384, 5190, 4245, 3896, 3916, 319, 5701, 1, 3...
2     [1, 4870, 3585, 1657, 4360, 1, 9570, 6253, 0, 0]
3    [1, 5032, 4245, 1, 878, 7473, 4387, 8090, 5677...
4    [9027, 5190, 319, 407, 8640, 8202, 6431, 3341,...
dtype: object

In [99]:
from rush import vecs

In [100]:
# Hand the per-sentence ID arrays to the project's IntVecs container as a
# plain Python list. NOTE(review): IntVecs comes from `rush.vecs`, which is
# not visible here; the count shown below the cell is presumably printed by it.
variable_length_int_vecs = vecs.IntVecs(list(Sentences_as_ints))

29000

In [109]:
minibatchsize = 128
# Draw five minibatches and print each one.
for i in range(5):
    # Row indices are sampled uniformly with replacement from [0, num_vecs).
    sample_idx = np.random.randint(
        0, variable_length_int_vecs.num_vecs, minibatchsize
    )
    # The 'Ø' ID is passed as the fill value for the padded batch.
    minibatch = variable_length_int_vecs.make_padded_minibatch(
        sample_idx, fill_value=word2id_dict['Ø']
    )
    print(minibatch)


 9027  4870  4561  ...   9688  9688  9688
 9027  3588  4077  ...   9688  9688  9688
    1  4870   373  ...   9688  9688  9688
       ...          ⋱          ...       
    1  5032  9547  ...   9688  9688  9688
    1  4026  4461  ...   9688  9688  9688
 9027  9561    68  ...   9688  9688  9688
[torch.IntTensor of size 128x31]


 8650  4387     1  ...   9688  9688  9688
    1  9560  6938  ...   9688  9688  9688
 9027  4267   319  ...   9688  9688  9688
       ...          ⋱          ...       
    1  4870   984  ...   9688  9688  9688
    1  3527  5651  ...   9688  9688  9688
    1  9560  4245  ...   9688  9688  9688
[torch.IntTensor of size 128x25]


 8814  8502     1  ...   9688  9688  9688
  218  5669  2017  ...   9688  9688  9688
    1  4870   984  ...   9688  9688  9688
       ...          ⋱          ...       
    1  5032  4387  ...   9688  9688  9688
 8650  4387     1  ...   9688  9688  9688
 9327   355   222  ...   9688  9688  9688
[torch.IntTensor of size 128x27]


 9445  3585 

In [108]:
# Time one full minibatch draw: sampling `minibatchsize` random indices plus
# the make_padded_minibatch call with the 'Ø' ID as fill value.
%timeit variable_length_int_vecs.make_padded_minibatch( np.random.randint(0,variable_length_int_vecs.num_vecs, minibatchsize), fill_value = word2id_dict['Ø'])

17.5 µs ± 119 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
