In [37]:
from torchtext import data

In [54]:
# Text column: tokenize with spaCy instead of the default split-on-spaces behaviour.
# spaCy must be installed separately, e.g. via `conda install spacy`.
TEXT = data.Field(tokenize='spacy')

# Label column: drop the <pad> and <unk> special tokens so they do not appear in the label vocab.
LABELS = data.Field(unk_token=None, pad_token=None)



    Only loading the 'en' tokenizer.



In [55]:
# Load the train/validation/test splits (CSV-formatted .txt files) from one directory.
# `fields` names each column and assigns it the Field that processes it.
train, val, test = data.TabularDataset.splits(
    path='datasets/sample_text/',
    format='csv',
    train='train.txt',
    validation='val.txt',
    test='test.txt',
    fields=[
        ('text', TEXT),
        ('labels', LABELS),
    ],
)

In [56]:
# Show the column-name -> Field mapping attached to the training split.
print(train.fields)

{'text': <torchtext.data.field.Field object at 0x7fb4cc34f7b8>, 'labels': <torchtext.data.field.Field object at 0x7fb5143d5828>}


In [57]:
# Size of the training split.
print(len(train))

250


In [58]:
# Inspect the first training example as a dict of its attributes (text tokens and labels).
print(vars(train[0]))

{'text': ['eword1'], 'labels': ['engineering']}


In [59]:
# Without bucketing: plain Iterator alternative, kept commented out for reference.

#train_iter, val_iter, test_iter = data.Iterator.splits((train, val, test), #the dataset object to load data from
                                                            #batch_size = 3,
                                                            #device=-1) #which GPU to use, -1 for CPU, None for default GPU

In [60]:
# With bucketing: sort_key gives BucketIterator the token count of each example's text.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test),  # the dataset objects to load data from; one iterator is returned per split
    batch_size=3,
    sort_key=lambda example: len(example.text),
    device=-1,  # which GPU to use: -1 for CPU, None for default GPU
)

In [61]:
# Build both vocabularies from the training split only.
# Given a Dataset, build_vocab reads just the column(s) bound to the calling Field,
# so TEXT counts the 'text' column and LABELS counts the 'labels' column.
TEXT.build_vocab(train)
LABELS.build_vocab(train)

In [62]:
# Raw token counts gathered while building the text vocabulary (a Counter).
TEXT.vocab.freqs

Counter({'bword1': 25,
         'bword2': 25,
         'bword3': 25,
         'bword4': 25,
         'bword5': 25,
         'eword1': 25,
         'eword2': 25,
         'eword3': 25,
         'eword4': 25,
         'eword5': 25})

In [63]:
# The ten most frequent tokens as (token, count) pairs.
TEXT.vocab.freqs.most_common(10)

[('eword1', 25),
 ('eword2', 25),
 ('eword3', 25),
 ('eword4', 25),
 ('eword5', 25),
 ('bword1', 25),
 ('bword2', 25),
 ('bword3', 25),
 ('bword4', 25),
 ('bword5', 25)]

In [64]:
# Index-to-token list for the text vocabulary; the special tokens '<unk>' and '<pad>' come first.
TEXT.vocab.itos

['<unk>',
 '<pad>',
 'bword1',
 'bword2',
 'bword3',
 'bword4',
 'bword5',
 'eword1',
 'eword2',
 'eword3',
 'eword4',
 'eword5']

In [49]:
# Token-to-index lookup for the text vocabulary.
# NOTE(review): the recorded output below is from an earlier execution (In [49]) and lists
# single characters, which does not match the word-level itos recorded for In [64]; re-run to refresh.
TEXT.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'1': 8,
             '2': 9,
             '3': 10,
             '4': 11,
             '5': 12,
             '<pad>': 1,
             '<unk>': 0,
             'b': 6,
             'd': 2,
             'e': 7,
             'o': 3,
             'r': 4,
             'w': 5})

In [50]:
# Show the label vocabulary in both directions: index -> label list, then label -> index mapping.
print(LABELS.vocab.itos, LABELS.vocab.stoi, sep='\n')

['biology', 'engineering']
defaultdict(<function _default_unk_index at 0x7fb4d3a68488>, {'biology': 0, 'engineering': 1})


In [65]:
# Start a fresh pass over the training iterator, take its first batch,
# and print the numericalized text followed by the label indices.
batch = next(iter(train_iter))
print(batch.text, batch.labels, sep='\n')

Variable containing:
 7  4  2
[torch.LongTensor of size 1x3]

Variable containing:
 1  0  0
[torch.LongTensor of size 1x3]



In [52]:
# Start a fresh pass over the training iterator, take its first batch,
# and print the numericalized text followed by the label indices.
# NOTE(review): the recorded output below is from an earlier execution (In [52]) and shows a
# 6x3 text tensor, unlike the 1x3 text batch recorded for In [65]; re-run to refresh.
batch = next(iter(train_iter))
print(batch.text, batch.labels, sep='\n')

Variable containing:
  7   6   7
  5   5   5
  3   3   3
  4   4   4
  2   2   2
  8   8  12
[torch.LongTensor of size 6x3]

Variable containing:
 1  0  1
[torch.LongTensor of size 1x3]



In [53]:
# Start a fresh pass over the training iterator, take its first batch,
# and print the numericalized text followed by the label indices.
# NOTE(review): the recorded output below is from an earlier execution (In [53]) and shows a
# 6x3 text tensor, unlike the 1x3 text batch recorded for In [65]; re-run to refresh.
batch = next(iter(train_iter))
print(batch.text, batch.labels, sep='\n')

Variable containing:
  6   7   6
  5   5   5
  3   3   3
  4   4   4
  2   2   2
  9  12   8
[torch.LongTensor of size 6x3]

Variable containing:
 0  1  0
[torch.LongTensor of size 1x3]

