In [9]:
# STEPS:

# 1. Specify how preprocessing should be done -> Fields
# 2. Use Dataset to load the data -> TabularDataset (JSON/CSV/TSV Files)
# 3. Construct an iterator to do batching & padding -> BucketIterator

In [10]:
import spacy
import torch
from torchtext.data import Field, TabularDataset, BucketIterator

In [11]:
# Load the English spaCy pipeline once; its tokenizer is reused for every example.
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy releases —
# confirm the installed version before re-running on a fresh environment.
spacy_en = spacy.load(
    'en'
)

In [None]:
def tokenize(text):
    """Split ``text`` into a list of token strings with the spaCy tokenizer.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list containing the ``.text`` of every token that
        ``spacy_en.tokenizer`` yields for ``text``.
    """
    # Relies on the notebook-level ``spacy_en`` pipeline loaded in an earlier cell.
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [12]:
# Text column: tokenized with `tokenize`, lower-cased, and mapped through a vocabulary.
quote = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize,
    lower=True,
)

# Label column: a single value per example, used without a vocabulary
# (presumably already numeric in the data files — confirm).
score = Field(
    sequential=False,
    use_vocab=False,
)

In [13]:
# Maps each key in the data files to (attribute name on examples/batches, Field).
# The short names 'q' and 's' are what the batch loop later reads as batch.q / batch.s.
fields = {
    'quote': ('q', quote),
    'score': ('s', score),
}

In [14]:
# Load the train/test splits from JSON files under ./mydata, parsed according to `fields`.
train_data, test_data = TabularDataset.splits(
    # How to parse the files.
    format='json',
    fields=fields,
    # Where the files live.
    path='mydata',
    train='train.json',
    test='test.json',
    #validation='validation.json'  # if enabled, add a trailing comma and unpack one more dataset
)

# train_data, test_data = TabularDataset.splits(
#     path='mydata',
#     train='train.csv',
#     test='test.csv',
#     format='csv',
#     fields=fields
# )

# train_data, test_data = TabularDataset.splits(
#     path='mydata',
#     train='train.tsv',
#     test='test.tsv',
#     format='tsv',
#     fields=fields
# )

# print(train_data[0].__dict__.keys())
# print(train_data[0].__dict__.values())

In [15]:
# Build the vocabulary for the `quote` field from the training split only,
# and attach pretrained GloVe vectors to it.
quote.build_vocab(
    train_data,
    max_size=10000,
    min_freq=1,
    vectors='glove.6B.100d',  # 1 GB (presumably the size of the vectors download — confirm)
)

In [18]:
# Prefer the GPU when one is available; otherwise fall back to the CPU so this
# cell still runs on machines without CUDA instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Batch both splits, placing the tensors on the selected device.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=2,
    device=device,
)

# Inspect the batches: `q` holds the token indices of the quotes and `s` the scores.
# In the recorded output `q` is laid out with one column per example, and shorter
# quotes are filled with index 1 at the end (presumably the padding index — confirm).
for batch in train_iterator:
    print(batch.q)
    print(batch.s)

tensor([[27, 10],
        [29, 21],
        [ 7,  4],
        [26,  3],
        [18,  6],
        [ 2, 11],
        [25, 17],
        [ 1,  4],
        [ 1,  3],
        [ 1, 30],
        [ 1, 28],
        [ 1,  5],
        [ 1, 13],
        [ 1,  2],
        [ 1,  9],
        [ 1, 23]], device='cuda:0')
tensor([0, 1], device='cuda:0')
tensor([[33],
        [19],
        [24],
        [14],
        [15],
        [34],
        [32],
        [31],
        [16],
        [20],
        [22],
        [12],
        [ 5],
        [ 8]], device='cuda:0')
tensor([1], device='cuda:0')
