In [1]:
import torchtext
torchtext.__version__

'0.9.0'

## Steps

1. Specify how preprocessing should be done -> `Fields`
2. Use Dataset to load the data -> `TabularDataset` (CSV/JSON)
3. Construct an iterator to do batching & padding -> BucketIterator

### Step 1

In [2]:
import pandas as pd
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
train

Unnamed: 0,name,quote,score
0,Jocko,You must own everything in your world. There i...,1
1,Bruce Lee,"Do not pray for an easy life, pray for the str...",1
2,Potato guy,"Stand tall, and rice like a potato!",0


In [4]:
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import spacy

In [6]:
spacy_en = spacy.load('en_core_web_sm')

In [7]:
# a better tokenizer
def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [8]:
# How the data should be processed
quote = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)

In [9]:
score = Field(sequential=False, use_vocab=False)

In [10]:
'''
which column should we use in the dataset
Later on, when we created the batchs, how we get the quote is batch.q, score batch.s
'''

fields = {'quote': ('q', quote), 'score': ('s', score)}

In [11]:
train_data, test_data = TabularDataset.splits(path='./data',
                                              train='train.csv',
                                              test='test.csv',
                                              format='csv',
                                              fields=fields)

In [12]:
train_data[0].__dict__.keys()

dict_keys(['q', 's'])

In [13]:
train_data[0].__dict__.values()

dict_values([['you', 'must', 'own', 'everything', 'in', 'your', 'world', '.', 'there', 'is', 'no', 'one', 'else', 'to', 'blame', '.'], '1'])

### Step 2

Build a vocab

In [19]:
quote.build_vocab(train_data,
                  max_size=1000,
                  min_freq=1,
                  vectors='glove.6B.50d') # 1 GB

.vector_cache/glove.6B.zip: 862MB [1:58:52, 121kB/s]                            
100%|███████████████████████████████▉| 399999/400000 [00:06<00:00, 61165.75it/s]


### Step 3

In [20]:
train_iterator, test_iterator = BucketIterator.splits((train_data, test_data), batch_size=2, device='cpu')

In [21]:
for batch in train_iterator: 
    print(batch.q)
    print(batch.s)

tensor([[29],
        [31],
        [ 3],
        [11],
        [28],
        [22],
        [ 4],
        [27],
        [ 9]])
tensor([0])
tensor([[35, 14],
        [23, 25],
        [26,  7],
        [18,  5],
        [19, 10],
        [36, 15],
        [34, 21],
        [ 2,  3],
        [33,  7],
        [20,  5],
        [24, 32],
        [ 6, 30],
        [16,  8],
        [ 8, 17],
        [12,  4],
        [ 2, 13],
        [ 1,  6],
        [ 1,  2]])
tensor([1, 1])
