## About
The torchtext package consists of data processing utilities and popular datasets for natural language.

Generic steps:
1. Specify how preprocessing should be done -> Fields
2. Use Dataset to load the data -> TabularDataset (JSON/CSV/TSV Files)
3. Construct an iterator to do batching & padding -> BucketIterator

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torchtext.data import Field, TabularDataset, BucketIterator

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
# Naive tokenizer
def tokenize(x):
    """Return the tokens of the string ``x``, split on runs of whitespace.

    Leading/trailing whitespace is ignored and an empty string yields an
    empty list. Punctuation stays attached to the neighbouring word.
    """
    return x.split()

In [6]:
# Better way to tokenize: use spaCy's English pipeline.
# The model loaded below must be installed first, e.g.:
#   python -m spacy download en_core_web_sm
spacy_en = spacy.load('en_core_web_sm')


def tokenize(text):
    """Return the list of token strings that the spaCy tokenizer produces for ``text``."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]


In [7]:
# Create Fields
# `quote`: sequential text, tokenized with `tokenize`, lower-cased and mapped
# through a vocabulary.
quote = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize,
    lower=True,
)
# `score`: a single non-sequential value that is used as-is (no vocabulary).
score = Field(
    sequential=False,
    use_vocab=False,
)

In [9]:
# Maps each key of the data file to (attribute name on the example, Field).
fields = {
    'quote': ('q', quote),
    'score': ('s', score),
}

In [13]:
# Load the train and test splits from the JSON files under "mydata",
# processing each record according to `fields`.
train_data, test_data = TabularDataset.splits(
    path="mydata",
    train="train.json",
    test="test.json",
    format="json",
    fields=fields,
)


In [14]:
# Show the attribute names and values stored on the first training example.
vars(train_data[0]).keys(), vars(train_data[0]).values()

(dict_keys(['q', 's']),
 dict_values([['you', 'must', 'own', 'everything', 'in', 'your', 'world', '.', 'there', 'is', 'no', 'one', 'else', 'to', 'blame', '.'], 1]))

In [21]:
# Build the vocabulary of the `quote` field from the training split only.
quote.build_vocab(
    train_data,
    max_size=10000,
    min_freq=1,
)

In [16]:
# Batch examples of similar length together so that little padding is needed.
# An explicit sort_key is passed because the datasets built above do not
# define one: without it the iterators have no length to bucket by, and the
# test iterator (which sorts its data) would end up comparing Example objects
# directly.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=2,
    sort_key=lambda example: len(example.q),  # length of the tokenized quote
    device=device,
)

In [17]:
from tqdm import tqdm  # NOTE: imported here but not used in this cell

# Peek at the first training batch only: print its `q` and `s` tensors, then stop.
for t_batch in train_iterator:
    print(t_batch.q, t_batch.s, sep="\n")
    break


tensor([[29],
        [31],
        [ 3],
        [11],
        [28],
        [22],
        [ 4],
        [27],
        [ 9]], device='cuda:0')
tensor([0], device='cuda:0')


In [18]:
# Display the `q` tensor of the batch left bound by the loop above.
t_batch.q


tensor([[29],
        [31],
        [ 3],
        [11],
        [28],
        [22],
        [ 4],
        [27],
        [ 9]], device='cuda:0')