In [None]:
import torch
from torchtext.legacy import data, datasets
import random

In [None]:
# One seed shared by every RNG this notebook relies on, so that
# "Restart Kernel -> Run All" reproduces the same results.
seed = 966
random.seed(seed)        # Python's RNG (used below for the train/validation split)
torch.manual_seed(seed)  # PyTorch's RNG

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


**Fields**

[See the torchtext `Field` source and docstrings](https://pytorch.org/text/_modules/torchtext/data/field.html)

In [None]:
# Field describing the question text: tokenised with spaCy and lower-cased.
TEXT = data.Field(
    tokenize='spacy',
    lower=True,
)

# Field describing the question category (the classification target).
LABEL = data.LabelField()

**Text REtrieval Conference (TREC) Question Classification Dataset**

*Data Examples and Six Categories:*

| Text | Label | Category |
| --- | --- | --- |
|CNN is the abbreviation for what ?|ABBR| ABBREVIATION |
| What is the date of Boxing Day ? | NUM |NUMERIC|
|Who discovered electricity ?| HUM |HUMAN|
|What 's the colored part of the eye called ?|ENTY|ENTITY|
|Why do horseshoes bring luck ?|DESC|DESCRIPTION|
|What is California 's capital ?|LOC|LOCATION|

In [None]:
# Load the TREC train/test splits (files are downloaded on first use).
train, test = datasets.TREC.splits(TEXT, LABEL)

# Carve a validation set out of the training data.
# `random.seed()` returns None, so it must not be passed as `random_state`;
# seed the RNG first and hand over its state explicitly instead.
random.seed(seed)
train, val = train.split(random_state=random.getstate())

downloading train_5500.label


100%|██████████| 336k/336k [00:00<00:00, 2.94MB/s]


downloading TREC_10.label


100%|██████████| 23.4k/23.4k [00:00<00:00, 1.07MB/s]


In [None]:
# Peek at the final training example: its attributes as a plain dict.
last_example = train[-1]
vars(last_example)

{'label': 'NUM',
 'text': ['how', 'fast', 'does', 'the', 'fastest', 'car', 'go', '?']}

In [None]:
# Tokens seen fewer than this many times in the training split are left out
# of the text vocabulary.
MIN_TOKEN_FREQ = 2

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train, min_freq=MIN_TOKEN_FREQ)
LABEL.build_vocab(train)

In [None]:
# Show the label-string -> integer-index mapping built from the training split.
label_to_index = LABEL.vocab.stoi
print(label_to_index)

defaultdict(None, {'ENTY': 0, 'HUM': 1, 'DESC': 2, 'NUM': 3, 'LOC': 4, 'ABBR': 5})


In [None]:
# Report vocabulary sizes with len(vocab) rather than len(vocab.stoi).
# `stoi` is a defaultdict, so looking up unseen tokens may add keys to it and
# inflate its length when this cell is re-run later; len(vocab) is not
# affected by such lookups.
print("Vocabulary size of TEXT:", len(TEXT.vocab))
print("Vocabulary size of LABEL:", len(LABEL.vocab))

Vocabulary size of TEXT: 2679
Vocabulary size of LABEL: 6


In [None]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64


def _text_length(example):
    """Return the number of tokens in an example's text (used as the sort key)."""
    return len(example.text)


# One iterator per split; batches are created directly on `device`.
splits = (train, val, test)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    splits,
    batch_size=BATCH_SIZE,
    sort_key=_text_length,
    device=device,
)