In [1]:
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext
import torchtext.data as ttd

In [2]:
# Toy corpus: each (label, sentence) pair is one example.
samples = [
    (0, "I like eggs and ham."),
    (1, "Eggs I like!"),
    (1, "Ham and eggs or just ham?"),
]

# Column-oriented view of the same records, keyed by column name.
data = {
    "label": [label for label, _ in samples],
    "data": [sentence for _, sentence in samples],
}

In [3]:
# Build a DataFrame from the dict (one column per key) and preview the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,label,data
0,0,I like eggs and ham.
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [4]:
# Persist the frame as CSV; index=False leaves the row index out of the file,
# so the CSV contains only the "label" and "data" columns.
df.to_csv("thedata.csv", index=False)

In [5]:
# Text column: tokenized with spaCy, lowercased, and mapped to vocabulary indices.
TEXT = ttd.Field(
    sequential=True,
    use_vocab=True,
    lower=True,
    tokenize="spacy",
    batch_first=True,  # batches come out as (batch, sequence)
    pad_first=True,    # shorter sequences are padded on the left
)

# Label column: one already-numeric value per example, so no vocabulary is
# used for it; is_target=True marks it as the target field.
LABEL = ttd.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
)



In [6]:
# Load the CSV written earlier. `fields` pairs each CSV column, in file order,
# with the Field that processes it; the header row is skipped.
dataset = ttd.TabularDataset(
    path="thedata.csv",
    format="csv",
    skip_header=True,
    fields=[("label", LABEL), ("data", TEXT)],
)



In [7]:
# Grab the first parsed example from the dataset for inspection.
ex = dataset.examples[0]

In [8]:
# The example's `data` attribute (displayed below as a list of lowercased tokens).
ex.data

['i', 'like', 'eggs', 'and', 'ham', '.']

In [9]:
# The example's `label` attribute; the output below shows it is still a string here.
ex.label

'0'

In [10]:
# Check which class the dataset uses for individual examples.
type(ex)

torchtext.data.example.Example

In [11]:
# The split shuffles the examples at random. Seed Python's RNG and hand its
# state to split() so the same examples land in train/test on every
# "Restart & Run All"; the vocabulary and the batches printed further down
# both depend on this partition.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
train_dataset, test_dataset = dataset.split(
    split_ratio=0.66,
    random_state=random.getstate(),
)

In [12]:
# Build the vocabulary from the training split only.
TEXT.build_vocab(train_dataset)

In [13]:
# Keep a shorthand reference to the vocabulary built on TEXT.
vocab = TEXT.vocab

In [14]:
# Token -> integer index mapping ("string to index").
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x00000144FA8DC0D0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'eggs': 2,
             'i': 3,
             'like': 4,
             '!': 5,
             '.': 6,
             'and': 7,
             'ham': 8})

In [15]:
# Index -> token list ("index to string"); position i holds the token with index i.
vocab.itos

['<unk>', '<pad>', 'eggs', 'i', 'like', '!', '.', 'and', 'ham']

In [16]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [17]:
def _token_count(example):
    """Return the number of tokens in ``example.data``; used as the sort key."""
    return len(example.data)


# One iterator per split, each with batch size 2, producing tensors on `device`.
train_iter, test_iter = ttd.Iterator.splits(
    datasets=(train_dataset, test_dataset),
    batch_sizes=(2, 2),
    sort_key=_token_count,
    device=device,
)



In [18]:
# Peek at the first training batch only: unpack it into inputs and targets,
# print both with their shapes, then stop iterating.
for batch in train_iter:
    inputs, targets = batch
    print("inputs: ", inputs, "shape: ", inputs.shape)
    print("targets: ", targets, "shape: ", targets.shape)
    break

inputs:  tensor([[3, 4, 2, 7, 8, 6],
        [1, 1, 2, 3, 4, 5]]) shape:  torch.Size([2, 6])
targets:  tensor([0, 1]) shape:  torch.Size([2])




In [19]:
# Peek at the first test batch only: unpack it into inputs and targets,
# print both with their shapes, then stop iterating.
for batch in test_iter:
    inputs, targets = batch
    print("inputs: ", inputs, "shape: ", inputs.shape)
    print("targets: ", targets, "shape: ", targets.shape)
    break

inputs:  tensor([[8, 7, 2, 0, 0, 8, 0]]) shape:  torch.Size([1, 7])
targets:  tensor([1]) shape:  torch.Size([1])
