In [2]:
!pip install torchtext==0.6.0
!python -m spacy download en_core_web_sm

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/64.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->torchtext==0.6.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->torchtext==0.6.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->torchtext==0.6.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->t

In [46]:
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torchtext
import torchtext.data as ttd

In [47]:
# Tiny hand-written corpus: three sentences, each paired with a binary label.
data = dict(
    label=[0, 1, 1],
    data=[
        "I like eggs and ham.",
        "Eggs i like!",
        "Ham and eggs or just ham?",
    ],
)

In [48]:
# One column per dict key (`label`, `data`), one row per sentence.
df = pd.DataFrame.from_dict(data)

In [49]:
# Preview the first rows to confirm the frame has the `label` and `data` columns.
df.head()

Unnamed: 0,label,data
0,0,I like eggs and ham.
1,1,Eggs i like!
2,1,Ham and eggs or just ham?


In [50]:
# Write the frame to disk as CSV; index=False leaves the row index out, so the
# file contains only the `label` and `data` columns.
df.to_csv('thedata.csv', index=False)

In [51]:
# Shell out to print the top of the file and check what was actually written.
!head thedata.csv

label,data
0,I like eggs and ham.
1,Eggs i like!
1,Ham and eggs or just ham?


In [54]:
# Small English spaCy pipeline; this cell only uses its tokenizer.
spacy_en = spacy.load('en_core_web_sm')


def tokenize(text):
    """Return the spaCy tokens of ``text`` as a list of plain strings."""
    tokens = spacy_en.tokenizer(text)
    return [tok.text for tok in tokens]


# Field for the sentence column.
TEXT = ttd.Field(
    tokenize=tokenize,   # spaCy-backed tokenizer defined above
    lower=True,          # tokens are lowercased
    sequential=True,
    batch_first=True,    # batches are shaped (batch, sequence)
    pad_first=True,      # padding is placed in front of the tokens
)

# Field for the label column: a single, already-numeric target value, so no
# vocabulary is built for it.
LABEL = ttd.Field(is_target=True, use_vocab=False, sequential=False)

# (column name, field) pairs, listed in the same order as the CSV columns.
csv_columns = [('label', LABEL), ('data', TEXT)]

# Parse the CSV written earlier; the header row is skipped.
dataset = ttd.TabularDataset(
    path='thedata.csv',
    format='csv',
    fields=csv_columns,
    skip_header=True,
)



In [55]:
# Grab the first parsed example to inspect what the fields produced.
ex = dataset.examples[0]

In [56]:
# Check what kind of object the dataset stores for each row.
type(ex)

torchtext.data.example.Example

In [57]:
# Text column after the TEXT field's preprocessing: a list of lowercased tokens.
ex.data

['i', 'like', 'eggs', 'and', 'ham', '.']

In [58]:
# The label is still the raw CSV string here (note the quotes in the output);
# it only shows up as an integer tensor once batches are produced.
ex.label

'0'

In [59]:
# Fixed seed for the shuffle behind the split. Without an explicit random
# state the partition differs between kernel restarts, and so does the
# vocabulary built from the training split.
SPLIT_SEED = 42

# About 66% of the examples go to training and the rest to testing.
# `random_state` takes a state tuple in the format of `random.getstate()`;
# a private Random instance supplies it without touching the global RNG.
train_dataset, test_dataset = dataset.split(
    split_ratio=0.66,
    random_state=random.Random(SPLIT_SEED).getstate(),
)

In [60]:
# Build the token vocabulary from the training split only.
TEXT.build_vocab(train_dataset)

In [61]:
# Keep a handle on the vocabulary just built on TEXT and check its type.
vocab = TEXT.vocab
type(vocab)

In [62]:
# Bare class reference: displays torchtext's Vocab class.
# NOTE(review): presumably the type reported for `vocab` — confirm this cell is intentional.
torchtext.vocab.Vocab

In [63]:
# String-to-index mapping: token -> integer id.
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7dd132fd2ce0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'eggs': 2,
             'ham': 3,
             '!': 4,
             '?': 5,
             'and': 6,
             'i': 7,
             'just': 8,
             'like': 9,
             'or': 10})

In [64]:
# Index-to-string list: position i holds the token whose id is i.
vocab.itos

['<unk>', '<pad>', 'eggs', 'ham', '!', '?', 'and', 'i', 'just', 'like', 'or']

In [65]:
# Use the first CUDA GPU when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [66]:
# One batch iterator per split, each yielding at most 2 examples per batch,
# with tensors placed on `device`. The sort key is an example's token count.
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset),
    batch_sizes=(2, 2),
    sort_key=lambda example: len(example.data),
    device=device,
)

In [67]:
# Look at one training batch only. Shorter sentences are padded at the front
# with the `<pad>` index so every row has the same length.
first_train_batch = next(iter(train_iter), None)
if first_train_batch is not None:
    inputs, targets = first_train_batch
    print("inputs:", inputs, "shape:", inputs.shape)
    print("targets:", targets, "shape:", targets.shape)

inputs: tensor([[ 1,  1,  1,  2,  7,  9,  4],
        [ 3,  6,  2, 10,  8,  3,  5]], device='cuda:0') shape: torch.Size([2, 7])
targets: tensor([1, 1], device='cuda:0') shape: torch.Size([2])


In [68]:
# Look at one test batch only. Tokens missing from the training vocabulary
# appear as the `<unk>` index.
first_test_batch = next(iter(test_iter), None)
if first_test_batch is not None:
    inputs, targets = first_test_batch
    print("inputs:", inputs, "shape:", inputs.shape)
    print("targets:", targets, "shape:", targets.shape)

inputs: tensor([[7, 9, 2, 6, 3, 0]], device='cuda:0') shape: torch.Size([1, 6])
targets: tensor([0], device='cuda:0') shape: torch.Size([1])
