In [7]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
from torch import nn
import spacy
import numpy as np

In [8]:
spacy_en = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [9]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [10]:
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=150)
LABEL = data.Field(sequential=False, use_vocab=True)

In [11]:
pos = data.TabularDataset(
    path='./IMDB Dataset.csv', format='csv',skip_header = True,
    fields=[('text', TEXT),
        ('labels', LABEL)])

In [12]:
TEXT.build_vocab(pos, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(pos)

In [13]:
print(len(pos))

50000


In [14]:
train_iter = data.BucketIterator(pos,batch_size=500, shuffle = True)

In [15]:
print(type(train_iter))

<class 'torchtext.data.iterator.BucketIterator'>


In [16]:
batch = next(iter(train_iter))

print((batch))
print(batch.text)
print(batch.labels)


[torchtext.data.batch.Batch of size 500]
	[.text]:[torch.LongTensor of size 150x500]
	[.labels]:[torch.LongTensor of size 500]
tensor([[  13,   13,    2,  ...,   22, 2710,   12],
        [  25, 2099,   23,  ...,    6,    8,   94],
        [  19,    9,    9,  ...,  671,    2, 1161],
        ...,
        [ 935,   48,    1,  ...,    3,    2,   62],
        [  44,  168,    1,  ...,    9,  692, 2827],
        [ 337,    3,    1,  ...,   63,   42,    4]])
tensor([1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1,
        2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2,
        2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1,
        2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2,
        1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2,
        2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1

In [20]:
# Embedding layer
emb_dim = 100
vocab = TEXT.vocab
print(len(vocab))
embed = nn.Embedding(len(vocab), emb_dim)
#embed.weight.data.copy_(vocab.vectors)

146591


In [26]:
print(type(embed), embed.weight, len(embed.weight[0]))

<class 'torch.nn.modules.sparse.Embedding'> Parameter containing:
tensor([[-0.4102, -1.8895, -1.7509,  ..., -1.4829,  0.1591,  0.9026],
        [-0.0508, -1.4018, -1.0173,  ..., -0.0605,  0.5727,  0.6892],
        [ 0.2515,  2.5254, -0.9704,  ..., -1.4259, -0.6374,  1.4937],
        ...,
        [-0.8543, -1.0138,  1.1297,  ..., -0.3903, -2.5466,  0.7747],
        [ 0.7644,  0.9296,  1.3523,  ...,  1.0072,  0.3755, -0.4296],
        [-0.4578, -0.1371, -0.7040,  ..., -0.7005, -0.4511, -0.2482]],
       requires_grad=True) 100
