In [131]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torch.utils.data import DataLoader, Subset

import matplotlib.pyplot as plt
from IPython import display
display.set_matplotlib_formats('svg')

import warnings
warnings.filterwarnings('ignore')

In [19]:
from torch.utils.data.dataset import random_split
torch.manual_seed(1)

<torch._C.Generator at 0x12bbe09b0>

In [205]:
from datasets import load_dataset

train_dataset = load_dataset("imdb", split="train")
test_dataset = load_dataset("imdb", split="test")

trainset = Subset(train_dataset, range(0, 20000))
validset = Subset(train_dataset, range(20000, 25000))



In [153]:
len(trainset), len(validset)

(20000, 5000)

In [161]:
print(validset[4999])

{'text': 'The story centers around Barry McKenzie who must go to England if he wishes to claim his inheritance. Being about the grossest Aussie shearer ever to set foot outside this great Nation of ours there is something of a culture clash and much fun and games ensue. The songs of Barry McKenzie(Barry Crocker) are highlights.', 'label': 1}


In [None]:
import re
from collections import Counter, OrderedDict

In [162]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(
    '(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
    ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

In [163]:
token_counts = Counter()
for line in trainset:
    tokens = tokenizer(line['text'])
    token_counts.update(tokens)

In [112]:
train_dataset.dataset[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [164]:
print('Vocab-size:', len(token_counts))

Vocab-size: 68745


In [171]:
import torchtext.vocab 

sorted_by_freq_tuples = sorted(
    token_counts.items(), key=lambda x: x[1], reverse = True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

In [244]:
print(len(ordered_dict))

68745


In [201]:
vocab = vocab(sorted_by_freq_tuples)

68745lines [00:00, 198338.81lines/s]


TypeError: '<' not supported between instances of 'int' and 'str'

In [200]:
from torchtext.vocab import build_vocab_from_iterator as vocab

In [179]:
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[0, 0, 0, 0]


In [193]:
print(vocab['and'])

0


In [198]:
print(ordered_dict.get('the'))

266596


In [202]:
from torchtext.vocab import defaultdict

In [203]:
from transformers import AutoTokenizer

In [204]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [207]:
encoded_input = tokenizer(train_dataset[0], padding=True)

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [296]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [354]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors='pt')

In [355]:
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)



In [356]:
tokenized_datasets

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [357]:
tokenized_datasets.set_format("torch")

In [358]:
tokenized_datasets.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [362]:
print((tokenized_datasets[:1]['label'][0].float()).dtype)

torch.float32


In [361]:
tokenized_datasets['label'] = tokenized_datasets['label'].float()

TypeError: 'Dataset' object does not support item assignment

In [240]:
train_dl.dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [324]:
batch_size = 32
train_dl = DataLoader(tokenized_datasets, batch_size=batch_size, shuffle=True)

In [250]:
X, y, l, k, z = next(iter(train_dl))
print(X, y)

text label


In [226]:
train_dl.dataset[:2]['text']

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [227]:
embedding = nn.Embedding(
    num_embeddings = 10,
    embedding_dim = 3,
    padding_idx = 0)
text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 0]])
print(embedding(text_encoded_input))

tensor([[[-1.6755, -1.6676,  1.2994],
         [ 2.0843,  0.2214, -0.2201],
         [ 0.2629, -0.7579, -0.6529],
         [ 0.3606, -1.2950, -0.0311]],

        [[ 0.2629, -0.7579, -0.6529],
         [ 1.3744, -2.3288, -1.4609],
         [ 2.0843,  0.2214, -0.2201],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


In [230]:
class Rnn(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        #self.rnn = nn.LSTM(input_size, hidden_size, num_layers=2, batch_first=True)
        #self.rnn = nn.GRU(input_size, hidden_size, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        print(out.shape, out)
        x = self.fc(out)
        return x

model = Rnn(64, 32)
print(model)
model(torch.randn(5, 3, 64))
        

Rnn(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)
torch.Size([5, 32]) tensor([[-0.2105,  0.4410, -0.6201,  0.2708, -0.2334, -0.5282,  0.2158, -0.0867,
         -0.0335, -0.3827,  0.1983, -0.1298,  0.2245,  0.7044, -0.8089, -0.1554,
          0.4825, -0.0040, -0.2078, -0.0589, -0.5860,  0.6425,  0.7338,  0.1732,
          0.4231, -0.0920, -0.4155, -0.5559, -0.3367, -0.3995,  0.6020,  0.4761],
        [ 0.1690, -0.1267,  0.3722, -0.7329,  0.1795,  0.1470, -0.5854,  0.2531,
          0.0532, -0.0901,  0.0494,  0.2562, -0.2022,  0.3411,  0.2734, -0.0135,
          0.4384,  0.2889, -0.0985,  0.1023, -0.4628,  0.2223,  0.4363,  0.3439,
          0.2239,  0.2290,  0.1681, -0.3073, -0.5171, -0.0864,  0.2057,  0.5356],
        [ 0.5015,  0.3651, -0.5262,  0.0329, -0.2693, -0.1585, -0.0616,  0.0425,
         -0.3679,  0.6931, -0.2032, -0.1007,  0.5572,  0.8043, -0.3500, -0.2360,
          0.7649,  0.5358, -0.6805, -0.3419,  0.31

tensor([[-0.4998],
        [-0.0959],
        [-0.1825],
        [-0.1297],
        [-0.0831]], grad_fn=<AddmmBackward0>)

In [328]:
class Rnn(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim , padding_idx = 0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first = True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, text):
        out = self.embedding(text)
        #out = nn.utils.rnn.pack_padded_sequence( 
        #    out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out =  hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [329]:
vocab_size = len(ordered_dict)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
model = Rnn(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
print(model)

Rnn(
  (embedding): Embedding(68745, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [330]:
train_dl.dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [375]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for l in dataloader:
        pred = model(l['input_ids'])[:,0]
        loss = loss_fn(pred, l['label'].float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #total_acc += ((pred >= 0.5).float() == label_batch).sum().item()
        total_acc = 0
        total_loss += loss.item()*l['label'].size(0)
        
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [376]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [381]:
num_epochs = 10
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    print(f'Epoch {epoch} loss: {loss_train:.4f}')

Epoch 0 loss: 0.6911
Epoch 1 loss: 0.6854
Epoch 2 loss: 0.6842
Epoch 3 loss: 0.6821
Epoch 4 loss: 0.6696


KeyboardInterrupt: 

In [280]:
train_dl.dataset[0]['input_ids']

[101,
 146,
 12765,
 146,
 6586,
 140,
 19556,
 19368,
 13329,
 118,
 162,
 21678,
 2162,
 17056,
 1121,
 1139,
 1888,
 2984,
 1272,
 1104,
 1155,
 1103,
 6392,
 1115,
 4405,
 1122,
 1165,
 1122,
 1108,
 1148,
 1308,
 1107,
 2573,
 119,
 146,
 1145,
 1767,
 1115,
 1120,
 1148,
 1122,
 1108,
 7842,
 1118,
 158,
 119,
 156,
 119,
 10148,
 1191,
 1122,
 1518,
 1793,
 1106,
 3873,
 1142,
 1583,
 117,
 3335,
 1217,
 170,
 5442,
 1104,
 2441,
 1737,
 107,
 6241,
 107,
 146,
 1541,
 1125,
 1106,
 1267,
 1142,
 1111,
 1991,
 119,
 133,
 9304,
 120,
 135,
 133,
 9304,
 120,
 135,
 1109,
 4928,
 1110,
 8663,
 1213,
 170,
 1685,
 3619,
 3362,
 2377,
 1417,
 14960,
 1150,
 3349,
 1106,
 3858,
 1917,
 1131,
 1169,
 1164,
 1297,
 119,
 1130,
 2440,
 1131,
 3349,
 1106,
 2817,
 1123,
 2209,
 1116,
 1106,
 1543,
 1199,
 3271,
 1104,
 4148,
 1113,
 1184,
 1103,
 1903,
 156,
 11547,
 1162,
 1354,
 1164,
 2218,
 1741,
 2492,
 1216,
 1112,
 1103,
 4357,
 1414,
 1105,
 1886,
 2492,
 1107,
 1103,
 1244,
 13

In [312]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input)

{'input_ids': tensor([[  101,  1252,  1184,  1164,  1248,  6462,   136,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1790,   112,   189,  1341,  1119,  3520,  1164,  1248,  6462,
           117, 21902,  1643,   119,   102],
        [  101,  1327,  1164,  5450, 23434,   136,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [313]:
tokenized_datasets.set_format("torch")