In [1]:
import functools

import datasets

import torchtext
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
seed = 0

torch.manual_seed(seed)

<torch._C.Generator at 0x7f31d8064b10>

In [3]:
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])

Reusing dataset imdb (/home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [4]:
train_data, test_data

(Dataset({
     features: ['label', 'text'],
     num_rows: 25000
 }),
 Dataset({
     features: ['label', 'text'],
     num_rows: 25000
 }))

In [5]:
train_data[0]

{'label': 1,
 'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'}

In [6]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [7]:
def tokenize_data(example, tokenizer, max_length):
    tokens = {'tokens': tokenizer(example['text'])}
    return tokens

In [8]:
max_length = 256

train_data = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})
test_data = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length})

Loading cached processed dataset at /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-7842cabd40e75acb.arrow
Loading cached processed dataset at /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-bfe5116f071dbe09.arrow


In [9]:
train_data

Dataset({
    features: ['label', 'text', 'tokens'],
    num_rows: 25000
})

In [10]:
train_data[0]

{'label': 1,
 'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!',
 'tokens': ['bromwell',
  'high',
  'is',
  'a',
  'cartoon',
  'comedy',
  '.',
  'it',
  'ran',
  'at',
  'the',
  'same',
  'time',
  'as',
  'some',
  'other',


In [11]:
test_size = 0.25

train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

Loading cached split indices for dataset at /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-8accbbb83db0be7f.arrow and /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-5ccc1cf561f81a9c.arrow


In [12]:
train_data[0]

{'label': 1,
 'text': "Made in 1946 and released in 1948, The Lady and Shanghai was one of the big films made by Welles after returning from relative exile for making Citizen Kane. Dark, brooding and expressing some early Cold War paranoia, this film stands tall as a Film-Noir crime film. The cinematography of this film is filled with Welles' characteristic quirks of odd angles, quick cuts, long pans and sinister lighting. The use of ambient street music is a precursor to the incredible long opening shot in Touch of Evil, and the mysterious Chinese characters and the sequences in Chinatown can only be considered as the inspiration, in many ways, to Roman Polanski's Chinatown. Unfortunately, it is Welles' obsession with technical filmmaking that hurts this film in its entirety. The plot of this story is often lost behind a sometimes incomprehensible clutter of film techniques.<br /><br />However, despite this criticism, the story combined with wonderful performances by Welles, Hayworth 

In [13]:
len(train_data), len(valid_data), len(test_data)

(18750, 6250, 25000)

In [14]:
min_freq = 5
special_tokens = ['<unk>', '<pad>']

vocab = torchtext.vocab.build_vocab_from_iterator(train_data['tokens'],
                                                  min_freq=min_freq,
                                                  specials=special_tokens)

In [15]:
len(vocab)

26232

In [16]:
vocab.get_itos()[:10]

['<unk>', '<pad>', 'the', '.', ',', 'and', 'a', 'of', 'to', "'"]

In [17]:
unk_index = vocab['<unk>']

unk_index

0

In [18]:
pad_index = vocab['<pad>']

pad_index

1

In [19]:
'some_token' in vocab

False

In [20]:
vocab.set_default_index(unk_index)

In [21]:
vocab['some_token']

0

In [22]:
def numericalize_data(example, vocab):
    ids = {'ids': [vocab[token] for token in example['tokens']]}
    return ids

In [23]:
train_data = train_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
valid_data = valid_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
test_data = test_data.map(numericalize_data, fn_kwargs={'vocab': vocab})

Loading cached processed dataset at /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-0711fb1236178dd5.arrow
Loading cached processed dataset at /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-50be229d07f157d2.arrow
Loading cached processed dataset at /home/ben/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-e956c9a4a1d5a9aa.arrow


In [24]:
train_data[0]

{'label': 1,
 'text': "Made in 1946 and released in 1948, The Lady and Shanghai was one of the big films made by Welles after returning from relative exile for making Citizen Kane. Dark, brooding and expressing some early Cold War paranoia, this film stands tall as a Film-Noir crime film. The cinematography of this film is filled with Welles' characteristic quirks of odd angles, quick cuts, long pans and sinister lighting. The use of ambient street music is a precursor to the incredible long opening shot in Touch of Evil, and the mysterious Chinese characters and the sequences in Chinatown can only be considered as the inspiration, in many ways, to Roman Polanski's Chinatown. Unfortunately, it is Welles' obsession with technical filmmaking that hurts this film in its entirety. The plot of this story is often lost behind a sometimes incomprehensible clutter of film techniques.<br /><br />However, despite this criticism, the story combined with wonderful performances by Welles, Hayworth 

In [25]:
train_data = train_data.with_format(type='torch', columns=['ids', 'label'])
valid_data = valid_data.with_format(type='torch', columns=['ids', 'label'])
test_data = test_data.with_format(type='torch', columns=['ids', 'label'])

Same thing as `set_format`, but not in-place.

In [26]:
train_data[0]

{'label': tensor(1),
 'ids': tensor([  100,    12,  7342,     5,   625,    12,  7079,     4,     2,   767,
             5,  5863,    17,    35,     7,     2,   205,   114,   100,    40,
          2123,   107,  3396,    43,  3393, 15072,    20,   258,  3734,  3415,
             3,   475,     4,  6163,     5,  9175,    56,   400,  1131,   350,
          4475,     4,    14,    23,  1375,  3863,    18,     6, 12608,   852,
            23,     3,     2,   606,     7,    14,    23,    10,  1121,    19,
          2123,     9,  7363,  9030,     7,  1031,  2460,     4,  1599,  1996,
             4,   217,  7583,     5,  2914,  1473,     3,     2,   362,     7,
         14835,   867,   232,    10,     6, 11870,     8,     2,  1039,   217,
           657,   326,    12,  1190,     7,   459,     4,     5,     2,  1289,
          1766,   111,     5,     2,   873,    12, 10531,    58,    70,    34,
          1166,    18,     2,  2967,     4,    12,   117,   751,     4,     8,
          4225,  3423,  

Use `output_all_columns=True` to keep non-converted columns.

In [27]:
class NBoW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        # text = [batch size, seq len]
        embedded = self.embedding(text)
        # embedded = [batch size, seq len, embedding dim]
        pooled = embedded.mean(dim=1)
        # pooled = [batch size, embedding dim]
        prediction = self.fc(pooled)
        # prediction = [batch size, output dim]
        return prediction

In [28]:
vocab_size = len(vocab)
embedding_dim = 300
output_dim = len(train_data.unique('label'))

model = NBoW(vocab_size, embedding_dim, output_dim, pad_index)

In [29]:
vectors = torchtext.vocab.FastText()

In [30]:
hello_vector = vectors.get_vecs_by_tokens('hello')

In [31]:
hello_vector.shape

torch.Size([300])

In [32]:
hello_vector

tensor([-1.5945e-01, -1.8259e-01,  3.3443e-02,  1.8813e-01, -6.7903e-02,
        -1.3663e-01, -2.5559e-01,  1.1000e-01,  1.7275e-01,  5.1971e-02,
        -2.3302e-02,  3.8866e-02, -2.4515e-01, -2.1588e-01,  3.5925e-01,
        -8.2526e-02,  1.2176e-01, -2.6775e-01,  1.0072e-01, -1.3639e-01,
        -9.2658e-02,  5.1837e-01,  1.7736e-01,  9.4878e-02, -1.8461e-01,
        -4.2829e-02,  1.4114e-02,  1.6811e-01, -1.8565e-01,  3.4976e-02,
        -1.0293e-01,  1.7954e-01, -5.2766e-02,  7.2047e-02, -4.2704e-01,
        -1.1616e-01, -9.4875e-03,  1.4199e-01, -2.2782e-01, -1.7292e-02,
         8.2802e-02, -4.4512e-01, -7.5935e-02, -1.4392e-01, -8.2461e-02,
         2.0123e-01, -9.5344e-02, -1.1042e-01, -4.6817e-01,  2.0362e-01,
        -1.7140e-01, -4.9850e-01,  2.8963e-01, -1.0305e-01,  2.0393e-01,
         5.2971e-01, -2.5396e-01, -5.1891e-01,  2.9941e-01,  1.7933e-01,
         3.0683e-01,  2.5828e-01, -1.8168e-01, -1.0225e-01, -1.1435e-01,
        -1.6304e-01, -1.2424e-01,  3.2814e-01, -2.3

In [33]:
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

In [34]:
pretrained_embedding.shape

torch.Size([26232, 300])

In [35]:
model.embedding.weight

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506,  ...,  0.8200, -0.6332,  1.2948],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1483,  2.4187,  1.3279,  ..., -1.0328,  1.1305, -0.5703],
        ...,
        [-0.2703, -0.1223,  0.1723,  ...,  1.0298, -0.4671,  1.5620],
        [-0.4065, -1.0677,  0.3959,  ..., -0.0393, -1.2843, -1.2270],
        [-0.3655, -0.0831,  0.9841,  ...,  1.5278, -0.3701,  0.7942]],
       requires_grad=True)

In [36]:
pretrained_embedding

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0653, -0.0930, -0.0176,  ...,  0.1664, -0.1308,  0.0354],
        ...,
        [-0.3563,  0.1529, -0.6328,  ...,  0.2229,  0.8131, -0.2988],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1161, -0.0390,  0.1120,  ...,  0.0925, -0.1058,  0.5641]])

In [37]:
model.embedding.weight.data = pretrained_embedding

In [38]:
model.embedding.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0653, -0.0930, -0.0176,  ...,  0.1664, -0.1308,  0.0354],
        ...,
        [-0.3563,  0.1529, -0.6328,  ...,  0.2229,  0.8131, -0.2988],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1161, -0.0390,  0.1120,  ...,  0.0925, -0.1058,  0.5641]],
       requires_grad=True)

In [39]:
optimizer = optim.Adam(model.parameters())

In [40]:
criterion = nn.CrossEntropyLoss()

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [42]:
model = model.to(device)
criterion = criterion.to(device)

In [43]:
def collate(batch, pad_index):
    batch_ids = [i['ids'] for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_labels = [i['label'] for i in batch]
    batch_labels = torch.stack(batch_labels)
    batch = {'ids': batch_ids,
             'label': batch_labels}
    return batch

In [44]:
batch_size = 512

collate = functools.partial(collate, pad_index=pad_index)

train_dataloader = torch.utils.data.DataLoader(train_data, 
                                               batch_size=batch_size, 
                                               collate_fn=collate, 
                                               shuffle=True)

valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, collate_fn=collate)

In [45]:
def train(dataloader, model, criterion, optimizer, device):

    model.train()
    epoch_loss = 0
    epoch_accuracy = 0

    for batch in dataloader:
        tokens = batch['ids'].to(device)
        label = batch['label'].to(device)
        prediction = model(tokens)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()

    return epoch_loss / len(dataloader), epoch_accuracy / len(dataloader)

In [46]:
def evaluate(dataloader, model, criterion, device):
    
    model.eval()
    epoch_loss = 0
    epoch_accuracy = 0

    with torch.no_grad():
        for batch in dataloader:
            tokens = batch['ids'].to(device)
            label = batch['label'].to(device)
            prediction = model(tokens)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()

    return epoch_loss / len(dataloader), epoch_accuracy / len(dataloader)

In [47]:
def get_accuracy(prediction, label):
    batch_size = prediction.shape[0]
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [48]:
n_epochs = 10
best_valid_loss = float('inf')

for epoch in range(n_epochs):

    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'nbow.pt')
    
    print(f'epoch: {epoch+1}')
    print(f'train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}')
    print(f'valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}')

epoch: 1
train_loss: 0.690, train_acc: 0.565
valid_loss: 0.686, valid_acc: 0.635
epoch: 2
train_loss: 0.678, train_acc: 0.667
valid_loss: 0.671, valid_acc: 0.698
epoch: 3
train_loss: 0.654, train_acc: 0.721
valid_loss: 0.644, valid_acc: 0.728
epoch: 4
train_loss: 0.619, train_acc: 0.753
valid_loss: 0.611, valid_acc: 0.754
epoch: 5
train_loss: 0.579, train_acc: 0.784
valid_loss: 0.577, valid_acc: 0.778
epoch: 6
train_loss: 0.538, train_acc: 0.810
valid_loss: 0.541, valid_acc: 0.802
epoch: 7
train_loss: 0.497, train_acc: 0.835
valid_loss: 0.509, valid_acc: 0.822
epoch: 8
train_loss: 0.459, train_acc: 0.856
valid_loss: 0.481, valid_acc: 0.837
epoch: 9
train_loss: 0.425, train_acc: 0.871
valid_loss: 0.455, valid_acc: 0.848
epoch: 10
train_loss: 0.398, train_acc: 0.880
valid_loss: 0.434, valid_acc: 0.856


In [49]:
model.load_state_dict(torch.load('nbow.pt'))

test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)

print(f'test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}')

test_loss: 0.429, test_acc: 0.848


In [50]:
def predict_sentiment(text, model, tokenizer, vocab, device):
    tokens = tokenizer(text)
    ids = [vocab[t] for t in tokens]
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

In [51]:
text = "This film is terrible!"

predict_sentiment(text, model, tokenizer, vocab, device)

(0, 1.0)

In [52]:
text = "This film is great!"

predict_sentiment(text, model, tokenizer, vocab, device)

(1, 1.0)

In [53]:
text = "This film is not terrible, it's great!"

predict_sentiment(text, model, tokenizer, vocab, device)

(1, 0.8084420561790466)

In [54]:
text = "This film is not great, it's terrible!"

predict_sentiment(text, model, tokenizer, vocab, device)

(1, 0.8084420561790466)