In [None]:
import pandas as pd
import torch
import fastai

# Feeding data into the network
Modern neural nets are trained with what is called mini-batch gradient descent, which means that instead of feeding them one example at a time, we group the examples into mini-batches.

The goal now is to reuse what we've built so far to create an iterable Dataset which prepares the input data for the network.

Let's recap what we had

In [None]:
# Fetch the IMDB sample data set through fastai and load its CSV into a DataFrame.
# NOTE(review): presumably `untar_data` downloads/extracts the archive and returns a
# pathlib-style folder path (the `/` join below relies on that) — confirm for your fastai version.
data_path = fastai.untar_data(fastai.URLs.IMDB_SAMPLE)
dataset = pd.read_csv(data_path/'texts.csv')
# Preview the first rows.
dataset.head()

In [None]:
class Tokenizer:
    """Callable that splits a text on single spaces, optionally lowercasing each piece."""

    def __init__(self, lowercase=False):
        # When True, every token is lowercased after splitting.
        self.lowercase = lowercase

    def __call__(self, text):
        """Return the list of space-separated pieces of `text`."""
        pieces = text.split(' ')
        if self.lowercase:
            return [piece.lower() for piece in pieces]
        return pieces
    
class Vocab:
    """Bidirectional mapping between words and consecutive integer ids.

    Args:
        unk_symbol: token registered as the fallback for unknown words.
        is_label: when True no unknown token is registered, so the first
            added entry gets id 0 and lookups of unseen entries have no
            fallback to return.
    """

    def __init__(self, unk_symbol='<unk>', is_label=False):
        self.size = 0
        self.word_to_id = {}
        self.id_to_word = {}
        # Label vocabularies skip the unknown token; text vocabularies
        # register it first, so it receives the first id.
        if not is_label:
            self.unk_symbol = unk_symbol
            self.unk_id = self.add_word(unk_symbol)

    def add_word(self, w):
        """Register `w` if it is new and return the id assigned to `w`."""
        if w not in self.word_to_id:
            self.word_to_id[w] = self.size
            self.id_to_word[self.size] = w
            self.size += 1
        # Look the id up instead of returning `self.size - 1`, which is only
        # right for a word that has just been appended.
        return self.word_to_id[w]

    def to_id(self, w):
        """Return the id of `w`, or the unknown id when `w` was never added."""
        return self.word_to_id[w] if w in self.word_to_id else self.unk_id

    def to_word(self, id):
        """Return the word stored under `id`, or the unknown symbol when `id` is not registered."""
        return self.id_to_word[id] if id in self.id_to_word else self.unk_symbol

    def __len__(self):
        """Number of registered entries (including the unknown token, if any)."""
        return self.size

## Our first nn dataset

Luckily, modern libraries provide simple ways to create suitable datasets. Let's use the `Dataset` utility from PyTorch — a simple abstract class built around `__len__` and `__getitem__` — to create a CSV dataset.

In [None]:
from torch.utils.data import Dataset, DataLoader

class CsvClassificationDataset(Dataset):
    """Map-style dataset over a CSV file with `text` and `label` columns.

    Args:
        csv_path: path of the CSV file; it is read once, at construction time.
        transform: optional callable applied to each example dict
            `{'text': ..., 'label': ...}`; it must return a dict with the
            same two keys.
    """

    def __init__(self, csv_path, transform=None):
        self.df = pd.read_csv(csv_path)
        self.transform = transform

    def __len__(self):
        """Number of rows (examples) in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the `(text, label)` pair at position `idx`, after the optional transform."""
        # Fetch the row once instead of doing a positional lookup per column.
        row = self.df.iloc[idx]
        example = {'text': row['text'], 'label': row['label']}
        if self.transform:
            example = self.transform(example)
        return example['text'], example['label']
# Build the raw (untransformed) dataset from the same CSV and check how many examples it holds.
my_imdb_dataset = CsvClassificationDataset(data_path/'texts.csv')
len(my_imdb_dataset)

In [None]:
# Indexing yields one example as a (text, label) pair.
my_imdb_dataset[0]

Good job! Now we can reuse the DataLoader class offered by PyTorch, which provides out-of-the-box support for batching examples and shuffling the dataset (an essential aspect for SGD to work).

In [None]:
# Group the raw examples into batches of eight, with shuffling enabled.
dataloader = DataLoader(my_imdb_dataset, batch_size=8, shuffle=True)

In [None]:
# Peek at a single mini-batch: printing every batch would dump the whole
# dataset into the cell output, and the loop index was never used.
example_batch = next(iter(dataloader))
print(example_batch)

Now we have mini-batches of examples, but remember that we need to turn everything into numbers and then tensors for our nn to understand the data.
Let's do it.

In [None]:
# We first create our text and label vocabs.
# A single lowercasing tokenizer instance is used to split every field.
text_tokenizer = Tokenizer(lowercase=True)
def make_vocab(fields, is_label=False):
    """Build a Vocab holding every token found in `fields`.

    Each entry of `fields` is split with the module-level `text_tokenizer`;
    `is_label` is forwarded unchanged to the Vocab constructor.
    """
    result = Vocab(is_label=is_label)
    # Flatten the fields into one lazy token stream, preserving their order.
    tokens = (tok for field in fields for tok in text_tokenizer(field))
    for tok in tokens:
        result.add_word(tok)
    return result
# Build both vocabularies from the DataFrame columns; the label vocab is created with is_label=True.
text_vocab = make_vocab(dataset['text'])
label_vocab = make_vocab(dataset['label'], is_label=True)
# Show the label-to-id mapping.
label_vocab.word_to_id

In [None]:
# Now all we have left is to create our data transformation pipeline
# which is tokenize --> numericalize (using vocab) --> to tensor
class Tokenize:
    """Transform step that runs a tokenizer over both fields of a sample dict."""

    def __init__(self, tokenizer):
        # Any callable mapping a string to a list of tokens.
        self.tokenizer = tokenizer

    def __call__(self, sample):
        """Return a new dict whose 'text' and 'label' values are tokenized."""
        tokenized = {}
        for field in ('text', 'label'):
            tokenized[field] = self.tokenizer(sample[field])
        return tokenized

class Numericalize:
    """Transform step that maps tokens to vocabulary ids and wraps them in tensors."""

    def __init__(self, text_vocab, label_vocab):
        self.text_vocab = text_vocab
        self.label_vocab = label_vocab

    def _numericalize(self, toks, vocab):
        """Return the list of ids that `vocab.to_id` assigns to `toks`."""
        ids = []
        for tok in toks:
            ids.append(vocab.to_id(tok))
        return ids

    def __call__(self, sample):
        """Convert a tokenized sample into a dict of tensors.

        'text' becomes a tensor of all token ids; 'label' becomes a scalar
        tensor built from the id of the first label token only.
        """
        text_ids = self._numericalize(sample['text'], self.text_vocab)
        label_ids = self._numericalize(sample['label'], self.label_vocab)
        return {
            'text': torch.tensor(text_ids),
            'label': torch.tensor(label_ids[0]),
        }


In [None]:
from torchvision import transforms
# Chain the two steps so each sample is tokenized first and numericalized second.
transform = transforms.Compose([Tokenize(text_tokenizer),
                                Numericalize(text_vocab, label_vocab)])

In [None]:
# Same CSV as before, but every example now goes through the transform pipeline.
imdb_dataset = CsvClassificationDataset(data_path/'texts.csv', transform=transform)

In [None]:
# Inspect one transformed example.
imdb_dataset[3]

## Texts vary in length!
We are almost done, except for one thing: movie reviews, like almost any other text, have variable length! This is a problem for neural networks, which expect inputs (and batches) to be of a certain shape. It is another feature that makes processing text with neural networks different from other types of input, such as images.

Without going too much into the details, we need to do what is called 'padding', which basically adds a padding token to the end of the text until it reaches a certain length.

In [None]:
def pad_tensor(vec, pad, dim):
    """Pad `vec` with zeros at the end of dimension `dim` up to size `pad`.

    Args:
        vec: tensor to pad.
        pad: target size of dimension `dim`; must be at least `vec.size(dim)`.
        dim: dimension along which the zeros are appended.

    Returns:
        A tensor with the same dtype and device as `vec` whose size along
        `dim` is `pad`; all other dimensions are unchanged.
    """
    pad_size = list(vec.shape)
    pad_size[dim] = pad - vec.size(dim)
    # new_zeros inherits vec's dtype and device, so padding neither changes
    # the dtype of non-int64 input nor breaks for tensors off the default device.
    return torch.cat([vec, vec.new_zeros(pad_size)], dim=dim)
class PadCollate:
    """Collate function object that pads the first element of every sample to a common length."""

    def __init__(self, dim):
        # Dimension of each sample's first element along which padding is applied.
        self.dim = dim

    def pad_collate(self, batch):
        """Turn a list of `(x, y)` samples into a `(xs, ys)` pair of batched tensors.

        Every `x` is padded along `self.dim` to the largest size found in the
        batch, then the padded tensors are stacked on a new leading dimension.
        The `y` values are gathered into a LongTensor.
        """
        longest = max(sample[0].shape[self.dim] for sample in batch)
        padded_inputs = [
            pad_tensor(sample[0], pad=longest, dim=self.dim) for sample in batch
        ]
        targets = [sample[1] for sample in batch]
        xs = torch.stack(padded_inputs, dim=0)
        ys = torch.LongTensor(targets)
        return xs, ys

    def __call__(self, batch):
        """Make the instance usable directly as a `collate_fn`."""
        return self.pad_collate(batch)

In [None]:
# Rebuild the loader on the transformed dataset; PadCollate pads the texts of each batch to a common length.
dataloader = DataLoader(imdb_dataset, batch_size=16, shuffle=True, collate_fn=PadCollate(dim=0))
# Walk through every batch once to check that collation works end to end.
for i, example in enumerate(dataloader):
    texts, labels = example


# Our first neural network

Finally, we are ready to define and train our first neural network on text. Let's keep it as simple as possible:

In [None]:
# let's first import everything we need
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class IMDBClassifier(nn.Module):
    """Bag-of-embeddings classifier: embed the tokens, sum them, apply a one-hidden-layer MLP.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: dimensionality of each token embedding.
        hidden_size: width of the hidden layer.
        n_labels: number of output scores per example.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, n_labels):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.linear = nn.Linear(emb_size, hidden_size)
        self.out = nn.Linear(hidden_size, n_labels)

    def forward(self, texts):
        """Return unnormalised label scores of shape `(batch_size, n_labels)`.

        `texts` is expected to be a `(batch_size, num_tokens)` tensor of token ids.
        """
        embeddings = self.embedding(texts)
        # The embeddings have shape `(batch_size, num_tokens, emb_size)`, so we sum out
        # the `num_tokens` dimension to get one vector per example.
        summed = embeddings.sum(1)
        # Without a non-linearity the two linear layers collapse into a single
        # linear map and the hidden layer adds no modelling capacity.
        hidden = F.relu(self.linear(summed))
        return self.out(hidden)


In [None]:
# Instantiate the classifier with 50-dimensional embeddings and a 100-unit hidden layer.
model = IMDBClassifier(len(text_vocab), 50, 100, len(label_vocab))
# One entry per epoch: the sum of that epoch's mini-batch losses.
losses = []
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
for epoch in range(10):
    total_loss = 0
    for texts, labels in dataloader:
        # Clear the gradients accumulated by the previous step.
        model.zero_grad()
        out = model(texts)
        loss = loss_function(out, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # The original message contained a mis-encoded character after the period.
    print("Epoch: {}. Loss: {}".format(epoch, total_loss))
    losses.append(total_loss)
print(losses)