In [None]:
import fastai
import pandas as pd
import torch
from torch.nn import Embedding

# The data
We will start working with a sample from the IMDB sentiment classification dataset. The goal for now is to understand the dataset and prepare it for working with neural nets.


In [None]:
# Location of the IMDB sample data, as returned by fastai's untar_data
imdb_sample_url = fastai.URLs.IMDB_SAMPLE
data_path = fastai.untar_data(imdb_sample_url)
data_path.ls()


Let's load and see the data:

In [None]:
# Read the texts CSV from the data folder and show its first rows
texts_csv_path = data_path/'texts.csv'
dataset = pd.read_csv(texts_csv_path)
dataset.head()

Each row corresponds to one example: the sentiment `label`, the input `text`, and a special field added by the fastai team to split the dataset into training and validation sets.

# Preparing the data
Before feeding a neural net with text, we need to turn it into sequences of numbers.
This can be done in many ways: an integer per word, char, subword, etc. 
Let's keep it simple for now and turn words into integer ids

In [None]:
# Use the text of the row labelled 0 as our working example
raw_text = dataset.loc[0, 'text']
raw_text

In [None]:
# Most basic tokenization possible: cut the text at every single space character
tokens = raw_text.split(sep=' ')
tokens

OK, we have a list of words for the text. In order to turn it into integer ids, we need to build a map word -> id and vice versa (id -> word). This is what is called a vocabulary, and it is an essential component of any deep learning NLP pipeline. Let's build our first vocabulary (just for this text sample):

In [None]:
# A set keeps a single copy of each token, so repeated tokens collapse into one
unique_tokens = {token for token in tokens}
unique_tokens

In [None]:
word_to_id= {word: i for i, word in enumerate(tokens)} # turns words into ids
this_id = word_to_id['this']
this_id

In [None]:
id_to_word = {i: word for i, word in enumerate(tokens)} # turns ids into words
id_to_word[this_id]

Now we can use our 'vocab' to turn our text into a sequence of numbers:

In [None]:
# Replace every token of the example by its id from the vocab
numericalized_tokens = list(map(word_to_id.__getitem__, tokens))
numericalized_tokens

In [None]:
# Next step: a vocab for the whole dataset.
# Start from an empty list that will collect the tokens of every text.
all_tokens = list()

In [None]:
# Tokenize every text and gather all tokens in one flat list.
# The list is rebuilt from scratch here, so re-running this cell does not
# append the same tokens a second time.
all_tokens = [token for text in dataset['text'] for token in text.split(' ')]
all_tokens[100:110]

In [None]:
# Total number of tokens collected, repeats included
len(all_tokens)

In [None]:
# Deduplicate the tokens; the count of distinct tokens is the size of our vocab
unique_all_tokens = set()
unique_all_tokens.update(all_tokens)
len(unique_all_tokens)

In [None]:
# Build the vocab.
# Iterating a set of strings directly gives an order that can change between
# kernel restarts, so we sort first to get the same ids on every run.
word_to_id = {word: i for i, word in enumerate(sorted(unique_all_tokens))}
# Derive the reverse mapping from word_to_id so the two can never disagree
id_to_word = {i: word for word, i in word_to_id.items()}
john_id = word_to_id['John']

In [None]:
id_to_word[john_id+1] # the word stored under the id right after John's

Now we have a really simple vocab for numericalizing our training/validation data.

# Turning words into vectors
We now have an id for every word. Almost every neural net for NLP uses these integer ids to look up a vector for the word (or character, or subword...) in its first layer. This is what is known as the Embedding layer. The embedding layer is basically a lookup table of size $V \times d$, where $V$ is the size of the vocab and $d$ the dimension of the embedding vector. Let's see how this works:


In [None]:
# The layer's weights are initialised randomly; fix the seed so the vectors
# displayed below are the same on every run.
torch.manual_seed(42)
vocab_size = len(word_to_id)  # V: one row of the lookup table per vocab entry
emb_dim = 50                  # d: length of each embedding vector
embedding_layer = Embedding(vocab_size, emb_dim)
embedding_layer

In [None]:
# Look up the vector stored for id 0, i.e. the first word of the vocab
first_word_id = torch.tensor(0) # the layer is fed torch.tensor objects rather than plain ints
v_0 = embedding_layer(first_word_id)
v_0

In [None]:
# Numericalize the first full example of the dataset
tokens = dataset['text'][0].split(' ')
numericalized_example = list(map(word_to_id.__getitem__, tokens))
length_message = 'text of length {} tokens'.format(len(numericalized_example))
length_message, numericalized_example, tokens

In [None]:
# Embed the whole example in one call: the result holds one vector per token
example_ids = torch.tensor(numericalized_example)
v_example = embedding_layer(example_ids)
v_example

In [None]:
v_example[0] # the embedding vector of the first token of the example

Good job! Now we have turned text into 'dense' real-valued vectors! 
Now, let's try to generalize this a little bit.

But first, let's try our vocab on text outside the IMDB sample dataset.


In [None]:
my_movie_review_text = 'Climax from Gaspar Noé is a shockingly beatiful movie'.split(' ')
# Looking up a word that is missing from the vocab raises a KeyError.
# We catch and show it so the failure is visible without aborting "Run All".
try:
    numericalized_movie_review = [word_to_id[w] for w in my_movie_review_text]
except KeyError as missing_word:
    print('KeyError: this word is not in the vocab ->', missing_word)

## Out of vocabulary words
What happened?

'Climax' is what is called an out-of-vocabulary word (or oov, unk...). This is an important thing to deal with when working with supervised learning for NLP, as our model is expected to work with text not seen during training, validation or test. The simplest way to deal with this is to add a special token to our vocabulary which will be assigned to every unknown word. 

But for this, let's first generalize our vocabulary functionality a little bit.



In [None]:
class Vocab:
    """Two-way mapping between words and contiguous integer ids starting at 0.

    Args:
        unk_symbol: token registered at construction time and used as the
            fallback for unknown words/ids (text vocabs only).
        is_label: when True, no unknown entry is registered, so every id
            corresponds to a real label and unknown lookups raise KeyError.
    """
    def __init__(self, unk_symbol='<unk>', is_label=False):
        # Ids start at 0 so that len(vocab) equals the number of entries and
        # the largest id is len(vocab) - 1.
        self.size = 0
        self.word_to_id = {}
        self.id_to_word = {}
        # Label vocabs carry no unknown entry; keep the attributes defined anyway
        self.has_unk = not is_label
        self.unk_symbol = None
        self.unk_id = None
        if self.has_unk:
            self.unk_symbol = unk_symbol
            self.unk_id = self.add_word(unk_symbol)
    def add_word(self, w):
        """Register `w` if it is new and return its id.

        For a word that is already known, the id it was first given is returned.
        """
        if w not in self.word_to_id:
            self.word_to_id[w] = self.size
            self.id_to_word[self.size] = w
            self.size += 1
        return self.word_to_id[w]
    def to_id(self, w):
        """Return the id of `w`, or the unk id when `w` is unknown.

        Raises:
            KeyError: `w` is unknown and this is a label vocab (no unk entry).
        """
        if w in self.word_to_id:
            return self.word_to_id[w]
        if not self.has_unk:
            raise KeyError(w)
        return self.unk_id
    def to_word(self, id):
        """Return the word stored under `id`, or the unk symbol when `id` is unknown.

        Raises:
            KeyError: `id` is unknown and this is a label vocab (no unk entry).
        """
        if id in self.id_to_word:
            return self.id_to_word[id]
        if not self.has_unk:
            raise KeyError(id)
        return self.unk_symbol
    def __len__(self):
        """Number of entries in the vocab (including the unk entry, if any)."""
        return self.size
vocab = Vocab()
vocab.to_id('Climax')

In [None]:
# Size reported by the vocab right after construction
len(vocab)

In [None]:
# Add 'Climax' to the vocab and show the value add_word returns
vocab.add_word('Climax')

In [None]:
# Build a vocab that covers every text of the dataset
full_vocab = Vocab()
for text in dataset['text']:
    words = text.split(' ')
    for w in words:
        full_vocab.add_word(w)
# Expected size: the number of distinct words found earlier, plus one for the unk token
len(full_vocab)
    

In [None]:
# Finally, let's go back to the unseen example from before
my_movie_review_text

In [None]:
# Numericalize the review with the full vocab; to_id falls back to the unk id for unknown words
numericalized_movie_review = list(map(full_vocab.to_id, my_movie_review_text))
numericalized_movie_review

In [None]:
# Map the ids back to words; ids of out-of-vocabulary words come back as the unk symbol.
# (A bare `numericalized_movie_review` line used to precede this; it was not the last
# expression of the cell, so it displayed nothing and has been dropped.)
[full_vocab.to_word(i) for i in numericalized_movie_review]

In [None]:
# Some words came back as unknown, one of them being a misspelling of "beautiful".
# Let's try again with the right spelling.
my_movie_review_text = 'Climax from Gaspar Noé is a shockingly beautiful movie'.split(' ')
numericalized_movie_review = list(map(full_vocab.to_id, my_movie_review_text))
numericalized_movie_review

In [None]:
# Round-trip a known word through the vocab instead of hard-coding its id:
# the numeric id depends on the order in which words were added.
beautiful_id = full_vocab.to_id('beautiful')
full_vocab.to_word(beautiful_id)

Now we have a working Vocab functionality with a simple tokenization mechanism (split by empty spaces), but what if we wanted a more general tokenizer with functions such as lowercasing, normalization, etc.?

Let's try to generalize this a little

In [None]:
class Tokenizer:
    """Callable that splits text on single spaces, optionally lowercasing each piece."""
    def __init__(self, lowercase=False):
        self.lowercase = lowercase
    def __call__(self, text):
        """Return the list of space-separated pieces of `text`."""
        pieces = text.split(' ')
        if self.lowercase:
            return [piece.lower() for piece in pieces]
        return pieces
my_tokenizer = Tokenizer(lowercase=True)
        

In [None]:
# Try the lowercasing tokenizer on a sample sentence
my_tokenizer('Climax is a horrible movie with nice music')

## Exercise 1
Now please build a new vocab by tokenizing the full dataset with lowercased words:

In [None]:
# Exercise: build the vocab for the full dataset from lowercased tokens
lowercased_vocab = Vocab()
for text in dataset['text']:
    # your code here
    pass  # placeholder so the cell is valid Python; replace it with your solution
len(lowercased_vocab) # We should get a smaller vocab

## Exercise 2
We have been focusing on text, but what about labels? Labels are frequently also text, like in our case, where we have `positive` and `negative`. Neural nets don't understand text, so what should we do? We need to turn them into numbers. The good news is that we can reuse our previous vocab to do this!

Please create the vocab for labels. In this case we do not want the vocab to contain an unknown label, so we will use the `is_label` parameter.

In [None]:
# is_label=True builds the vocab without registering an unknown entry
labels_vocab = Vocab(is_label=True)
# Exercise: add the dataset's labels to labels_vocab here

# labels_vocab.word_to_id # uncomment to check the generated vocab

Congrats! We have now almost everything we need to start training our network!