In [None]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [None]:
from fastbook import *
from IPython.display import display,HTML

## NLP Deep Dive: RNNs
- language model is a model that has been trained to guess what the next word in a text is (having read the ones before)
- called self supervised learning because we do not need to give labels to our model, just feed it lots and lots of text
- Self supervised learning is not usually used for the model that is trained directly, but instead used for pretraining a model used for transfer learning
- better results can occur if you fine tune the sequence based language model prior to fine tuning the classification model
    - for instance, for IMDB review sentiment analysis we can use 100,000 movie reviews to fine tune the pretrained model that was previously trained on Wikipedia articles.
    - this will result in a language model that is particularly good at predicting the next word of a movie review and keeping the style consistent.

- Known as the Universal Language Model Fine-tuning (ULMFiT) approach
- an extra stage of fine tuning a language model prior to transfer learning to a classification task resulted in significantly better predictions
- three stages of transfer learning in NLP
    - Wikitext Language Model -> IMDb Language Model -> IMDb Classifier

## Text Preprocessing
- Approach we take for single categorical variable
    - make a list of all possible levels of that categorical variable (called vocab)
    - replace each level with its index in the vocab
    - create an embedding matrix for this containing a row for each level (each item of the vocab)
    - use this embedding matrix as the first layer of a neural net

- The same can be done with text
- Concatenate all documents in our dataset into a long string and split it into words (tokens)
- Our independent variable will be the sequence of words starting with the first in our long list and ending with the second to last
- the dependent variable will be the sequence of words starting with the second word and ending with the last word
- vocab will consist of a mix of common words that are already in the vocabulary of the pretrained model and new words specific to our corpus (for the IMDb example, actor names or cinematographic terms)
- For words in the vocab of our pretrained model we will take the corresponding row in the embedding matrix. New words will be initialized with a random vector

- steps necessary to create a language model
    - Tokenization (converting text into a list of words)
    - Numericalization (make a list of all unique words that appear (the vocab), and convert each word into a number by looking up its index in the vocab)
    - Language model data loader creation
        - Fastai provides a LMDataLoader class which automatically handles creating a dependent variable that is offset from the independent variable by one token.
        - also handles details such as how to shuffle the training data in a way that the dependent and independent variables maintain their structure as required
    - Language model creation
        - using a recurrent neural network

## Tokenization
- three main approaches
    - Word-based: split a sentence on spaces while applying language-specific rules to try to separate parts of meaning when there are no spaces.
    - Subword-based: split words into smaller parts based on the most commonly occurring substrings. "occasion" might be "o c ca sion".
    - Character-based: split sentence into individual characters

In [None]:
# Word tokenization with fastai
from fastai.text.all import *
# IMDB dataset location as returned by untar_data; later cells pass it to
# get_text_files and to the DataBlock as the place to look for text files
path = untar_data(URLs.IMDB)

In [None]:
# collect the text files found under the train, test and unsup folders of `path`
imdb_folders = ["train", "test", "unsup"]
files = get_text_files(path, folders=imdb_folders)

In [None]:
# read the first review; the `with` block closes the file handle
# (a bare open().read() leaves the handle open until it is garbage collected)
with files[0].open() as f:
    txt = f.read()
# preview the first 75 characters
txt[:75]

In [None]:
# Build a word tokenizer and run it over a one-document batch
spacy = WordTokenizer()
txt_toks = spacy([txt])
# first() takes the result for our single document
toks = first(txt_toks)
# print a truncated representation of the tokens (limit of 30)
print(coll_repr(toks, 30))

In [None]:
# The Tokenizer class wraps the word tokenizer and adds fastai's extra functionality on top
tkn = Tokenizer(spacy)
tkn_toks = tkn(txt)
print(coll_repr(tkn_toks, 31))


In [None]:
# those starting with "xx" are special tokens
# xxbos indicates start of new text (beginning of stream)
# this lets the model know if it needs to forget what was said before (given it is the start of a new stream)

# xxmaj indicates the next word begins with a capital (we lowercased everything before)
# xxunk indicates the word is unknown

# see default rules
defaults.text_proc_rules

In [None]:
# Subword Tokenization
# word tokenization relies on the assumption that spaces provide a useful separation of components of meaning in a sentence
# subword tokenization is best used where that assumption fails: languages that do not have spaces (Chinese) or use few spaces (Hungarian)
# two steps
    # analyze a corpus of documents to find the most commonly occurring groups of letters (these become the vocab)
    # tokenize the corpus using this vocab of subword units

In [None]:
# we instantiate our tokenizer passing in the size of the vocab
# we need to train it, i.e. have it read our docs to find common sequences

def _read_utf8(fname):
    "Return the contents of `fname` decoded as UTF-8, or None if the file holds bytes that are not valid UTF-8."
    try:
        # `with` closes the handle as soon as the text has been read
        with open(fname, encoding="utf8") as f:
            return f.read()
    except UnicodeDecodeError:
        # a "bad character" file: signal the caller to skip it
        return None

# Read the first 2000 reviews, skipping any file with bad characters.
# Skipping while reading (rather than popping an index out of `files`) leaves `files`
# untouched, so re-running this cell gives the same result and nothing depends on the
# bad file sitting at one particular position.
# txts has to hold many documents: the subword tokenizer below is trained on it and a
# later cell slices txts[:200].
txts = L(t for t in map(_read_utf8, files[:2000]) if t is not None)

# `setup` is where the tokenizer gets trained
# fastai's data processing pipelines normally call setup for us;
# here everything is done by hand, so we call it explicitly
def subword(sz):
    "Set up a SubwordTokenizer with vocab size `sz` on `txts`, then return the first 40 tokens of `txt` joined by spaces."
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

subword(1000)

In [None]:
# if a smaller vocab is used each token will represent fewer characters
subword(200)

In [None]:
# picking a subword size represents a compromise
# larger vocab means fewer tokens per sentence which means faster training, less memory and less state for the model to remember
# the downside is larger embedding matrices which require more data

# subword tokenization provides a way to easily scale between character tokenization and word tokenization
# it has become increasingly popular over the last year

In [None]:
# Numericalization with fastai
# mapping tokens to integers
    # make a list of all possible levels of that categorical variable (vocab)
    # replace each level with its index in the vocab
# tokenize once and reuse the result for the preview instead of running tkn(txt) a second time
toks = tkn(txt)
print(coll_repr(toks, 31))

In [None]:
# Numericalize needs a tokenized corpus before setup can create its vocab.
# Tokenization takes a while, so this example only tokenizes a small subset (the first 200 texts).
docs200 = txts[:200]
toks200 = docs200.map(tkn)
toks200[0]

In [None]:
# create the vocab from the tokenized subset, then preview it (limit of 20)
num = Numericalize()
num.setup(toks200)
vocab_preview = coll_repr(num.vocab, 20)
vocab_preview

In [None]:
# special rules tokens appear first, then every word appears once in frequency order
# the defaults for Numericalize are min_freq=3, max_vocab=60000
# once it has been set up, the Numericalize object can be used as a function
all_nums = num(toks)
nums = all_nums[:20]
nums

In [None]:
# the tokens have been converted to a tensor of integers that our model can receive
# look every id up in the vocab to check that they map back to the original text
decoded = [num.vocab[o] for o in nums]
" ".join(decoded)

In [None]:
# now that we have numbers we need to put them in batches for our model

## Putting our Texts into Batches for a Language Model
- when dealing with images we need to resize them all to the same height and width before batching
- Text is different because we can't resize it
- We also need to be careful with order since we want our model to read text in order so it can predict the next word

In [None]:
# 90 tokens batched into 6 rows of 15
stream = "In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen we will study how we build a language model and train it for a while."
tokens = tkn(stream)
bs,seq_len = 6,15
# one row per batch item: consecutive, non-overlapping slices of seq_len tokens
rows = [tokens[start:start+seq_len] for start in range(0, bs*seq_len, seq_len)]
d_tokens = np.array(rows)
df = pd.DataFrame(d_tokens)
table_html = df.to_html(index=False,header=None)
display(HTML(table_html))

In [None]:
# In a perfect world we could give this one batch to our model, but this doesn't scale
# Unlikely a single batch containing several million lines of text would fit into GPU memory
# we need to divide this array more finely into subarrays
    # the model we use will maintain a state so that it remembers what it read previously when predicting what comes next

# choose a sequence length of 5, we first feed the following array
bs, seq_len = 6, 5
row_len = 15  # length of each row in the 6x15 layout shown in the previous cell
# first seq_len tokens of every row
row_starts = [i*row_len for i in range(bs)]
d_tokens = np.array([tokens[s:s+seq_len] for s in row_starts])
df = pd.DataFrame(d_tokens)
table_html = df.to_html(index=False, header=None)
display(HTML(table_html))

In [None]:
# then this one
#hide_input
bs,seq_len = 6,5
row_len = 15  # length of each row in the original 6x15 layout
# second chunk of seq_len tokens from every row
row_starts = [i*row_len for i in range(bs)]
d_tokens = np.array([tokens[s+seq_len:s+2*seq_len] for s in row_starts])
df = pd.DataFrame(d_tokens)
table_html = df.to_html(index=False,header=None)
display(HTML(table_html))

In [None]:
# finally
#hide_input
bs,seq_len = 6,5
row_len = 15  # length of each row in the original 6x15 layout
# third (last) chunk of seq_len tokens from every row: offsets 10..15 within the row
row_starts = [i*row_len for i in range(bs)]
d_tokens = np.array([tokens[s+2*seq_len:s+3*seq_len] for s in row_starts])
df = pd.DataFrame(d_tokens)
table_html = df.to_html(index=False,header=None)
display(HTML(table_html))

In [None]:
# the first step is to transform the individual texts into a stream by concatenating them together
# we then cut this stream into batches
# if we have 50,000 tokens and set a batch size to 10 we need 10 mini streams of 5000 tokens
# we must preserve the order of the tokens so the model reads continuous rows of text

# all done with the fastai library LMDataLoader

In [None]:
# apply Numericalize object to the tokenized text
nums200 = toks200.map(num)

In [None]:
# pass to LMDataLoader
dl = LMDataLoader(nums200)

In [None]:
# pull the first batch out of the loader and look at the shapes of its two parts
first_batch = first(dl)
x, y = first_batch
(x.shape, y.shape)

In [None]:
# look at the first row of the independent variable to see the start of the first text
first_x_ids = x[0][:20]
" ".join(num.vocab[o] for o in first_x_ids)

In [None]:
# the dependent variable is the same thing offset by one token
first_y_ids = y[0][:20]
" ".join(num.vocab[o] for o in first_y_ids)

## Training a Text Classifier
- two steps
    - fine tune our language model pretrained on Wikipedia to the corpus of IMDB reviews
    - use that model to train a classifier

In [None]:
# Assemble data
# Language model data built with DataBlock
# fastai handles tokenization and numericalization when a TextBlock is passed to DataBlock

# item getter: text files from the train, test and unsup folders
get_imdb = partial(get_text_files, folders=["train", "test", "unsup"])

# describe the data first, then build the dataloaders from that description
imdb_lm_block = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1),
)
dls_lm = imdb_lm_block.dataloaders(path, path=path, bs=128, seq_len=80)

In [None]:
# we are not just using TextBlock directly but calling a class method
# TextBlock is special because it sets up the numericalizer's vocab that can take a long time
# It performs optimizations to save time such as 
    # saving tokenized documents in a temp folder, so it doesn't have to tokenize them more than once
    # runs multiple tokenization processes in parallel, to take advantage of a CPU

# We tell TextBlock how to access texts so it can do this preprocessing
# that's what from_folder does

In [None]:
# show batch
dls_lm.show_batch(max_n=2)

In [None]:
# Fine-Tuning the Language Model
# to convert the integer word indices into activations for our neural network we use embeddings
# we feed those embeddings into a recurrent neural network using an architecture called AWD-LSTM
# embeddings in the pretrained model are merged with random embeddings added for words that weren't in the pretraining vocabulary
# this is handled automatically inside language_model_learner

In [None]:
# metrics reported while training the language model
lm_metrics = [accuracy, Perplexity()]
# build the learner on the IMDB language-model dataloaders with the AWD_LSTM architecture
learn = language_model_learner(dls_lm, AWD_LSTM, drop_mult=0.3, metrics=lm_metrics)
# keep whatever to_fp16 returns as `learn`, exactly as the chained call did
learn = learn.to_fp16()

In [None]:
# loss function used by default is cross-entropy loss since we have a classification problem
# perplexity metric used here is often used with NLP for language models
# it is the exponential of the loss torch.exp(cross_entropy)
# we also include accuracy to see how many times our model is right when trying to predict the next word

In [None]:
# going back to our process diagram we have completed the first step (pretrained model in fastai)
# we have built the dataloaders and learner for the second step of tuning to the corpus of IMDB

In [None]:
# training each epoch takes a while, so we will save intermediate model results during the training process
# we train with fit_one_cycle; it does not save anything by itself, so checkpoints have to be saved explicitly (e.g. with learn.save)
# language_model_learner  automatically calls freeze when using a pretrained model
# this will only train the embeddings (the only part of the model that contains randomly initialized weights ie embedding for words that are in our IMDB vocab, but aren't in the pretrained model vocab)

In [None]:
learn.fit_one_cycle(1, 2e-2)