In [1]:
import fastbook
fastbook.setup_book()

In [2]:
from fastbook import *

In [3]:
from fastai.text.all import *

# **Text Preprocessing**

# Tokenization

### Word Tokenization with FastAI

In [4]:
path = untar_data(URLs.IMDB)

NOTE: `get_text_files` gets all the text files in a path. We can also optionally pass `folders` to restrict the search to a particular list of subfolders:

In [5]:
path.ls()

(#7) [Path('/root/.fastai/data/imdb/imdb.vocab'),Path('/root/.fastai/data/imdb/tmp_lm'),Path('/root/.fastai/data/imdb/train'),Path('/root/.fastai/data/imdb/unsup'),Path('/root/.fastai/data/imdb/tmp_clas'),Path('/root/.fastai/data/imdb/test'),Path('/root/.fastai/data/imdb/README')]

In [5]:
files = get_text_files(path, folders = ["train", "test", "unsup"])

In [7]:
# `read_text` opens, reads and closes the file in one call; the previous
# `open().read()` never closed the handle explicitly.
txt = files[0].read_text()

txt[:75]

"Jack London's life was certainly colorful enough for a dozen films about di"

In [8]:
# Word tokenizer used for the rest of this section (bound to the name `spacy`).
spacy = WordTokenizer()
# Calling the tokenizer on a list of texts returns a lazy generator, so `first`
# is used to pull out the tokens of our single review.
toks = first(spacy([txt]))
print(coll_repr(toks, 30))  # abbreviated repr: shows the first 30 tokens

(#702) ['Jack','London',"'s",'life','was','certainly','colorful','enough','for','a','dozen','films','about','different','aspects','of','him','.','Sad','to','say','though','that','what','his','life','was','used','for','in'...]


In [9]:
spacy(['The U.S. dollar $1 is $1.00.'])

<generator object SpacyTokenizer.__call__.<locals>.<genexpr> at 0x7fb320927ac0>

In [10]:
first(spacy(['The U.S. dollar $1 is $1.00.']))

(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']

In [11]:
tkn = Tokenizer(spacy)

In [12]:
print(coll_repr(tkn(txt), 31))

(#804) ['xxbos','xxmaj','jack','xxmaj','london',"'s",'life','was','certainly','colorful','enough','for','a','dozen','films','about','different','aspects','of','him','.','xxmaj','sad','to','say','though','that','what','his','life','was'...]


In [13]:
coll_repr(tkn('©   Fast.ai www.fast.ai/INDEX'))

"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup'...]"

### Subword Tokenization with FastAI

In [14]:
files[0]

Path('/root/.fastai/data/imdb/train/neg/5625_4.txt')

In [15]:
# Load the first 2,000 reviews. `read_text` closes each file as soon as it has
# been read, whereas `open().read()` left every handle unclosed.
txts = L(o.read_text() for o in files[:2000])

We instantiate our tokenizer, passing in the size of the vocab we want to create, and then we need to "train" it. That is, we need to have it read our documents and find the common sequences of characters to create the vocab. This is done with `setup`. As we'll see shortly, `setup` is a special fastai method that is called automatically in our usual data processing pipelines. Since we're doing everything manually at the moment, however, we have to call it ourselves. Here's a function that does these steps for a given vocab size, and shows an example output:

In [16]:
def subword(sz):
    "Train a `SubwordTokenizer` with vocab size `sz` on `txts`; return the first 40 subword tokens of `txt`, space-separated."
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    # The subword vocab has to be learned from the corpus before the tokenizer can be used.
    tokenizer.setup(txts)
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

In [17]:
subword(500)

"▁J ack ▁L on d on ' s ▁life ▁was ▁c er t ain ly ▁co l or ful ▁enough ▁for ▁a ▁do z en ▁film s ▁about ▁d i ff er ent ▁as pe c t s ▁of ▁him"

In [18]:
subword(1000)

"▁J ack ▁Lo n d on ' s ▁life ▁was ▁certain ly ▁co l or ful ▁enough ▁for ▁a ▁do z en ▁films ▁about ▁different ▁a spect s ▁of ▁him . ▁S a d ▁to ▁say ▁though ▁that ▁what ▁his"

In [19]:
subword(2000)

"▁Jack ▁London ' s ▁life ▁was ▁certainly ▁color ful ▁enough ▁for ▁a ▁dozen ▁films ▁about ▁different ▁a spect s ▁of ▁him . ▁Sa d ▁to ▁say ▁though ▁that ▁what ▁his ▁life ▁was ▁used ▁for ▁in ▁film ▁was ▁some ▁war time"

In [20]:
subword(10000)

"▁Jack ▁London ' s ▁life ▁was ▁certainly ▁colorful ▁enough ▁for ▁a ▁dozen ▁films ▁about ▁different ▁aspects ▁of ▁him . ▁Sa d ▁to ▁say ▁though ▁that ▁what ▁his ▁life ▁was ▁used ▁for ▁in ▁film ▁was ▁some ▁war time ▁propaganda ▁that ▁put"

# Numericalization

In [21]:
toks = tkn(txt)
# Reuse `toks` instead of tokenizing the same review a second time.
print(coll_repr(toks, 31))

(#804) ['xxbos','xxmaj','jack','xxmaj','london',"'s",'life','was','certainly','colorful','enough','for','a','dozen','films','about','different','aspects','of','him','.','xxmaj','sad','to','say','though','that','what','his','life','was'...]


Just like with `SubwordTokenizer`, we need to call `setup` on `Numericalize`; this is how we create the vocab. That means we'll need our tokenized corpus first. Since tokenization takes a while, it's done in parallel by fastai; but for this manual walkthrough, we'll use a small subset:

In [22]:
toks200  = txts[:200].map(tkn)
toks200[0]

(#804) ['xxbos','xxmaj','jack','xxmaj','london',"'s",'life','was','certainly','colorful'...]

We can pass this to `setup` to create our vocab:

In [23]:
num = Numericalize()
# `setup` is what creates the vocab; here it is built from the 200 tokenized reviews.
num.setup(toks200)
coll_repr(num.vocab, 20)  # preview the first 20 vocab entries

"(#2016) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','and','of','to','is','i','it','in'...]"

Once we've created our `Numericalize` object, we can use it as if it were a function:

In [24]:
toks200[0]

(#804) ['xxbos','xxmaj','jack','xxmaj','london',"'s",'life','was','certainly','colorful'...]

In [25]:
nums = num(toks)[:20]
nums

TensorText([   2,    8,  597,    8,  367,   24,  145,   25,  368, 1477,  203,   28,   12, 1206,  132,   58,    0,  598,   14,  109])

In [26]:
nums = num(toks200[0])[:40]
nums

TensorText([   2,    8,  597,    8,  367,   24,  145,   25,  368, 1477,  203,   28,   12, 1206,  132,   58,    0,  598,   14,  109,   10,    8,  670,   15,  153,  146,   22,   63,   45,  145,   25,  505,
          28,   19,   34,   25,   70,    0, 1009,   22])

In [27]:
' '.join(num.vocab[o] for o in nums)

"xxbos xxmaj jack xxmaj london 's life was certainly colorful enough for a dozen films about xxunk aspects of him . xxmaj sad to say though that what his life was used for in film was some xxunk propaganda that"

# Putting Our Texts into Batches for a Language Model


In [28]:
stream = """ 
In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen we will study how we build a language model and train it for a while.
"""

In [29]:
tokens = tkn(stream)
bs, seq_len = 6, 15
# Lay the token stream out as `bs` consecutive rows of `seq_len` tokens each.
d_tokens = np.array([tokens[start:start+seq_len] for start in range(0, bs*seq_len, seq_len)])
df = pd.DataFrame(d_tokens)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,xxbos,xxmaj,in,this,chapter,",",we,will,go,back,over,the,example,of,classifying
1,movie,reviews,we,studied,in,chapter,1,and,dig,deeper,under,the,surface,.,xxmaj
2,first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into,numbers,and
3,how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have,another,example
4,of,the,preprocessor,used,in,the,data,block,xxup,api,.,\n,xxmaj,then,we
5,will,study,how,we,build,a,language,model,and,train,it,for,a,while,.


In a perfect world, we could then give this one batch to our model. But that approach doesn't scale, because outside of this toy example it's unlikely that a single batch containing all the texts would fit in our GPU memory (here we have 90 tokens, but all the IMDb reviews together give several million).


So, we need to divide this array more finely into subarrays of a fixed sequence length. It is important to maintain order within and across these subarrays, because we will use a model that maintains a state so that it remembers what it read previously when predicting what comes next.


Going back to our previous example with 6 batches of length 15, if we chose a sequence length of 5, that would mean we first feed the following array:

In [30]:
bs, seq_len = 6, 5
# First `seq_len` tokens of each of the `bs` rows (each row is 15 tokens long).
d_tokens = np.array([tokens[row_start:row_start+seq_len] for row_start in range(0, bs*15, 15)])
df = pd.DataFrame(d_tokens)
df

Unnamed: 0,0,1,2,3,4
0,xxbos,xxmaj,in,this,chapter
1,movie,reviews,we,studied,in
2,first,we,will,look,at
3,how,to,customize,it,.
4,of,the,preprocessor,used,in
5,will,study,how,we,build


Then, this:

In [31]:
bs, seq_len = 6, 5
# Second `seq_len`-token chunk of each of the `bs` rows (each row is 15 tokens long).
d_tokens = np.array([tokens[row_start+seq_len:row_start+2*seq_len] for row_start in range(0, bs*15, 15)])
df = pd.DataFrame(d_tokens)
df

Unnamed: 0,0,1,2,3,4
0,",",we,will,go,back
1,chapter,1,and,dig,deeper
2,the,processing,steps,necessary,to
3,xxmaj,by,doing,this,","
4,the,data,block,xxup,api
5,a,language,model,and,train


Finally, this:

In [32]:
bs, seq_len = 6, 5
# Third and last chunk of each 15-token row. The slice bounds are written in
# terms of `seq_len` (2*seq_len .. 3*seq_len) rather than the literals 10 and 15,
# consistent with how the preceding chunk is sliced.
d_tokens = np.array([tokens[i*15+2*seq_len:i*15+3*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
df

Unnamed: 0,0,1,2,3,4
0,over,the,example,of,classifying
1,under,the,surface,.,xxmaj
2,convert,text,into,numbers,and
3,we,'ll,have,another,example
4,.,\n,xxmaj,then,we
5,it,for,a,while,.


This is all done behind the scenes by the fastai library when we create an `LMDataLoader`. We do this by first applying our `Numericalize` object to the tokenized texts:

In [33]:
nums200 = toks200.map(num)

and then passing that to `LMDataLoader`:

In [34]:
dl = LMDataLoader(nums200)

Grab the first batch:

In [35]:
x,y = first(dl)
x.shape, y.shape

((64, 72), (64, 72))

Looking at the first row of the independent variable, which should be the start of the first text:

In [36]:
' '.join(num.vocab[o] for o in x[0][:40])

"xxbos xxmaj jack xxmaj london 's life was certainly colorful enough for a dozen films about xxunk aspects of him . xxmaj sad to say though that what his life was used for in film was some xxunk propaganda that"

The dependent variable is the same thing offset by one token:

In [37]:
' '.join(num.vocab[o] for o in y[0][:40])

"xxmaj jack xxmaj london 's life was certainly colorful enough for a dozen films about xxunk aspects of him . xxmaj sad to say though that what his life was used for in film was some xxunk propaganda that put"

**This concludes all the preprocessing steps we need to apply to our data. We are now ready to train our text classifier.**

# Training a Text Classifier

## Language Model Using DataBlock

In [6]:
get_imdb = partial(get_text_files, folders = ['train', 'test', 'unsup'])

In [7]:
# Language-model DataLoaders: text files are collected by `get_imdb`,
# and 10% of them are split off at random for validation.
dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(valid_pct=0.1),
).dataloaders(path, path=path, bs=128, seq_len=80)

In [40]:
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj i 've just revisited this fondly remembered bit of cinematic madness from my early days , and must urge you to beg steal or borrow it . \n\n xxmaj the story begins with a duel between a righteous xxmaj shaolin priest and our villain xxmaj abbot xxmaj white , needless to say , xxmaj abbot xxmaj white kicks xxmaj buddhist ass , and wages his campaign against xxmaj shaolin unhindered with the aid of his new ninja allies","xxmaj i 've just revisited this fondly remembered bit of cinematic madness from my early days , and must urge you to beg steal or borrow it . \n\n xxmaj the story begins with a duel between a righteous xxmaj shaolin priest and our villain xxmaj abbot xxmaj white , needless to say , xxmaj abbot xxmaj white kicks xxmaj buddhist ass , and wages his campaign against xxmaj shaolin unhindered with the aid of his new ninja allies ("
1,"xxmaj after a few times , i get bored and changed the channel . i still love xxmaj robert xxmaj culp and xxmaj patricia xxmaj crowley and xxmaj ray xxmaj milland in their roles but the story was weaker in this episode than in the others . xxmaj first , xxmaj robert xxmaj culp plays an investigator for xxmaj ray xxmaj milland 's character . xxmaj he hires him to investigate his young pretty wife played by xxmaj patricia xxmaj","after a few times , i get bored and changed the channel . i still love xxmaj robert xxmaj culp and xxmaj patricia xxmaj crowley and xxmaj ray xxmaj milland in their roles but the story was weaker in this episode than in the others . xxmaj first , xxmaj robert xxmaj culp plays an investigator for xxmaj ray xxmaj milland 's character . xxmaj he hires him to investigate his young pretty wife played by xxmaj patricia xxmaj crowley"


**Now that our data is ready, we can fine-tune the pretrained language model.**

## Fine-Tuning the Language Model

In [8]:
# AWD-LSTM language-model learner on the IMDb DataLoaders, reporting accuracy
# and perplexity, switched to half precision via `to_fp16`.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()],
).to_fp16()

In [9]:
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.004746,3.90099,0.30072,49.451393,34:16


**Save the Model**

In [43]:
learn.save('1epoch')

Path('/root/.fastai/data/imdb/models/1epoch.pth')

**Load the Model**

In [11]:
# learn = learn.load('1epoch')

**Continue Training**

In [10]:
learn.unfreeze()

In [11]:
learn.fit_one_cycle(5,2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.766863,3.752828,0.317781,42.641506,36:19
1,3.669888,3.663725,0.327585,39.006378,36:12
2,3.533029,3.604167,0.335046,36.751049,36:16
3,3.452684,3.572528,0.339135,35.606495,36:13
4,3.371913,3.57268,0.339655,35.611897,36:14


In [12]:
learn.export('/notebooks/FASTAI_2022/review_generator.pkl')

In [13]:
learn.save_encoder('/notebooks/FASTAI_2022/finetuned')

# **Text Generation**

In [14]:
TEXT = "I like this movie because"
N_WORDS = 50
N_SENTENCES = 2
# Generate N_SENTENCES separate continuations of the same prompt.
predicts = [
    learn.predict(TEXT, N_WORDS, temperature=0.75)
    for _ in range(N_SENTENCES)
]

In [15]:
print("\n".join(predicts))

i like this movie because it has the ability to make you feel clever and clever . The way the characters are portrayed is clever and i can not help but feel they are in one movie . i also like that this movie is also a movie that makes you feel good .
i like this movie because it has a little bit of a twist , but it 's a very good movie . I 'm a big fan of the Jay Leno & Letterman show . i am really impressed . i watch every show on TV & the networks keep


# **Creating Text Classifier DataLoaders**

In [17]:
# Classifier DataLoaders: the text block reuses the language model's vocab
# (`dls_lm.vocab`), labels are taken via `parent_label`, and the `test` folder
# is used as the validation set.
dls_class = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab), CategoryBlock),
    get_items=partial(get_text_files, folders=['train', 'test']),
    get_y=parent_label,
    splitter=GrandparentSplitter(valid_name='test'),
).dataloaders(path, path=path, bs=128, seq_len=72)

In [18]:
dls_class.show_batch(max_n=4)

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,"xxbos xxmaj some have praised _ xxunk _ as a xxmaj disney adventure for adults . i do n't think so -- at least not for thinking adults . \n\n xxmaj this script suggests a beginning as a live - action movie , that struck someone as the type of crap you can not sell to adults anymore . xxmaj the "" crack staff "" of many older adventure movies has been done well before , ( think _ the xxmaj dirty xxmaj dozen _ ) but _ atlantis _ represents one of the worse films in that motif . xxmaj the characters are weak . xxmaj even the background that each member trots out seems stock and awkward at best . xxmaj an xxup md / xxmaj medicine xxmaj man , a tomboy mechanic whose father always wanted sons , if we have not at least seen these before",neg
2,"xxbos xxmaj here are the matches . . . ( adv . = advantage ) \n\n xxmaj the xxmaj warriors ( ultimate xxmaj warrior , xxmaj texas xxmaj tornado and xxmaj legion of xxmaj doom ) v xxmaj the xxmaj perfect xxmaj team ( mr xxmaj perfect , xxmaj ax , xxmaj smash and xxmaj crush of xxmaj demolition ) : xxmaj ax is the first to go in seconds when xxmaj warrior splashes him for the pin ( 4 - 3 adv . xxmaj warriors ) . i knew xxmaj ax was n't a healthy man but if he was that unhealthy why bother have him on the card ? xxmaj this would be his last xxup ppv . xxmaj eventually , both xxmaj legion of xxmaj doom and xxmaj demolition job out cheaply via double disqualification ( 2 - 1 adv . xxmaj warriors ) . xxmaj perfect",neg
3,"xxbos xxmaj in xxup nyc , seaman xxmaj michael o'hara ( orson xxmaj welles ) rescues xxmaj elsa xxmaj bannister ( rita xxmaj hayworth ) from a mugging & rape as she takes a horse & carriage through xxmaj central xxmaj park -and lives to regret it . xxmaj xxunk - haired xxmaj hayworth 's a platinum blonde in this one ; as dazzling as fresh - fallen snow -but nowhere near as pure … \n\n xxmaj to reveal any more of the convoluted plot in this seminal "" noir "" would be criminal . xxmaj it 's as deceptive as the mirrors used to cataclysmic effect in the final scenes -but the film holds far darker secrets : xxmaj from the xxup ny xxmaj times : "" childhood xxmaj shadows : xxmaj the xxmaj hidden xxmaj story xxmaj of xxmaj the xxmaj black xxmaj dahlia xxmaj murder "" by",pos


## Create the model to classify texts

In [19]:
learn = text_classifier_learner(dls_class, AWD_LSTM, drop_mult=0.5, metrics=accuracy).to_fp16()

**Load the encoder of the fine-tuned language model**

In [20]:
learn = learn.load_encoder('/notebooks/FASTAI_2022/finetuned')

# Fine-tuning the classifier

In [21]:
learn.fit_one_cycle(1,2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.25374,0.191254,0.92648,01:59


In [23]:
learn.freeze_to(-2)

In [24]:
learn.fit_one_cycle(1, slice(1e-2/(2.6**4), 1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.227656,0.169693,0.93488,02:13


In [25]:
learn.freeze_to(-4)

In [26]:
learn.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.187208,0.153017,0.9438,03:41


In [27]:
learn.unfreeze()

In [28]:
learn.fit_one_cycle(2, slice(1e-3/(2.6**4), 1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.151935,0.15077,0.94492,03:45
1,0.133359,0.154359,0.9446,03:45


In [29]:
learn.export('/notebooks/FASTAI_2022/review_classifier.pkl')