In [1]:
# Install (or upgrade, -U) fastbook with pip's output mostly silenced (-qq),
# then call fastbook's notebook setup routine. In the recorded run below the
# setup step reports mounting a drive at /content/gdrive.
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[K     |████████████████████████████████| 727kB 8.5MB/s 
[K     |████████████████████████████████| 204kB 43.8MB/s 
[K     |████████████████████████████████| 1.2MB 38.0MB/s 
[K     |████████████████████████████████| 51kB 8.0MB/s 
[K     |████████████████████████████████| 61kB 8.6MB/s 
[K     |████████████████████████████████| 51kB 7.5MB/s 
[?25hMounted at /content/gdrive


In [2]:
from fastbook import *
from IPython.display import display,HTML

## Tokenization

In [3]:
from fastai.text.all import *
# Fetch the IMDB dataset identified by URLs.IMDB; `path` is whatever location
# untar_data returns (presumably the extracted dataset folder — confirm in fastai docs).
path = untar_data(URLs.IMDB)

In [4]:
# Collect the text files under `path`, restricted to the train, test and unsup folders.
files = get_text_files(path, folders=['train', 'test', 'unsup'])

In [5]:
# Load the first review as a string. Path.read_text() opens the file, reads it
# and closes it again, whereas a bare `.open().read()` never closes the handle
# explicitly and leaves that to the garbage collector.
txt = files[0].read_text()
# Preview the first 100 characters.
txt[:100]

'Although Stardust seems to be a fantasy film with predictable ending and average performances, it is'

In [6]:
# Word-level tokenization. The tokenizer is called on a list of texts, so the
# single review is wrapped in a list and the first (only) result is pulled out.
spacy = WordTokenizer()
tokenized_batch = spacy([txt])
toks = first(tokenized_batch)
# Print the token count followed by the first 30 tokens.
print(coll_repr(toks, 30))

(#280) ['Although','Stardust','seems','to','be','a','fantasy','film','with','predictable','ending','and','average','performances',',','it','is','certainly','not','.','When','i','saw','the','movie',',','i','knew','it','was'...]


In [7]:
# Probe how the word tokenizer splits an abbreviation, currency symbols and a decimal number.
first(spacy(['The U.S. dollar $1 is $1.00.']))

(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']

In [8]:
# Wrap the word tokenizer in Tokenizer. As the output below shows, the wrapped
# version lower-cases the text and inserts special `xx...` tokens such as
# `xxbos` and `xxmaj`.
tkn = Tokenizer(spacy)
processed_toks = tkn(txt)
# Print the token count followed by the first 31 tokens.
print(coll_repr(processed_toks, 31))

(#297) ['xxbos','xxmaj','although','xxmaj','stardust','seems','to','be','a','fantasy','film','with','predictable','ending','and','average','performances',',','it','is','certainly','not','.','xxmaj','when','i','saw','the','movie',',','i'...]


In [9]:
# Read the first 2,000 reviews into memory. Path.read_text() closes each file
# as soon as it has been read; the previous `.open().read()` form left 2,000
# file handles to be cleaned up by the garbage collector.
txts = L(o.read_text() for o in files[:2000])

In [10]:
def subword(sz):
    """Preview subword tokenization of `txt` with a vocabulary of size `sz`.

    A SubwordTokenizer with `vocab_sz=sz` is set up on the corpus `txts`,
    then applied to the sample review `txt`.

    Returns the first 40 resulting tokens joined by single spaces.
    """
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

In [11]:
# Preview the sample review with a 1,000-entry subword vocabulary.
subword(1000)

'▁Al t ho ugh ▁St ard us t ▁seem s ▁to ▁be ▁a ▁fan t as y ▁film ▁with ▁pre d ic t able ▁end ing ▁and ▁a ver age ▁performance s , ▁it ▁is ▁certain ly ▁not . ▁When'

In [12]:
# A much smaller vocabulary (200): the output below shows words broken into very short pieces.
subword(200)

'▁A l th o u g h ▁S t ar d us t ▁ se e m s ▁to ▁be ▁a ▁f an t a s y ▁film ▁with ▁p re d ic t a b le ▁ en d'

In [13]:
# A larger vocabulary (10,000): in the output below most words are kept whole.
subword(10000)

'▁Although ▁Stardust ▁seems ▁to ▁be ▁a ▁fantasy ▁film ▁with ▁predictable ▁ending ▁and ▁average ▁performances , ▁it ▁is ▁certain ly ▁not . ▁When ▁i ▁saw ▁the ▁movie , ▁i ▁knew ▁it ▁was ▁going ▁to ▁be ▁one ▁of ▁my ▁favorite ▁movies .'

## Numericalization

In [14]:
# Tokenize the sample review once and reuse the result for the preview;
# the earlier version of this cell tokenized `txt` a second time just to print it.
toks = tkn(txt)
# Print the token count followed by the first 31 tokens.
print(coll_repr(toks, 31))

(#297) ['xxbos','xxmaj','although','xxmaj','stardust','seems','to','be','a','fantasy','film','with','predictable','ending','and','average','performances',',','it','is','certainly','not','.','xxmaj','when','i','saw','the','movie',',','i'...]


In [15]:
# Tokenize the first 200 reviews with `tkn`, then inspect the first tokenized review.
toks200 = txts[:200].map(tkn)
toks200[0]

(#297) ['xxbos','xxmaj','although','xxmaj','stardust','seems','to','be','a','fantasy'...]

In [16]:
# Set up a Numericalize transform on the 200 tokenized reviews; after setup
# its `vocab` attribute is available, and we preview the first 20 entries.
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab, 20)

"(#2168) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','and','a','of','to','is','in','it','i'...]"

In [17]:
# Numericalize the tokenized sample review and keep only the first 20 ids.
nums = num(toks)[:20]
nums

TensorText([  2,   8, 265,   8, 879, 238,  15,  49,  13, 448,  29,  30,   0, 339,  12,   0, 217,  11,  18,  16])

`2` is the index for `xxbos`, `8` is the index for `xxmaj`, and so on. We can map the indices back to their tokens:

In [18]:
# Look each id up in the vocabulary and join the tokens with spaces.
' '.join(num.vocab[o] for o in nums)

'xxbos xxmaj although xxmaj stardust seems to be a fantasy film with xxunk ending and xxunk performances , it is'

## Batch Sizes for Language Models

In [19]:
# Sample text used to illustrate how a token stream is laid out as a batch.
stream = "In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen we will study how we build a language model and train it for a while."
tokens = tkn(stream)

# Cut the stream into `bs` consecutive rows of `seq_len` tokens each.
bs, seq_len = 6, 15
rows = [tokens[start:start + seq_len] for start in range(0, bs * seq_len, seq_len)]
d_tokens = np.array(rows)

# Render the grid as an HTML table; index=False drops the index column.
df = pd.DataFrame(d_tokens)
table_html = df.to_html(index=False, header=None)
display(HTML(table_html))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
xxbos,xxmaj,in,this,chapter,",",we,will,go,back,over,the,example,of,classifying
movie,reviews,we,studied,in,chapter,1,and,dig,deeper,under,the,surface,.,xxmaj
first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into,numbers,and
how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have,another,example
of,the,preprocessor,used,in,the,data,block,xxup,api,.,\n,xxmaj,then,we
will,study,how,we,build,a,language,model,and,train,it,for,a,while,.


In [20]:
# Numericalize every tokenized review, then hand the resulting sequences to LMDataLoader.
nums200 = toks200.map(num)
dl = LMDataLoader(nums200)

In [21]:
# Take the first batch from the loader and compare the shapes of inputs and targets.
x,y = first(dl)
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [23]:
# Decode the first row of the input batch back into tokens.
' '.join(num.vocab[o] for o in x[0])

"xxbos xxmaj although xxmaj stardust seems to be a fantasy film with xxunk ending and xxunk performances , it is certainly not . xxmaj when i saw the movie , i knew it was going to be one of my favorite movies . xxmaj and i was right . \n\n xxmaj stardust is more of a xxunk than an adventure film . xxmaj it has this magical ' xxunk ' from the"

In [24]:
# Decode the first row of the target batch; in the output below it is the same
# text as the decoded x[0], shifted forward by one token.
' '.join(num.vocab[o] for o in y[0])

"xxmaj although xxmaj stardust seems to be a fantasy film with xxunk ending and xxunk performances , it is certainly not . xxmaj when i saw the movie , i knew it was going to be one of my favorite movies . xxmaj and i was right . \n\n xxmaj stardust is more of a xxunk than an adventure film . xxmaj it has this magical ' xxunk ' from the beginning"

## Training a Text Classifier

In [25]:
# File getter pre-bound to the three IMDB folders (train, test, unsup).
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])

# Describe the language-model dataset: text read from folders with is_lm=True,
# items gathered by `get_imdb`, and a RandomSplitter(0.1) train/validation split.
lm_block = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1),
)

# Build the DataLoaders with batches of 128 sequences, 80 tokens each.
dls_lm = lm_block.dataloaders(path, path=path, bs=128, seq_len=80)