In [1]:
from fastai.text.all import *
import pandas as pd

In [6]:
path = untar_data(URLs.IMDB)

https://docs.fast.ai/text.core.html

In [7]:
files = get_text_files(path, folders = ['train', 'test', 'unsup'])

In [9]:
txt = files[0].open().read()
txt[:75]

"It's about jealousy, it's about racism, it's about manipulation, but the un"

In [11]:
txts = L(o.open().read() for o in files[:200])
txts

(#200) ['It\'s about jealousy, it\'s about racism, it\'s about manipulation, but the underlying message is love. Geoffrey Sax tried to pull off Shakespeare\'s Othello, by bringing it to modern day context. However, the actors were not convincing enough to pull this off. There were extra bodies to help put everything in to perspective, however, John Othello, played by Eamonn Walker, over reacted a lot in this film, causing for the down fall of Keeley Hawes, Dessie Brabant, eventually ending in Dessie\'s death. <br /><br />Ben Jago, played by Christopher Eccleston, was seen as the main character in the film. He didn\'t give enough evidence for Dessie to be cheating on Othello, with Michael Cass, played by Richard Coyle. Instead he just played a friend to all and gave one reason as to why she "was" cheating. In the play, it took a lot more convincing from Iago to make Othello even suspect anything. This change made the movie more about rage for the wrong reasons, than what the book was ba

## Subword Tokenization

In [12]:
def subword(sz):
    sp = SubwordTokenizer(vocab_sz=sz)
    sp.setup(txts)
    return ' '.join(first(sp([txt]))[:40])

In [16]:
subword(10000)

"▁It ' s ▁about ▁jea lousy , ▁it ' s ▁about ▁racism , ▁it ' s ▁about ▁ manipulati on , ▁but ▁the ▁under ly ing ▁message ▁is ▁love . ▁Ge off re y ▁S ax ▁tri ed ▁to ▁pull"

## Numericalization

In [17]:
spacy = WordTokenizer()
tkn = Tokenizer(spacy)



In [36]:
toks200 = txts[:200].map(tkn)
toks200[0]

(#530) ['xxbos','xxmaj','it',"'s",'about','jealousy',',','it',"'s",'about'...]

In [23]:
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab,20)

"(#1920) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','and','a','to','of','is','i','it','in'...]"

In [25]:
toks = tkn(txt)

In [26]:
nums = num(toks)[:20]
nums

TensorText([   2,    8,   18,   25,   66,    0,   11,   18,   25,   66, 1416,
              11,   18,   25,   66,    0,   11,   31,    9,    0])

In [29]:
' '.join(num.vocab[o] for o in nums)

"xxbos xxmaj it 's about xxunk , it 's about racism , it 's about xxunk , but the xxunk"

In [33]:
toks[:20]

(#20) ['xxbos','xxmaj','it',"'s",'about','jealousy',',','it',"'s",'about'...]

## Batches

In [35]:
nums200 = toks200.map(num)

dl = LMDataLoader(nums200)

x,y = first(dl)
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [40]:
' '.join(num.vocab[o] for o in x[0][:20])

"xxbos xxmaj it 's about xxunk , it 's about racism , it 's about xxunk , but the xxunk"

In [41]:
' '.join(num.vocab[o] for o in y[0][:20])

"xxmaj it 's about xxunk , it 's about racism , it 's about xxunk , but the xxunk message"

## FastAI Transforms

In [43]:
files = get_text_files(path, folders = ['train','test'])
txts = L(o.open().read() for o in files[:2000])

In [44]:
tok = Tokenizer.from_folder(path)
tok.setup(txts)
toks = txts.map(tok)
toks[0]

(#530) ['xxbos','xxmaj','it',"'s",'about','jealousy',',','it',"'s",'about'...]

In [45]:
num = Numericalize()
num.setup(toks)
nums = toks.map(num)
nums[0][:10]

TensorText([   2,    8,   18,   23,   62, 6709,   11,   18,   23,   62])

In [46]:
nums_dec = num.decode(nums[0][:10])
nums_dec

(#10) ['xxbos','xxmaj','it',"'s",'about','jealousy',',','it',"'s",'about']

In [47]:
tok.decode(nums_dec)

"xxbos xxmaj it 's about jealousy , it 's about"

In [48]:
tok((txts[0], txt[1]))

((#530) ['xxbos','xxmaj','it',"'s",'about','jealousy',',','it',"'s",'about'...],
 (#2) ['xxbos','t'])

In [50]:
def f(x:int): return x+1
tfm = Transform(f)
tfm(2),tfm(2.0)

(3, 2.0)

In [51]:
@Transform
def f(x:int): return x+1
f(2),f(2.0)

(3, 2.0)

In [None]:
class NormalizeMean(Transform):
    def setups(self, items): self.mean = sum(items)/len(items)
    def encodes(self, x): return x-self.mean
    def encodes(self, x): return x+self.mean