# Comparing the different tokenizers in the fastai library (word, subword, ...)

In [1]:
from fastai2.text.all import *

In [2]:
txt = 'This movie, which I just discovered at the video store, has apparently sit '

In [3]:
spacy = WordTokenizer()

In [4]:
next(iter(spacy([txt])))

(#15) ['This','movie',',','which','I','just','discovered','at','the','video'...]

In [5]:
toks = first(spacy([txt]))
toks

(#15) ['This','movie',',','which','I','just','discovered','at','the','video'...]

In [12]:
path = untar_data(URLs.IMDB)

In [13]:
files = get_text_files(path, folders = ['train', 'test', 'unsup'])

In [14]:
# Load the text of the first 2000 files into memory.
# `read_text()` opens, reads and closes each file; the earlier `o.open().read()`
# left every handle to be closed by the garbage collector.
# Assumes the items of `files` are pathlib.Path objects (the original already called `.open()` on them with no arguments).
txts = L(o.read_text() for o in files[:2000])

In [18]:
def subword(sz):
    """Tokenize the global `txt` with a subword tokenizer of vocab size `sz`.

    A fresh `SubwordTokenizer` is created, `setup` is called on the global
    corpus `txts`, and the tokenizer is applied to `txt`.

    Returns at most the first 40 tokens of the result.
    """
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    tokenized = first(tokenizer([txt]))
    return tokenized[:40]

In [19]:
txt= 'discover discovering discovered disco disc disk discord disconnect disconnected disconneting'

In [20]:
print(subword(1000))

['▁dis', 'co', 'ver', '▁dis', 'co', 'ver', 'ing', '▁dis', 'co', 'ver', 'ed', '▁dis', 'co', '▁dis', 'c', '▁dis', 'k', '▁dis', 'c', 'or', 'd', '▁dis', 'c', 'on', 'n', 'ect', '▁dis', 'c', 'on', 'n', 'ect', 'ed', '▁dis', 'c', 'on', 'ne', 'ting']


In [21]:
print(subword(10000))

['▁discover', '▁discover', 'ing', '▁discovered', '▁disco', '▁disc', '▁dis', 'k', '▁disco', 'rd', '▁disco', 'n', 'n', 'ect', '▁disco', 'n', 'n', 'ect', 'ed', '▁disco', 'nne', 'ting']


In [22]:
tkn = Tokenizer(spacy)
tkn(txt)

(#11) ['xxbos','discover','discovering','discovered','disco','disc','disk','discord','disconnect','disconnected'...]

In [None]:
type(tkn)

In [27]:
type(spacy),type(SubwordTokenizer(vocab_sz=100)) 
# note that these 2 are not inherited from Transform class so __call__ is very different

(fastai2.text.core.SpacyTokenizer, fastai2.text.core.SentencePieceTokenizer)

# Check understanding of gradient computation graph and requires_grad

![](require_grad_1.png)

In [174]:
# Fully tracked graph: r = sum(x**2 + 2*x**3), so dr/dx = 2*x + 6*x**2 (= 28 at x = 2).
x=torch.tensor([2.]*10, requires_grad=True)
y=x**2
z=0 # starts out as a plain Python int
z=z+x**3 # adding a tensor derived from x turns z into a tensor that requires grad
z = z*2
print(z.requires_grad)
r=(y+z).sum()
r

True


tensor(200., grad_fn=<SumBackward0>)

In [175]:
r.backward()
x.grad

tensor([28., 28., 28., 28., 28., 28., 28., 28., 28., 28.])

In [177]:
# Same computation, but z is detached before it is scaled: r evaluates to the same
# number, yet only the y = x**2 branch can carry gradient back to x.
x=torch.tensor([2.]*10, requires_grad=True)
y=x**2
z=0
z=z+x**3
print(z.requires_grad)
z = z.detach() # cut z off from the graph: nothing done to z from here on backpropagates through z to x
z = z*2 # multiplying by a plain number keeps z detached, so this step is not part of backprop
print(z.requires_grad,x.requires_grad)
r=(y+z).sum()
r

True
False True


tensor(200., grad_fn=<SumBackward0>)

In [178]:
r.backward()
x.grad # only the y path is backpropagated (d(x**2)/dx = 2*x = 4); the detached z path contributes nothing

tensor([4., 4., 4., 4., 4., 4., 4., 4., 4., 4.])

## Another example, adding another tensor with requires_grad=True

In [183]:
# No detach in this variant: z is multiplied by a second leaf tensor (`two`) that also
# requires grad, so r = sum(x**2 + x**3 * two) depends on both x and two.
x=torch.tensor([2.]*10, requires_grad=True)
y=x**2
z=0
z=z+x**3
# z = z.detach()

two = torch.tensor([2.]*10,requires_grad=True)
z = z*two

r=(y+z).sum()
r

tensor(200., grad_fn=<SumBackward0>)

In [184]:
r.backward()
x.grad,two.grad

(tensor([28., 28., 28., 28., 28., 28., 28., 28., 28., 28.]),
 tensor([8., 8., 8., 8., 8., 8., 8., 8., 8., 8.]))

With detach

![](require_grad_2.png)

In [20]:
x=torch.tensor([2.]*10, requires_grad=True)
y=x**2
z=0
z=z+x**3
z = z.detach() # from here on, nothing done to z backpropagates through z to x
print(z.requires_grad)
two = torch.tensor([2.]*10,requires_grad=True)
z = z*two # the product requires grad again because `two` does, so two's grad will be computed
# note that x still receives no gradient through z (only through y)
print(z.requires_grad)
r=(y+z).sum()
r

False
True


tensor(200., grad_fn=<SumBackward0>)

In [19]:
r.backward()
x.grad,two.grad

(tensor([4., 4., 4., 4., 4., 4., 4., 4., 4., 4.]),
 tensor([8., 8., 8., 8., 8., 8., 8., 8., 8., 8.]))

In [189]:
x=torch.tensor([2.]*10, requires_grad=True)
y=x**2
z=0
z=z+x**3
two = torch.tensor([2.]*10,requires_grad=True)
z = z*two
r=(y+z).sum() # r is built here, from the z that is still attached to the graph
print(z.requires_grad)
z = z.detach() # only rebinds the name z to a detached tensor; r was already computed above
# so the gradients that flow through the earlier z (to x and to two) are still calculated
print(z.requires_grad)
r

True
False


tensor(200., grad_fn=<SumBackward0>)

In [190]:
r.backward()
x.grad,two.grad

(tensor([28., 28., 28., 28., 28., 28., 28., 28., 28., 28.]),
 tensor([8., 8., 8., 8., 8., 8., 8., 8., 8., 8.]))

# Explaining truncated BPTT and how requires_grad behaves in LMModel3.forward()

In [1]:
from fastai2.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)

## Preparing data

In [2]:
Path.BASE_PATH = path
path.ls()

(#2) [Path('train.txt'),Path('valid.txt')]

In [3]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [4]:
# Strip every line and join them into one stream, with ' . ' as the separator.
text = ' . '.join(line.strip() for line in lines)
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [5]:
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [6]:
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [7]:
# Map each vocab word to its position in `vocab`, then encode the token stream as ids.
word2idx = {word: idx for idx, word in enumerate(vocab)}
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [8]:
# Each sample is (3 consecutive token ids, the id that follows them); windows start
# every 3 tokens, so the input windows do not overlap.
# NOTE(review): the stop bound len(nums)-4 skips the final window whenever len(nums)-4
# is a multiple of 3; len(nums)-3 would keep it — confirm whether that is intended.
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [9]:
bs = 64
# Split point at 80% of the sequences: seqs[:cut] and seqs[cut:] become the two datasets.
cut = int(len(seqs) * 0.8)
# Pass `bs` itself instead of repeating the literal 64, so the loaders cannot silently
# drift from the batch size used by the later cells (m = len(seqs)//bs, group_chunks(..., bs)).
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [10]:
m = len(seqs)//bs
m,bs,len(seqs)

(328, 64, 21031)

In [11]:
def group_chunks(ds, bs):
    """Reorder `ds` so every run of `bs` consecutive items takes one item from each of `bs` chunks.

    With m = len(ds) // bs, output position i*bs + j holds ds[i + m*j].
    Only the first m*bs items of `ds` are used; any remainder is left out.

    Returns a new `L`; `ds` itself is only indexed, never modified.
    """
    chunk_len = len(ds) // bs
    return L(ds[row + chunk_len*col] for row in range(chunk_len) for col in range(bs))
# Rebuild the loaders from regrouped data: each split is passed through group_chunks
# separately, using the same 80% split point as before.
cut = int(len(seqs) * 0.8)
# NOTE(review): drop_last=True and shuffle=False presumably keep the item order produced
# by group_chunks intact from batch to batch — confirm against the DataLoader docs.
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs), 
    group_chunks(seqs[cut:], bs), 
    bs=bs, drop_last=True, shuffle=False)

## Model

In [12]:
class LMModel3(Module):
    """Three-token language model whose hidden state `h` persists between forward calls.

    The print calls are deliberate instrumentation: they report `h.requires_grad`
    when the state is carried in, after its first update, and after it is detached.
    """
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # token id -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden   -> hidden
        self.h_o = nn.Linear(n_hidden,vocab_sz)      # hidden   -> vocab scores
        self.h = 0  # int 0 stands for "no hidden state yet"

    def forward(self, x):
        """Consume columns 0..2 of `x`, update `self.h`, and return `h_o` applied to the final state."""
        print('---Forward is called---')
        state_carried_over = not isinstance(self.h, int)
        if state_carried_over:
            # False here, because the previous forward call stored a detached h
            print(self.h.requires_grad)
        for step in range(3):
            token_emb = self.i_h(x[:, step])
            # Adding the embedding output always switches requires_grad back on for h,
            # so everything done with h from here is backpropagated (i_h, h_h and h_o).
            self.h = self.h + token_emb
            if step == 0:
                print(self.h.requires_grad)
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        # Keep the values of h but drop its history: later operations on h no longer
        # backpropagate into this call (unless h is combined with another grad-enabled tensor).
        self.h = self.h.detach()
        print(self.h.requires_grad)
        return out

    def reset(self):
        """Discard the stored hidden state."""
        # Author's observation: called once before training, once before validation and once at the end.
        print('---Reset is called---')
        self.h = 0

Back propagation through time (BPTT): Treating a neural net with effectively one layer per time step (usually refactored using a loop) as one big model, and calculating gradients on it in the usual way. To avoid running out of memory and time, we usually use truncated BPTT, which "detaches" the history of computation steps in the hidden state every few time steps.

In [22]:
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy, cbs=ModelResetter)

---Reset is called---


In [24]:
# learn.fit_one_cycle(1, 3e-3)