# Prelims

In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
# from fastai import *
from fastai.text import *
from fastai.callbacks.tracker import *
import pdb

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Data

## WikiText

In [None]:
wiki_path = Path('data/wikitext/wikitext-2-raw')
# wiki_path = Path('data/wikitext/wikitext-103-raw')

In [None]:
# Load the three raw WikiText splits as whole strings.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# default locale encoding (cleanup() below strips non-ASCII characters, so the
# raw text is expected to contain some).
trn = (wiki_path/'wiki.train.raw').read_text(encoding='utf-8')
val = (wiki_path/'wiki.valid.raw').read_text(encoding='utf-8')
tst = (wiki_path/'wiki.test.raw').read_text(encoding='utf-8')

In [None]:
len(trn)
# 2:    10918892
# 103: 539566975

### clean text and save CSV

In [None]:
def remove_newline_spaces(x):
    "Drop one space directly before, then one directly after, each newline; collapse newline runs to one."
    for padded in (' \n', '\n '):
        x = x.replace(padded, '\n')
    return re.sub(r'(\n)+', '\n', x)

def remove_equals(x):
    "Delete every ' =' and then every '= ' (presumably WikiText's `= Title =` heading markers)."
    without_leading = x.replace(' =', '')
    return without_leading.replace('= ', '')

# Regex-substitution callback: tightens a spaced-out quotation, e.g. '" strings "' -> '"strings"'.
def despace_quotes(m):
    quoted = m.group(0)   # the whole match, quote marks included
    for spaced in ('" ', ' "'):
        quoted = quoted.replace(spaced, '"')
    return quoted

def cleanup(x):
    """Normalise a raw text string and return the cleaned copy.

    Steps, in order: `fix_html`, newline/space tidy-up, removal of '=' markers,
    re-attaching punctuation and brackets that are separated by a space,
    tightening of quoted spans, and finally deletion of all non-ASCII characters.
    """
    x = remove_equals(remove_newline_spaces(fix_html(x)))

    # (spaced form, tight form) pairs, applied one after another
    despaced = ((" \'", "\'"), (' ,', ','), (' .', '.'), (' :', ':'), (' ;', ';'),
                ('( ', '('), (' )', ')'), ('[ ', '['), (' ]', ']'))
    for spaced, tight in despaced:
        x = x.replace(spaced, tight)

    x = re.sub(r'\"(.+?)\"', despace_quotes, x)
    return re.sub(r'[^\x00-\x7F]+', '', x)   # drop every run of non-ASCII characters

In [None]:
full = trn + val + tst
# len(trn)

In [None]:
full = cleanup(full)

In [None]:
full[0:5000]

In [None]:
lines = full.split('\n')
len(lines)

In [None]:
import textwrap

In [None]:
text = [textwrap.wrap(line, 1000) for line in lines]

In [None]:
len(text)

In [None]:
# flatten the per-line lists of wrapped chunks into a single list of strings
flat_list = [item for sublist in text for item in sublist]

In [None]:
len(flat_list)

In [None]:
df = pd.DataFrame({'text': flat_list})

In [None]:
# remove small lines (15 characters or fewer), then order the rest by length
df['char_len'] = df.text.map(len)
# sort_values returns a new frame; sorting the boolean-filtered slice in place
# is what pandas flags with SettingWithCopyWarning.
df = df[df['char_len'] > 15].sort_values('char_len')
df.head(5)

In [None]:
df['labels'] = [0] * len(df)
df.head()

In [None]:
CSV = 'wiki2.csv'
# wiki = pd.read_csv(wiki_path/CSV)
# Write only the 'labels' and 'text' columns, without the index.
# NOTE(review): PATH is first assigned further down the notebook (Combo / DataBunch
# sections), so this line relies on one of those cells having been run already — confirm.
df.to_csv(PATH/CSV, columns=['labels', 'text'], index=False)

In [None]:
wiki.head()

## IMDB

In [None]:
imdb_path = untar_data(URLs.IMDB)
imdb_path

In [None]:
CSV = 'texts.csv'
df = pd.read_csv(imdb_path/CSV)
len(df)

In [None]:
df['text'] = df.text.apply(lambda x: cleanup(x))

In [None]:
len(df)

In [None]:
# this also handles \n
text = [textwrap.wrap(line, 1000) for line in df.text.values]

In [None]:
# flatten the per-review lists of wrapped chunks into a single list of strings
flat_list = [item for sublist in text for item in sublist]

In [None]:
len(flat_list)

In [None]:
df = pd.DataFrame({'text': flat_list})

In [None]:
# remove small lines (15 characters or fewer), then order the rest by length
df['char_len'] = df.text.map(len)
# sort_values returns a new frame; sorting the boolean-filtered slice in place
# is what pandas flags with SettingWithCopyWarning.
df = df[df['char_len'] > 15].sort_values('char_len')
df.head(5)

In [None]:
CSV = 'imdb.csv'
imdb = pd.read_csv(imdb_path/CSV)
# df.to_csv(imdb_path/CSV, columns=['text', 'char_len'], index=False)

In [None]:
imdb.head()

## Combo

In [None]:
full = pd.concat([wiki, imdb], ignore_index=True)

In [None]:
full.sort_values('char_len', inplace=True)
full.reset_index(inplace=True, drop=True)
full.head(5)

In [None]:
len(full)

In [None]:
full['labels'] = [0] * len(full)
full.head()

In [None]:
PATH = Path('data/IAM_handwriting')
full.to_csv(PATH/'wiki103_imdb.csv', columns=['labels', 'text'], index=False)

### create csv from folders

In [None]:
for directory in os.listdir(PATH):
    if os.path.isdir(os.path.join(PATH, directory)):
        print(directory)

In [None]:
def add_to_df(path):
    """Read every file in directory `path` into a one-column ('text') DataFrame.

    path: a `Path` to a directory; every entry in it is opened as a text file.
    Returns a DataFrame with one row per file (in `os.listdir` order) holding the
    file's full contents. An empty directory gives an empty DataFrame.
    """
    rows = []
    for file in os.listdir(path):
        with open(path/file, 'r') as txt:
            rows.append({'text': txt.read()})
    # Build the frame once: calling DataFrame.append inside the loop copied the
    # whole frame for every file (quadratic) and was removed in pandas 2.0.
    return pd.DataFrame(rows)

In [None]:
# Read each split/label folder under PATH into its own DataFrame.
tst_neg = add_to_df(PATH/'test/neg')
tst_pos = add_to_df(PATH/'test/pos')
trn_neg = add_to_df(PATH/'train/neg')
# was 'train/neg': the negative training reviews were loaded twice and the positives never
trn_pos = add_to_df(PATH/'train/pos')
unsup = add_to_df(PATH/'unsup')

In [None]:
imdb = pd.concat([tst_neg, tst_pos, trn_neg, trn_pos, unsup], ignore_index=True)
len(imdb)

In [None]:
imdb.to_csv(PATH/'texts.csv', index=False)

## DataBunch

In [None]:
class CharTokenizer(BaseTokenizer):
    "Tokenizer that returns every single character of the text as its own token."
    def tokenizer(self, t:str):
        return [ch for ch in t]
    
class CharVocab(Vocab):
    "Character vocabulary: `itos` maps id -> char, `stoi` maps char -> id (unknown chars -> 3)."
    def __init__(self, itos:Collection[str]):
        self.itos = itos
        char_to_id = {ch: idx for idx, ch in enumerate(self.itos)}
        # any character missing from `itos` looks up as id 3
        self.stoi = collections.defaultdict(lambda: 3, char_to_id)

    def textify(self, nums:Collection[int], sep=''):
        "Map ids back to characters: joined with `sep`, or returned as a list when `sep` is None."
        chars = [self.itos[i] for i in nums]
        if sep is None:
            return chars
        return sep.join(chars)

In [None]:
PATH = Path('data/IAM_handwriting')
# Load the id -> character list. The context manager closes the file handle that
# the bare `open(...)` call used to leave open.
# NOTE: unpickling can execute arbitrary code — only load an itos.pkl you trust.
with open(PATH/'itos.pkl', 'rb') as f:
    itos = pickle.load(f)

vocab = CharVocab(itos)
# character-level tokenizer with no pre- or post-processing rules
toknizr = Tokenizer(tok_func=CharTokenizer, pre_rules=[], post_rules=[],
                    special_cases=['xxbos','xxeos','xxunk','xxpad'])

In [None]:
CSV = 'wiki2.csv' #'wiki103_imdb.csv' #'wiki103.csv' #'wiki2.csv'
data = TextLMDataBunch.from_csv(PATH, CSV, tokenizer=toknizr, vocab=vocab, min_freq=1)

In [None]:
data.show_batch()

# v1 ULMFit

In [None]:
import Levenshtein as Lev

class CER(Callback):
    "Metric callback reporting the epoch mean of the per-sample error returned by `cer`."
    def __init__(self):
        super().__init__()
        self.name = 'cer'

    def on_epoch_begin(self, **kwargs):
        # reset the running sums at the start of every epoch
        self.errors = 0
        self.total = 0

    def on_batch_end(self, last_output, last_target, **kwargs):
        batch_error, batch_size = cer(last_output, last_target)
        self.errors += batch_error
        self.total += batch_size

    def on_epoch_end(self, last_metrics, **kwargs):
        mean_error = self.errors/self.total
        return add_metrics(last_metrics, mean_error)

def cer(preds, targs):
    """Sum the per-sample character error rate over one batch.

    preds: model scores; the arg-max over the last dimension gives the predicted ids.
    targs: target ids, batch in the first dimension.
    Each sample contributes Levenshtein(target, prediction) / len(target), computed
    on the texts decoded by `char_label_text`.
    Returns (summed error, batch size).
    """
    bs = targs.size(0)
    res = torch.argmax(preds, dim=-1)
    error = 0
    for i in range(bs):
        p = char_label_text(res[i])
        t = char_label_text(targs[i])
        # a target that decodes to '' (all zeros) used to raise ZeroDivisionError;
        # normalise by 1 in that case so the raw edit distance is counted instead
        error += Lev.distance(t, p)/max(len(t), 1)
    return error, bs

def char_label_text(pred, sep=''):
    "Decode a tensor of character ids to text via the global `itos`, skipping every id 0."
    ids = to_np(pred).astype(int)
    kept = ids[np.nonzero(ids)]   # zeros are dropped (0 is the pad_token in this notebook's configs)
    return sep.join(itos[idx] for idx in kept)

In [None]:
config = dict(emb_sz=512, n_hid=1400, n_layers=3, pad_token=0, qrnn=False, bidir=False, output_p=0.2,
              hidden_p=0.2, input_p=0.5, embed_p=0.1, weight_p=0.4, tie_weights=True, out_bias=True)

learn = language_model_learner(data, AWD_LSTM, config=config, drop_mult=0.5,
                               pretrained=False, metrics=[accuracy, CER()])

In [None]:
learn.lr_find()
learn.recorder.plot()

In [None]:
lr=1e-3
learn.fit_one_cycle(10, lr, callbacks=[SaveModelCallback(learn, name='wiki2_lm')])
# 2.607412	2.547618	0.282488   AWD-LSTM (pretrained - last layer only)
# 1.313570	1.279599	0.609183   AWD-LSTM (512/1400) 1cycle, 7e-3
# 1.153841	1.180336	0.646971   AWD-LSTM (512/1400) 3cycle, 1e-3   'wiki2_lm'
# 1.269907	1.208987	0.638706   AWD-LSTM (512/1400) 3cycle, 1e-3, +drops  'wiki2_lm2'
# 1.261620	1.199914	0.640806   2nd run - 1cycle, 1e-3
# 1.534081	1.457398	0.570164   AWD-LSTM (256/1024) 3cycle, 3e-2

# 10cycle, 1e-3
# 1.144401	1.092210	0.671697	0.327961   AWD-LSTM (512/1400)   'wiki2_lm'

# 1.206823	1.095210	0.668147	3:42:52   512/1400, wiki103, lr:5e-3, 1cycle   'wiki103_lm'

# 2.948277	2.855124	0.202343   Transformer

In [None]:
learn.save('wiki103_lm')
learn.save_encoder('wiki103_lm_enc')

In [None]:
learn.unfreeze()
# only pretrained are frozen!!
learn.fit_one_cycle(5, lr)

# AWD_LSTM
# lr: 1e-1[/10] - 5 cycles, 
# 1.175062	1.160626	0.641956   AWD-LSTM baseline (vanilla databunch, vanilla AWD-LSTM pretrained)  'wiki2_base'
# 3.044392	3.052639	0.164598   Transformer (fail and too slow)

# 1.159140	1.091614	0.659921   defaults (emb_sz=400, n_hid=1150, n_layers=3), lr: 1e-1    'wiki2_tmp'
# 1.398861	1.354885	0.595028   emb_sz=512, n_hid=1200, pad_token=0, lr: 3e-2
# 1.882478	1.760121	0.501928   emb_sz=512, n_hid=1024, pad_token=0, lr: 1e-1
# 1.188713	1.174986	0.638500   512/1400, pad=0, lr: 7e-3    'wiki2_3'
# 1.186098	1.165006	0.642184   emb_sz=400, n_hid=1152, pad_token=0, pretrained, moms=(0.8,0.7)  'wiki2_4'
# 1.157077	1.158786	0.643875   emb_sz=400, n_hid=1152, pretrained  'wiki2_5'
# 1.181280	1.164976	0.641465   emb_sz=400, n_hid=1152, pad_token=0, pretrained   'wiki2_6'
# 1.476931	1.368694	0.583658   emb_sz=256, n_hid=1024, pad_token=0

In [None]:
learn.load('wiki2_lm')

In [None]:
learn.predict("This is a re", n_words=50, sep='')

In [None]:
learn.save_encoder('wiki2_lm_enc')

# v1 Transformer

## Loss and Metrics

In [None]:
def loss_prep(input, target):
    """Make `input` and `target` agree on sequence length, then merge batch and sequence dims.

    input:  (bs, sl, vocab) scores;  target: (bs, tsl) ids.
    Whichever is shorter is zero-padded at the end of its sequence dimension.
    Returns (pred, targ): pred is (bs*L, vocab) and targ is a long vector of
    length bs*L, where L = max(sl, tsl).
    """
    _, tsl = target.shape
    _, sl, vocab = input.shape
    gap = sl - tsl

    if gap > 0:
        # target is shorter: append `gap` zeros along its last (sequence) dim
        target = F.pad(target, (0, gap))
    elif gap < 0:
        # input is shorter: append all-zero score rows along its sequence dim
        # (not ideal: each added row is `vocab` logits that are all 0)
        input = F.pad(input, (0, 0, 0, -gap))

    pred = input.contiguous().view(-1, vocab)
    targ = target.contiguous().view(-1).long()
    return pred, targ

In [None]:
class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    Each target row holds `smoothing / n_classes` everywhere except the true class,
    which holds `1 - smoothing`. The summed KL divergence is divided by the number
    of rows produced by `loss_prep`.
    """
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred, target):
        pred, targ = loss_prep(pred, target)
        log_probs = F.log_softmax(pred, dim=-1)   # F.kl_div takes log-probabilities as its input
        n_rows, n_classes = log_probs.size(0), log_probs.size(1)

        # start every class at smoothing / n_classes ...
        true_dist = log_probs.data.clone()
        true_dist.fill_(self.smoothing / n_classes)
        # ... then overwrite each row's target class with the confidence
        true_dist.scatter_(1, targ.data.unsqueeze(1), self.confidence)

        total = F.kl_div(log_probs, true_dist, reduction='sum')
        return total / n_rows   # n_rows is bs * sequence length after loss_prep, not bs

In [None]:
import Levenshtein as Lev

class CER(Callback):
    """Metric callback: mean character error rate over an epoch.

    itos: id -> character list used to decode predictions and targets.
    Per sample the error is Levenshtein(target, prediction) / len(target), where both
    texts are decoded with every id 0 skipped.
    """
    def __init__(self, itos):
        super().__init__()
        self.name = 'cer'
        self.itos = itos

    def on_epoch_begin(self, **kwargs):
        # reset the running sums at the start of every epoch
        self.errors, self.total = 0, 0

    def on_batch_end(self, last_output, last_target, **kwargs):
        error,size = self._cer(last_output, last_target)
        self.errors += error
        self.total += size

    def on_epoch_end(self, last_metrics, **kwargs):
        return add_metrics(last_metrics, self.errors/self.total)

    def _cer(self, preds, targs):
        "Return (summed per-sample error, batch size) for one batch."
        bs, _ = targs.size()
        res = torch.argmax(preds, dim=2)
        error = 0
        for i in range(bs):
            p = self._char_label_text(res[i])
            t = self._char_label_text(targs[i])
            # a target that decodes to '' (all zeros) used to raise ZeroDivisionError;
            # normalise by 1 in that case so the raw edit distance is counted instead
            error += Lev.distance(t, p)/max(len(t), 1)
        return error, bs

    def _char_label_text(self, pred):
        "Decode a tensor of ids to a string, dropping every id 0."
        ints = to_np(pred).astype(int)
        nonzero = ints[np.nonzero(ints)]
        return ''.join([self.itos[i] for i in nonzero])

## Transformer Modules

In [None]:
LayerNorm = partial(nn.LayerNorm, eps=1e-4)  # accommodates mixed precision training

In [None]:
class SublayerConnection(nn.Module):
    "Pre-norm residual wrapper: returns x + dropout(sublayer(norm(x)))."
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        normed = self.norm(x)   # the norm is applied before the sublayer, not after
        update = self.dropout(sublayer(normed))
        return x + update

In [None]:
def clones(module, N):
    "Return an nn.ModuleList holding N independent deep copies of `module`."
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(deepcopy(module))
    return copies

In [None]:
class Encoder(nn.Module):
    "Stack of N copies of `layer`, followed by a final LayerNorm of width `layer.size`."
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask=None):
        out = x
        for block in self.layers:   # the same mask is passed to every layer
            out = block(out, mask)
        return self.norm(out)

In [None]:
class EncoderLayer(nn.Module):
    "One encoder block: self-attention, then feed-forward, each wrapped in a SublayerConnection."
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask=None):
        attn_block, ff_block = self.sublayer

        def self_attend(h):
            # query, key and value are all the tensor handed over by the sublayer wrapper
            return self.self_attn(h, h, h, mask)

        x = attn_block(x, self_attend)
        return ff_block(x, self.feed_forward)

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are query @ key^T / sqrt(depth), with depth the last dim of `query`.
    Where `mask == 0` the score is replaced by -1e4 before the softmax.
    `dropout`, if given, is a callable applied to the attention weights.
    Returns (weights @ value, weights).
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        # large negative score, so masked positions get (near-)zero weight
        scores = scores.masked_fill(mask == 0, -1e4)
    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    context = torch.matmul(weights, value)
    return context, weights

In [None]:
class SingleHeadedAttention(nn.Module):
    "Attention with one head: linear projections of q/k/v, `attention`, then an output projection."
    def __init__(self, d_model, dropout=0.2):
        super().__init__()
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None   # attention weights of the most recent forward pass
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        proj_q, proj_k, proj_v, proj_out = self.linears
        query, key, value = proj_q(query), proj_k(key), proj_v(value)
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        return proj_out(x)

In [None]:
class MultiHeadedAttention(nn.Module):
    "Attention over `h` heads of width d_model // h; `d_model` must be divisible by `h`."
    def __init__(self, d_model, h=8, dropout=0.2):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h   # value width is assumed equal to key width
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None          # attention weights of the most recent forward pass
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)   # extra dim so one mask broadcasts over the head dim
        bs = q.size(0)

        def split_heads(proj, t):
            # project, then reshape to (bs, h, -1, d_k)
            return proj(t).view(bs, -1, self.h, self.d_k).transpose(1, 2)

        # zip stops after three pairs, so the fourth linear is left for the output
        q, k, v = [split_heads(proj, t) for proj, t in zip(self.linears, (q, k, v))]
        x, self.attn = attention(q, k, v, mask=mask, dropout=self.dropout)

        # merge the heads back into one (bs, -1, h * d_k) tensor
        merged = x.transpose(1, 2).contiguous().view(bs, -1, self.h * self.d_k)
        return self.linears[-1](merged)

In [None]:
class GeLU(nn.Module):
    "Tanh approximation of GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."
    def forward(self, x):
        inner = math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

class PositionwiseFeedForward(nn.Module):
    "Feed-forward block d_model -> 4*d_model -> d_model, with GeLU then dropout in between."
    def __init__(self, d_model, dropout=0.2):
        super().__init__()
        d_hidden = d_model*4
        self.w_1 = nn.Linear(d_model, d_hidden)
        self.w_2 = nn.Linear(d_hidden, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GeLU()

    def forward(self, x):
        hidden = self.activation(self.w_1(x))
        return self.w_2(self.dropout(hidden))

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    d_model: width of the encoding (may be even or odd).
    dropout: dropout probability applied to the sum.
    max_len: number of positions precomputed; longer inputs are not supported.

    The table is stored as the buffer `pe` of shape (1, max_len, d_model): even
    columns hold sines and odd columns hold cosines of position * div_term.
    """
    def __init__(self, d_model, dropout=0.1, max_len=2000):
        super().__init__()

        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0.0, max_len).unsqueeze(1)
        log_increment = math.log(1e4) / d_model
        div_term = torch.exp(torch.arange(0.0, d_model, 2) * -log_increment)
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer cosine column than sine columns, so trim the
        # angles; for an even d_model the slice is the full tensor (unchanged result).
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe.unsqueeze_(0)

        # a buffer, not a parameter: not trained, but still saved in the state_dict
        self.register_buffer('pe', pe)    #(1,max_len,d_model)

    def forward(self, x):
        "x: (bs, sl, d_model) with sl <= max_len; the first sl rows of the table are added."
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
class Embeddings(nn.Module):
    "Embedding lookup for `vocab` ids whose output is multiplied by sqrt(d_model)."
    def __init__(self, d_model, vocab):
        super().__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(vocab, d_model)

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

## Architecture

In [None]:
# NOTE(review): this cell repeats the AWD-LSTM setup from the "v1 ULMFit" section.
# CER is redefined in "Loss and Metrics" above to take `itos`, so `CER()` below
# raises a TypeError once that cell has run — confirm whether this cell is still needed.
config = dict(emb_sz=512, n_hid=1400, n_layers=3, pad_token=0, qrnn=False, bidir=False, output_p=0.2,
              hidden_p=0.2, input_p=0.5, embed_p=0.1, weight_p=0.4, tie_weights=True, out_bias=True)

learn = language_model_learner(data, AWD_LSTM, config=config, drop_mult=0.5,
                               pretrained=False, metrics=[accuracy, CER()])

In [None]:
learn = LanguageLearner(data, model, split_func=meta['split_lm'], **learn_kwargs)

In [None]:
config = dict(ctx_len=512, n_layers=12, n_heads=8, d_model=768, d_head=64, d_inner=3072, resid_p=0.1, attn_p=0.1,
              ff_p=0.1, embed_p=0.1, output_p=0., bias=True, scale=True, act=Activation.GeLU, double_drop=False,
              tie_weights=True, out_bias=False, init=init_transformer, mask=True)

In [None]:
class Transformer(Module):
    """Stack of `n_layers` DecoderLayer blocks over token + position embeddings.

    NOTE(review): `Module`, `DecoderLayer`, `MultiHeadAttention` and `Activation` are not
    defined in this notebook — presumably they come from the fastai star import; confirm.
    NOTE(review): there is no `super().__init__()` call here; presumably fastai's `Module`
    takes care of that — confirm.
    """
    def __init__(self, vocab_sz:int, ctx_len:int, n_layers:int, n_heads:int, d_model:int, d_head:int, d_inner:int,
                 resid_p:float=0., attn_p:float=0., ff_p:float=0., embed_p:float=0., bias:bool=True, scale:bool=True,
                 act:Activation=Activation.ReLU, double_drop:bool=True, attn_cls:Callable=MultiHeadAttention,
                 learned_pos_enc:bool=True, mask:bool=True):
        self.mask = mask
        # token embedding
        self.encoder = nn.Embedding(vocab_sz, d_model)
        # learned position embedding over ctx_len positions, or the PositionalEncoding module.
        # NOTE(review): forward() calls pos_enc with position indices, while the
        # PositionalEncoding defined in this notebook adds its table to its input —
        # the learned_pos_enc=False path may not produce the intended encoding; confirm.
        self.pos_enc = nn.Embedding(ctx_len, d_model) if learned_pos_enc else PositionalEncoding(d_model)
        self.drop_emb = nn.Dropout(embed_p)
        self.layers = nn.ModuleList([DecoderLayer(n_heads, d_model, d_head, d_inner, resid_p=resid_p, attn_p=attn_p,
                      ff_p=ff_p, bias=bias, scale=scale, act=act, double_drop=double_drop,
                      attn_cls=attn_cls) for k in range(n_layers)])

    # no-op
    def reset(self): pass

    def forward(self, x):
        # x is unpacked as a 2-d tensor: (bs, x_len)
        bs, x_len = x.size()
        # position indices 0..x_len-1, same device and dtype as x
        pos = torch.arange(0, x_len, device=x.device, dtype=x.dtype)
        # position embeddings get a leading dim of 1 so they broadcast over the batch
        inp = self.drop_emb(self.encoder(x) + self.pos_enc(pos)[None]) #.mul_(self.d_model ** 0.5)
        # ones strictly above the diagonal, with two leading dims of 1; None when masking is off
        mask = torch.triu(x.new_ones(x_len, x_len), diagonal=1).byte()[None,None] if self.mask else None
        #[None,:,:None] for einsum implementation of attention
        for layer in self.layers: inp = layer(inp, mask=mask)
        return ([inp],[inp]) #For the LinearDecoder

In [None]:
class LM(nn.Module):
    "Bundles an embedding stage, an encoder and an output generator into one module."
    def __init__(self, encoder, embed, generator):
        super().__init__()
        self.encoder, self.embed, self.generator = encoder, embed, generator

    def forward(self, src, mask=None):
        "Embed `src`, then run the encoder on it with `mask`."
        embedded = self.embed(src)
        return self.encoder(embedded, mask)

    def generate(self, outs):
        "Apply the generator to `outs`."
        return self.generator(outs)

In [None]:
def make_model(vocab, d_model=512, N=4, drop=0.2):
    """Build the `LM`: N encoder layers, embeddings + positional encoding, linear generator.

    vocab:   vocabulary size (embedding rows and generator outputs).
    d_model: model width.
    N:       number of encoder layers.
    drop:    dropout probability for the feed-forward block, the sublayer
             connections and the positional encoding.
    Every parameter with more than one dimension is Xavier-uniform initialised.
    """
    # 8 heads; use SingleHeadedAttention(d_model) here for a single head
    attn = MultiHeadedAttention(d_model, 8)
    ff = PositionwiseFeedForward(d_model, drop)

    model = LM(
        Encoder(EncoderLayer(d_model, attn, ff, drop), N),
        nn.Sequential(
            # was `drops`, which is not a name in this function: a NameError, or a
            # silent pick-up of whatever notebook global happens to be called `drops`
            Embeddings(d_model, vocab), PositionalEncoding(d_model, drop, 2000)
        ),
        nn.Linear(d_model, vocab)
    )

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model

In [None]:
# Build a Learner around an image encoder feeding the transformer.
# NOTE(review): ResnetBase, Img2Seq and TeacherForce are not defined in this notebook —
# this cell looks carried over from another one; confirm it is still needed.
# NOTE(review): the make_model defined above takes (vocab, d_model, N, drop), so the
# `drops=`, `attn_type=` and `attn_heads=` keywords below do not match it; likewise the
# CER defined above requires `itos`, so `CER()` would fail — confirm the intended targets.
def make_learner(data, d_model, em_sz, N=4, drops=0.2, attn_type='multi', attn_heads=8):
    img_encoder = ResnetBase(em_sz, d_model)
    transformer = make_model(len(itos), d_model, N=N, drops=drops, attn_type=attn_type, attn_heads=attn_heads)
    net = Img2Seq(img_encoder, transformer)
    return Learner(data, net, loss_func=LabelSmoothing(smoothing=0.1),
                    metrics=[CER()], callback_fns=[TeacherForce])

In [None]:
# denoising auto-encoder
# Want to predict entire output including masked words

class LanguageModel(nn.Module):
    "Wraps an `lm` (with a forward and a `generate` method) and masks it with `subsequent_mask`."
    def __init__(self, lm):
        super(LanguageModel, self).__init__()
        self.lm = lm
    
    # no-op
    def reset(self): pass
    
    def forward(self, x):
        # mask size is the last dim of x
        out = self.lm.generate(self.lm(x, subsequent_mask(x.size(-1))))
        # the same tensor is returned three times
        # (presumably to match the output tuple the fastai learner expects — confirm)
        return out, out, out

    def greedy_decode(self, x):
        "Run up to sl+1 decoding steps without gradients; returns the stacked per-step outputs."
        with torch.no_grad():
            bs,sl = x.shape
            # running sequence of chosen ids, starting with a column of ones
            tgt = torch.ones((bs,1), dtype=torch.long, device=device)

            res = []
            for i in tqdm(range(sl+1)):
                mask = subsequent_mask(tgt.size(-1))
                # NOTE(review): the model is fed `x` on every step, with a mask sized by `tgt`;
                # `tgt` itself is never passed to the model — confirm this is intended.
                dec_outs = self.lm(x, mask)
                prob = self.lm.generate(dec_outs[:,-1])   # generator output for the last position
                res.append(prob)
                pred = torch.argmax(prob, dim=-1, keepdim=True)
                if (pred==0).all(): break                 # stop once every sample predicts id 0
                tgt = torch.cat([tgt,pred], dim=-1)
            # list of per-step tensors -> one tensor with the batch dim first
            out = torch.stack(res).transpose(1,0).contiguous()
            return out     
        
def subsequent_mask(size):
    "Lower-triangular (1, size, size) byte mask of ones, created on the global `device`."
    full = torch.ones((size,size), device=device)
    lower = torch.tril(full)
    return lower.byte().unsqueeze(0)

In [None]:
d_model = 512
lm = make_model(len(itos), d_model)
net = LanguageModel(lm)

learn = LanguageLearner(data, net, loss_func=LabelSmoothing(smoothing=0.1), metrics=[accuracy, CER(itos)], clip=0.25)

## Fit

In [None]:
learn.lr_find()
learn.recorder.plot()

In [None]:
lr=1e-1
learn.fit_one_cycle(1, lr)
# 2.607412	2.547618	0.282488   AWD-LSTM
# 2.948277	2.855124	0.202343   Transformer

In [None]:
learn.unfreeze()
learn.fit_one_cycle(5, lr/10)#, moms=(0.8,0.7))

# AWD_LSTM
# lr: 1e-1[/10] - 5 cycles, 
# 1.175062	1.160626	0.641956   AWD-LSTM baseline (vanilla databunch, vanilla AWD-LSTM pretrained)  'wiki2_base'
# 3.044392	3.052639	0.164598   Transformer (fail and too slow)

# 1.159140	1.091614	0.659921   defaults (emb_sz=400, n_hid=1150, n_layers=3), lr: 1e-1    'wiki2_tmp'
# 1.398861	1.354885	0.595028   emb_sz=512, n_hid=1200, pad_token=0, lr: 3e-2
# 0.113181	0.077785	0.987423   emb_sz=512, n_hid=1024, pad_token=0, bidir=True, lr: 1e-1  ***fail
# 1.882478	1.760121	0.501928   emb_sz=512, n_hid=1024, pad_token=0, lr: 1e-1    'wiki2_3'
# 1.186098	1.165006	0.642184   emb_sz=400, n_hid=1152, pad_token=0, pretrained, moms=(0.8,0.7)  'wiki2_4'

# v0.7 Language Model Loader

## AWD-LSTM Language Model

In [None]:
wd=1e-7
bptt=30  # back prop through time
bs=50
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [None]:
trn_dl = LanguageModelLoader(trn_idx, bs, bptt)
val_dl = LanguageModelLoader(val_idx, bs, bptt)
md = LanguageModelData(PATH, 0, len(itos), trn_dl, val_dl, bs=bs, bptt=bptt)

In [None]:
# overfitting - increase multiplier (0.7)
# underfitting - decrease multiplier (0.7)
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [None]:
em_sz,nh,nl = 400,1150,3
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]

In [None]:
lr=1e-3
learner.fit(lr, 1, wds=wd, use_clr=(10,2), cycle_len=2, best_save_name='wiki103_lm')

# wikitext2  15cycle(20,10), 1e-3
# 1.18086    1.189847   0.646    fastai LM    'wiki2_lm'

# wikitext103
# 1.070876   1.001363   0.696245

In [None]:
learner.load('wiki103_lm')

## Transformer Language Model

In [None]:
class LMLoader():
    """Iterate over a token-id stream as (input, target) batches of varying length.

    The stream is cut into `bs` equal columns. Each batch is a window of rows from that
    table, returned transposed, together with the same window shifted one row ahead.
    The first window is bptt+25 rows long (the widest expected; the original note says
    this avoids repeated CUDA buffer growth). Later windows draw their length from
    N(bptt, 5), with a 5% chance of N(bptt/2, 5), and are never shorter than 5.
    """
    def __init__(self, nums, vocab_len, bs, bptt):
        self.bs = bs
        self.bptt = bptt
        self.vocab_len = vocab_len
        self.data = self.batchify(nums)
        self.i = 0
        self.iter = 0
        self.n = len(self.data)

    def __iter__(self):
        self.i = self.iter = 0
        while self.i < self.n-1 and self.iter < len(self):
            if self.i != 0:
                base = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
                seq_len = max(5, int(np.random.normal(base, 5)))
            else:
                seq_len = self.bptt + 5 * 5
            batch = self.get_pair(self.i, seq_len)
            # the cursor advances by the drawn length, even if get_pair clipped the window
            self.i += seq_len
            self.iter += 1
            yield batch

    def __len__(self):
        return self.n // self.bptt - 1

    def batchify(self, data):
        "Trim `data` to a multiple of bs and lay it out as bs columns."
        n_rows = data.shape[0] // self.bs
        trimmed = np.array(data[:n_rows*self.bs])   # drop the remainder
        columns = trimmed.reshape(self.bs, -1).T
        return T(columns)

    def get_pair(self, i, seq_len):
        "Return rows [i, i+seq_len) and the same window shifted by one row, both transposed."
        seq_len = min(seq_len, len(self.data) - 1 - i)   # leave one row for the shifted target
        stop = i + seq_len
        inputs = self.data[i:stop]
        targets = self.data[i+1:stop+1]
        return inputs.transpose(1,0), targets.transpose(1,0)

In [None]:
bs, bptt = 50, 100

In [None]:
trn_dl = LMLoader(trn_idx, len(itos), bs, bptt)
val_dl = LMLoader(val_idx, len(itos), bs, bptt)

In [None]:
md = LanguageModelData(PATH, 0, len(itos), trn_dl, val_dl)

In [None]:
ii = iter(md.trn_dl)
x,y = next(ii)
x.shape, y.shape

In [None]:
char_label_text(x[2])

In [None]:
char_label_text(y[2])

## Denoising AutoEncoder LM

In [None]:
class DenoisingAutoEncoderLoader():
    """ Iterator over (noisy, clean) batches for a denoising auto-encoder.
    Window lengths are drawn as N(bptt,5); the first window is always bptt+25 rows.
    For every column of a window the partial words at both ends are cut off, a random number of
    characters is replaced, removed or inserted, and the noisy and clean sequences are each
    zero-padded into a 2d array.
    """
    def __init__(self, nums, vocab_len, bs, bptt):
        self.bs,self.bptt = bs,bptt
        self.vocab_len = vocab_len
        self.data = self.batchify(nums)
        self.i,self.iter = 0,0
        self.n = len(self.data)
        
    def __iter__(self):
        self.i,self.iter = 0,0
        while self.i < self.n-1 and self.iter<len(self):
            if self.i == 0:
                seq_len = self.bptt + 5 * 5
            else:
                # 5% of the time the window is centred on half of bptt
                bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
                seq_len = max(5, int(np.random.normal(bptt, 5)))
            res = self.get_pair(self.i, seq_len)
            # the cursor advances by the drawn length, even if get_pair clipped the window
            self.i += seq_len
            self.iter += 1
            yield res

    def __len__(self): return self.n // self.bptt - 1
    
    def batchify(self, data):
        nb = data.shape[0] // self.bs        # integer division into batches
        data = np.array(data[:nb*self.bs])   # remove remainder
        data = data.reshape(self.bs, -1).T   # reshape and transpose
        return data                          # output a numpy array (not a tensor)

    def get_pair(self, i, seq_len):
        # Returns (noisy, clean) as T(...) of two zero-padded 2d int arrays, one row per column of the window.
        seq_len = min(seq_len, self.n - 1 - i)
        arr = self.data[i:i+seq_len]    # (~125,50)

        # seq: (~bptt, bs)
        # need different scramble for each bs
        res,src = [],[]
        for b in range(arr.shape[1]):
            source = arr[:,b]
            
            # remove partial words from beginning and end
            # NOTE(review): a column with no id 1 raises IndexError on spaces[0], and a column
            # with a single id 1 gives an empty `source`, on which np.random.randint(0, ...)
            # below raises ValueError — short windows near the end of the data may hit this; confirm.
            spaces = np.where(source==1)[0]
            source = source[spaces[0]+1:spaces[-1]]
            src.append(source)
            
            seq = source.copy()
            
            # scramble
            # number of positions to touch: between 0 and 15% of bptt
            num = random.randint(0, math.floor(self.bptt * 0.15))
            idxs = np.random.randint(len(seq), size=num)
            # note: this loop variable reuses the name of the parameter `i`
            for i in idxs:
                if seq[i] not in [1,2]:   # don't modify ' ' or '\n'
                    prob = random.random()
                    if prob < 0.3:           #replacement
                        seq[i] = random.randrange(56,82) #self.vocab_len)  # only lowercase letters
                    elif 0.3 <= prob < 0.6:  #removal
                        seq[i] = 0
                    elif 0.6 <= prob < 0.9:  #addition
                        seq = np.insert(seq, i, random.randrange(56,82)) #self.vocab_len))
                    # prob >= 0.9 leaves the character unchanged
            # drop the zeros written by "removal"
            mask = seq.nonzero()
            res.append(seq[mask])
        
        # convert res to 2d numpy array w/ zero padding
        out_res = np.zeros([len(res), len(max(res,key = lambda x: len(x)))], dtype=int)
        for i,j in enumerate(res):
            out_res[i][0:len(j)] = j
            
        # same zero padding for the clean sequences
        out_src = np.zeros([len(src), len(max(src,key=lambda x: len(x)))], dtype=int)
        for i,j in enumerate(src):
            out_src[i][0:len(j)] = j
            
        return T(out_res), T(out_src)

In [None]:
bs, bptt = 50, 100

In [None]:
trn_dl = DenoisingAutoEncoderLoader(trn_idx, len(itos), bs, bptt)
val_dl = DenoisingAutoEncoderLoader(val_idx, len(itos), bs, bptt)

In [None]:
md = LanguageModelData(PATH, 0, len(itos), trn_dl, val_dl)

In [None]:
ii = iter(md.trn_dl)
x,y = next(ii)
x.shape, y.shape

In [None]:
char_label_text(x[2])

In [None]:
char_label_text(y[2])

# Helpers

## Loss and Metrics

In [None]:
def loss_prep(input, target):
    """Equalise the sequence lengths of `input` and `target`, then flatten batch and sequence dims.

    input:  (bs, sl, vocab) scores;  target: (bs, tsl) ids.
    The shorter one is padded with zeros at the end of its sequence dimension.
    Returns (pred, targ): pred has shape (bs*L, vocab), targ is a long vector of
    length bs*L, with L = max(sl, tsl).
    """
    _, tsl = target.shape
    _, sl, vocab = input.shape
    missing_in_target = sl - tsl
    missing_in_input = tsl - sl

    if missing_in_target > 0:
        # pad the last (sequence) dim of the ids on the right
        target = F.pad(target, (0, missing_in_target))
    if missing_in_input > 0:
        # pad the sequence dim of the scores on the right; the vocab dim is untouched
        # (not ideal: each added position is `vocab` logits that are all 0)
        input = F.pad(input, (0, 0, 0, missing_in_input))

    flat_target = target.contiguous().view(-1).long()
    flat_scores = input.contiguous().view(-1, vocab)
    return flat_scores, flat_target

In [None]:
class LabelSmoothing(nn.Module):
    """KL-divergence loss against a label-smoothed target distribution.

    Each target row holds `smoothing / n_classes` everywhere except the true class,
    which holds `1 - smoothing`. The summed KL divergence is divided by the batch size.
    """
    def __init__(self, smoothing=0.1):
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, pred, target):
        # Take the batch size from the target itself. The original divided by the
        # notebook-level global `bs`, which is undefined (or stale) whenever the
        # loader cell has not run or the batch is a different size.
        bs = target.size(0)
        pred,targ = loss_prep(pred, target)
        pred = F.log_softmax(pred, dim=-1)  # F.kl_div takes log-probabilities as its input
        true_dist = pred.data.clone()
        true_dist.fill_(self.smoothing / pred.size(1))                  # every class starts at smoothing / n_classes
        true_dist.scatter_(1, targ.data.unsqueeze(1), self.confidence)  # the target class gets the confidence
        return F.kl_div(pred, true_dist, reduction='sum')/bs

In [None]:
import Levenshtein as Lev

class CER(Callback):
    """Metric callback: mean character error rate over an epoch.

    itos: id -> character list used to decode predictions and targets.
    Per sample the error is Levenshtein(target, prediction) / len(target), where both
    texts are decoded with every id 0 skipped.
    """
    def __init__(self, itos):
        super().__init__()
        self.name = 'cer'
        self.itos = itos

    def on_epoch_begin(self, **kwargs):
        # reset the running sums at the start of every epoch
        self.errors, self.total = 0, 0

    def on_batch_end(self, last_output, last_target, **kwargs):
        error,size = self._cer(last_output, last_target)
        self.errors += error
        self.total += size

    def on_epoch_end(self, last_metrics, **kwargs):
        return add_metrics(last_metrics, self.errors/self.total)

    def _cer(self, preds, targs):
        "Return (summed per-sample error, batch size) for one batch."
        bs, _ = targs.size()
        res = torch.argmax(preds, dim=2)
        error = 0
        for i in range(bs):
            p = self._char_label_text(res[i])
            t = self._char_label_text(targs[i])
            # a target that decodes to '' (all zeros) used to raise ZeroDivisionError;
            # normalise by 1 in that case so the raw edit distance is counted instead
            error += Lev.distance(t, p)/max(len(t), 1)
        return error, bs

    def _char_label_text(self, pred):
        "Decode a tensor of ids to a string, dropping every id 0."
        ints = to_np(pred).astype(int)
        nonzero = ints[np.nonzero(ints)]
        return ''.join([self.itos[i] for i in nonzero])

## Stepper

In [None]:
def subsequent_mask(size):
    "Lower-triangular (1, size, size) int mask of ones, created on the global `device`."
    ones = torch.ones((size,size), dtype=torch.int, device=device)
    return torch.tril(ones).unsqueeze(0)

def make_tgt_mask(tgt, pad=0):
    "Mask that is set only where the target is not `pad` and `subsequent_mask` allows the position."
    not_pad = (tgt != pad).unsqueeze(-2)
    causal = Variable(subsequent_mask(tgt.size(-1)).type_as(not_pad.data))
    return not_pad & causal

In [None]:
def rshift(tgt, token=1):
    """Shift `tgt` one step to the right along its last dim.

    tgt:   (bs, sl) tensor of ids.
    token: id written into the new first column.
    Returns a (bs, sl) long tensor: a column of `token` followed by tgt[:, :-1].
    """
    # The original passed `token` as the WIDTH of a ones tensor, so it was only
    # correct for token=1. The start column is created on tgt's own device rather
    # than the notebook-global `device`, which torch.cat needs to match anyway.
    start = torch.full((tgt.size(0), 1), token, dtype=torch.long, device=tgt.device)
    return torch.cat((start, tgt[:,:-1]), dim=-1)

In [None]:
class TfmrStepper(Stepper):
    """Stepper that also feeds the model a right-shifted target and a causal mask.

    The model is called as `self.m(*xs, shifted_y, tgt_mask)`.  `self.m`,
    `self.opt`, `self.crit`, `self.reg_fn` and `self.clip` are not set here;
    presumably the `Stepper` base class provides them - confirm against fastai.
    """
    def _shifted_inputs(self, y):
        "Return `(rshift(y) as long, subsequent_mask over its last dimension)`."
        shifted = rshift(y).long()
        # make_tgt_mask(shifted) would additionally hide padding; only the causal mask is used.
        return shifted, subsequent_mask(shifted.size(-1))

    def step(self, xs, y, epoch):
        "Run one optimisation step and return the unregularised loss as a float."
        shifted_y, tgt_mask = self._shifted_inputs(y)
        output = self.m(*xs, shifted_y, tgt_mask)

        # A tuple output carries extras after the prediction; they go to reg_fn.
        xtra = []
        if isinstance(output, tuple):
            output, *xtra = output

        self.opt.zero_grad()
        raw_loss = self.crit(output, y)
        loss = self.reg_fn(output, xtra, raw_loss) if self.reg_fn else raw_loss
        loss.backward()
        if self.clip:
            # Gradient clipping
            nn.utils.clip_grad_norm_(trainable_params_(self.m), self.clip)
        self.opt.step()
        return raw_loss.item()

    def evaluate(self, xs, y):
        "Return `(predictions, loss)` for one batch without updating weights."
        shifted_y, tgt_mask = self._shifted_inputs(y)
        preds = self.m(*xs, shifted_y, tgt_mask)
        if isinstance(preds, tuple):
            preds = preds[0]
        return preds, self.crit(preds, y)

## Transformer Modules

In [None]:
# similar to batchnorm but on a layer level
class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learned scale and shift.

    `a_2` (initialised to ones) scales and `b_2` (initialised to zeros)
    shifts the result.  `eps` is added to the standard deviation, not the
    variance, to avoid division by zero.
    """
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [None]:
class SublayerConnection(nn.Module):
    """Residual wrapper: `x + dropout(sublayer(norm(x)))`.

    Note: (for code simplicity) the layer norm is applied to the sublayer's
    input rather than to the sum.
    """
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "`sublayer` is any callable applied to the normalised input."
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [None]:
def clones(module, N):
    "Return an `nn.ModuleList` holding `N` independent deep copies of `module`."
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

In [None]:
class Encoder(nn.Module):
    "Stack of `N` copies of `layer` applied in order, followed by a final layer norm."
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x):
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)

In [None]:
class EncoderLayer(nn.Module):
    """Encoder: self-attn and feed forward.

    Each of the two stages runs inside its own `SublayerConnection`.
    """
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)

    def forward(self, x):
        def attend(h):
            # Self-attention: the same tensor serves as query, key and value.
            return self.self_attn(h, h, h)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

In [None]:
class Decoder(nn.Module):
    """Generic N layer decoder with masking.

    Every layer receives the running state, the encoder output `src` and
    `tgt_mask`; a final layer norm is applied to the result.
    """
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, src, tgt_mask=None):
        hidden = x
        for block in self.layers:
            hidden = block(hidden, src, tgt_mask)
        return self.norm(hidden)

In [None]:
class DecoderLayer(nn.Module):
    """Decoder: self-attn, src-attn, and feed forward.

    Each of the three stages is wrapped in its own `SublayerConnection`
    (residual, dropout, norm).
    """
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, src, tgt_mask=None):
        def attend_self(h):
            # Masked self-attention over the target sequence.
            return self.self_attn(h, h, h, tgt_mask)

        def attend_src(h):
            # Queries come from the decoder; keys and values are `src` (no mask passed).
            return self.src_attn(h, src, src)

        h = self.sublayer[0](x, attend_self)
        h = self.sublayer[1](h, attend_src)
        return self.sublayer[2](h, self.feed_forward)

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention'.

    Returns `(weighted values, attention weights)`.  Scores are scaled by
    the square root of the query's last dimension.  Positions where
    `mask == 0` are filled with -1e9 before the softmax, and `dropout`
    (any callable) is applied to the weights when given.
    """
    scale = math.sqrt(query.size(-1))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        logits = logits.masked_fill(mask == 0, -1e9)
    weights = F.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    context = torch.matmul(weights, value)
    return context, weights

In [None]:
class SingleHeadedAttention(nn.Module):
    """Attention with one head: project query/key/value, attend, project the result.

    `linears` holds four `d_model -> d_model` layers: the first three project
    query, key and value, the last projects the attention output.  The most
    recent attention weights are kept in `self.attn`.
    """
    def __init__(self, d_model, dropout=0.2):
        super().__init__()
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        q_proj, k_proj, v_proj, out_proj = self.linears
        q, k, v = q_proj(query), k_proj(key), v_proj(value)
        context, self.attn = attention(q, k, v, mask=mask, dropout=self.dropout)
        return out_proj(context)

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    The hidden width is `d_model * mult`; input and output width are `d_model`.
    """
    def __init__(self, d_model, dropout=0.2, mult=4):
        super().__init__()
        hidden = d_model*mult
        self.w_1 = nn.Linear(d_model, hidden)
        self.w_2 = nn.Linear(hidden, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        activated = F.relu(self.w_1(x))
        return self.w_2(self.dropout(activated))

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    A `(1, max_len, d_model)` table is precomputed and stored as the buffer
    `pe`: even columns hold sines, odd columns hold cosines, with frequencies
    decreasing geometrically (base 1e4) across the columns.  `forward` adds
    the first `x.size(1)` rows, so dim 1 of `x` is treated as the position
    axis and must not exceed `max_len`.

    `d_model` may be odd; the last (even) column then has no cosine partner.
    """
    def __init__(self, d_model, dropout=0.2, max_len=2000):
        super(PositionalEncoding, self).__init__()

        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0.0, max_len).unsqueeze(1)
        log_increment = math.log(1e4) / d_model
        div_term = torch.exp(torch.arange(0.0, d_model, 2) * -log_increment)
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so trim the angle table to fit (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # Buffers never require grad, so no Variable wrapper is needed.
        return self.dropout(x + self.pe[:, :x.size(1)])

## Architecture

In [None]:
class EncoderDecoder(nn.Module):
    """Container that wires an encoder, a decoder, their embeddings and a generator.

    `forward` encodes `src` and decodes `tgt` against it; it does NOT apply
    the generator - call `generate` on the result for that.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        self.src_embed, self.tgt_embed = src_embed, tgt_embed
        self.generator = generator

    def forward(self, src, tgt, tgt_mask=None):
        memory = self.encode(src)
        return self.decode(memory, tgt, tgt_mask)

    def encode(self, src):
        "Embed `src` and run the encoder."
        embedded = self.src_embed(src)
        return self.encoder(embedded)

    def decode(self, src, tgt, tgt_mask=None):
        "Embed `tgt` and run the decoder; here `src` is the already-encoded source."
        embedded = self.tgt_embed(tgt)
        return self.decoder(embedded, src, tgt_mask)

    def generate(self, outs):
        "Apply the generator to decoder outputs."
        return self.generator(outs)

In [None]:
class Embeddings(nn.Module):
    """Embedding lookup whose output is multiplied by a constant factor.

    `scale` defaults to 18, the value that used to be hard-coded in
    `forward`, so existing callers are unaffected.
    NOTE(review): the reference Transformer scales by sqrt(d_model); pass
    `scale=math.sqrt(d_model)` to get that behaviour.
    """
    def __init__(self, d_model, vocab, scale=18):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
        self.scale = scale  # multiplier applied to every embedding vector

    def forward(self, x):
        return self.lut(x) * self.scale

In [None]:
def make_language_model(vocab, d_model=512, N=4, drop=0.2):
    """Build an `EncoderDecoder` with `N` encoder and `N` decoder layers.

    Source and target each get their own `nn.Embedding` but share one
    `PositionalEncoding` instance.  The attention prototype is built with its
    own default dropout (`drop` is not passed to it).  Every parameter with
    more than one dimension is re-initialised with Xavier-uniform.
    """
    dup = copy.deepcopy
    attn_proto = SingleHeadedAttention(d_model)
#     attn_proto = MultiHeadedAttention(d_model, 8)
    ff_proto = PositionwiseFeedForward(d_model, drop)
    positions = PositionalEncoding(d_model, drop, 2000)

    # Built in the same order as before so random initialisation is unchanged.
    encoder = Encoder(EncoderLayer(d_model, dup(attn_proto), dup(ff_proto), drop), N)
    decoder = Decoder(DecoderLayer(d_model, dup(attn_proto), dup(attn_proto), dup(ff_proto), drop), N)
    src_embed = nn.Sequential(nn.Embedding(vocab, d_model), positions)
    tgt_embed = nn.Sequential(nn.Embedding(vocab, d_model), positions)
    generator = nn.Linear(d_model, vocab)
    model = EncoderDecoder(encoder, decoder, src_embed, tgt_embed, generator)

    for param in model.parameters():
        if p_is_matrix := param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return model

In [None]:
# denoising auto-encoder
# Want to predict entire output including masked words

class BetterSpeller(nn.Module):
    """Wrap an encoder-decoder `lm` and apply its generator to the decoder output.

    `forward` is the teacher-forced path (target supplied by the caller);
    `greedy_decode` generates the target one position at a time.
    """
    def __init__(self, lm):
        super().__init__()
        self.lm = lm

    def forward(self, src, tgt=None, tgt_mask=None):
        decoded = self.lm(src, tgt, tgt_mask)
        return self.lm.generate(decoded)

    def greedy_decode(self, src):
        """Decode greedily for at most `src.shape[1] + 5` steps.

        Starts every row with index 1, appends the argmax of each step, and
        stops early once a step predicts index 0 for every row.  Returns the
        stacked per-step generator outputs with batch as the first dimension.
        """
        with torch.no_grad():
            memory = self.lm.encode(src)
            bs, sl = src.shape
            tgt = torch.ones((bs, 1), dtype=torch.long, device=device)

            steps = []
            for _ in tqdm(range(sl+5)):
                causal = subsequent_mask(tgt.size(-1))
                decoded = self.lm.decode(memory, Variable(tgt), Variable(causal))
                # Only the newest position is scored at each step.
                scores = self.lm.generate(decoded[:, -1])
                steps.append(scores)
                nxt = torch.argmax(scores, dim=-1, keepdim=True)
                if (nxt == 0).all():
                    break
                tgt = torch.cat([tgt, nxt], dim=-1)
            # (steps, batch, ...) -> (batch, steps, ...)
            return torch.stack(steps).transpose(1, 0).contiguous()

In [None]:
# Build the model: an encoder-decoder over the character vocabulary `itos`,
# wrapped so that the generator is applied to the decoder output.
d_model = 512
lm = make_language_model(len(itos), d_model)
net = BetterSpeller(lm)

# Weight decay (passed to `fit` below) and an Adam factory with custom betas.
wd=1e-7
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

# `md` is the model-data object created earlier in the notebook.
learn = Learner(md, BasicModel(to_gpu(net)), opt_fn=opt_fn)

# Gradient-clipping threshold, label-smoothed loss and the reported metric.
learn.clip = 0.25
learn.crit = LabelSmoothing(smoothing=0.1)
learn.metrics = [char_error_rate]

# LM

In [None]:
# Learning-rate range test using the transformer stepper, then plot the result
# (skipping no points at the start and the last two at the end).
learn.lr_find(stepper=TfmrStepper)
learn.sched.plot(n_skip=0, n_skip_end=2)

In [None]:
# Train for one cycle of one epoch with the transformer stepper.
# NOTE(review): the best checkpoint is named 'better_speller_103' here, while
# the save/load cells below use 'LM_103' - confirm which name is intended.
lr=1e-3
learn.fit(lr, 1, wds=wd, use_clr=(20,10), cycle_len=1, stepper=TfmrStepper, best_save_name='better_speller_103')

# Logged results from earlier runs (columns are presumably train loss,
# validation loss and metric(s) - TODO confirm).
# wikitext2
# 39.696276  38.299323  0.549255   0.535159     15cycles

# 6.512353   5.655003   0.035785   tfmr  15cycles(20,10)   'better_speller'


# wikitext103
# 10.755489  9.274058   0.029507   tfmr  1cycle(20,10)   'LM_103'
# 7.036011   7.017283   0.022698   2nd cycle

In [None]:
# Save the learner's current state under the name 'LM_103'.
learn.save('LM_103')

In [None]:
# Restore the state previously saved as 'LM_103'.
learn.load('LM_103')

In [None]:
# Take the first batch (inputs, targets) from the validation data loader.
x,y = next(iter(md.val_dl))

In [None]:
# Switch to eval mode (disables dropout) and decode the batch greedily.
learn.model.eval()
preds = learn.model.greedy_decode(x)

In [None]:
# Despite its name, `probs` holds the argmax *indices* over the last
# dimension of `preds`, not probabilities.
probs = torch.argmax(preds, dim=-1)

In [None]:
# Row of the batch to inspect in the cells below.
idx=5

In [None]:
# Decode the model input for the chosen row.
char_label_text(x[idx])

In [None]:
# Decode the greedy prediction for the chosen row.
char_label_text(probs[idx])

In [None]:
# Decode the target for the chosen row.
char_label_text(y[idx])

In [None]:
# Continue training at a lower learning rate, one cycle of ten epochs.
# NOTE(review): unlike the earlier fit call, no `stepper=TfmrStepper` is
# passed here - confirm the default stepper works with this model.
lr=2e-5
learn.fit(lr, 1, wds=wd, use_clr=(20,10), cycle_len=10)

In [None]:
# Run the model on the batch without a target.
# NOTE(review): if `learn.model` is the BetterSpeller net, `tgt` defaults to
# None and is passed straight to the target embedding - verify this call.
preds = learn.model(x)

In [None]:
# Character error rate of these predictions against the targets.
char_error_rate(preds,y)

In [None]:
# Accuracy of these predictions against the targets.
accuracy(preds,y)

# Test

In [None]:
# Put the model in eval mode and display its `training` flag.
# NOTE(review): this cell uses `learner`, whereas the cells above use
# `learn` - confirm `learner` is defined elsewhere in the notebook.
learner.model.eval()
learner.model.training

In [None]:
def next_with_creativity(preds, k=5, thresh=.05):
    """Sample one index from the top-`k` candidates, weighted by probability.

    `preds` is a 1-D tensor of scores; a softmax over its last dimension
    turns it into probabilities.  The top-`k` candidates and their rounded
    probabilities are printed (decoded through the global `itos`).  Each
    candidate then gets one slot in a sampling pool per whole percentage
    point of probability, and one slot is drawn with `random.choice`.

    If no candidate reaches a full percentage point the pool is empty; the
    most likely index is returned instead of failing.

    Returns a NumPy integer scalar (callers use `.item()` on it).
    `thresh` is accepted for backward compatibility but is currently unused.
    """
    probs, idxs = torch.topk(F.softmax(preds, dim=-1), k, dim=-1)
    # `.item()` yields plain ints, so `itos` may be a list or a dict.
    print({itos[i.item()]: round(p.item(), 3) for i, p in zip(idxs, probs)})

    pool = []
    for p, i in zip(probs, idxs):
        pool.extend([i.item()] * int(p * 100))

    if not pool:
        return np.int64(idxs[0].item())
    # np.int64 replaces the `np.long` alias, which current NumPy no longer has.
    return random.choice(np.array(pool, dtype=np.int64))

In [None]:
def get_next(inp):
    """Return the next character sampled by the model for the string `inp`.

    Characters are mapped to indices with `stoi`, fed to the model as a batch
    of one, and the output at the last position of that single row is passed
    to `next_with_creativity`.
    NOTE(review): this uses `learner`, while the training cells use `learn`.
    """
    codes = np.array([stoi[ch] for ch in inp])
    batch = T(codes).unsqueeze(0)
    out = learner.model(Variable(batch))
    # Alternatives tried previously:
    #     torch.argmax(out[0][-1], dim=-1)
    #     torch.multinomial(out[0].exp(), 1)[-1]
    choice = next_with_creativity(out[0][-1])
    return itos[choice.item()]

In [None]:
# Example: sample the character following 'whe'.
get_next('whe')

In [None]:
def get_next_n(inp, n):
    """Extend `inp` by `n` characters, one `get_next` call per character.

    Each call sees everything generated so far; the full string (prompt
    plus additions) is returned.
    """
    text = inp
    for _ in range(n):
        text = text + get_next(text)
    return text

In [None]:
# Example: extend 'th' by ten sampled characters.
get_next_n('th', 10)