# Attention Architecture

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import music21

In [3]:
# from fastai.text import *
from enum import Enum
import torch
from fastai.text.models.awd_lstm import *
from fastai.text.models.transformer import *

In [4]:
import numpy as np
import torch.nn as nn

In [5]:
np.set_printoptions(edgeitems=10, threshold=40, linewidth=200)

In [6]:
import sys
sys.path.insert(0, '../../')
from src.fastai_data import *
from src.encode_data import *
from src.serve import *

In [7]:
from src.music_transformer import *

In [8]:
from fastai.text.models.transformer import _line_shift

In [9]:
# Build the base hyper-parameter dict for the model from the vocabulary.
# NOTE(review): `vocab` is not assigned in any visible cell — presumably it is
# provided by one of the star imports above; confirm on a fresh kernel.
config = v15s_config(vocab)

In [10]:
# Experiment-specific overrides applied on top of the base config.
# `mask` only sets the encoder's initial causal-mask flag: BertHead.forward
# reassigns `encoder.mask` on every call depending on the task type.
config['mem_len'] = 0
config['mask'] = False
config['bs'] = 4
config['ctx_len'] = 1024
config['bptt'] = 1024

In [11]:
# _model_meta[MusicTransformer] = _model_meta[TransformerXL]
# _model_meta[MusicTransformer]['config_lm'] = config

In [12]:
# Display the final configuration dict used for the data loaders and the model.
config

{'ctx_len': 1024,
 'n_layers': 16,
 'n_heads': 8,
 'd_model': 256,
 'd_head': 32,
 'd_inner': 2048,
 'resid_p': 0.1,
 'attn_p': 0.1,
 'ff_p': 0.1,
 'embed_p': 0.1,
 'output_p': 0.1,
 'bias': False,
 'scale': True,
 'act': <Activation.GeLU: 3>,
 'double_drop': True,
 'tie_weights': True,
 'out_bias': True,
 'init': <function fastai.text.models.transformer.init_transformer(m)>,
 'mem_len': 0,
 'mask': False,
 'pad_idx': 1,
 'bos_idx': 0,
 'sep_idx': 8,
 'transpose_range': (0, 12),
 'note_range': (9, 138),
 'bs': 4,
 'bptt': 1024,
 'vocab_size': 274}

In [14]:
# Task identifiers. The batch transforms attach `TaskType.<X>.value` to each batch
# and BertHead / BertLoss / the metrics branch on that integer.
# Enum's functional API numbers members from 1:
# MaskOnly=1, NextSent=2, Translate=3, NextWord=4.
TaskType = Enum('TaskType', 'MaskOnly, NextSent, Translate, NextWord')

## BERT Dataloading

In [15]:
# BERT Transform
def next_sentence_ranges(x, y, max_cls=4):
    """Split the sequence axis into contiguous segments and pick a row shift for each.

    Args:
        x: array/tensor whose `.shape` unpacks to `(bs, bptt)`; only the shape is read.
        y: unused, kept for signature compatibility.
        max_cls: upper bound on the number of segments drawn.

    Returns:
        A list of `(segment_index, row_shift, start, end)` tuples. Segments are
        contiguous, the first starts at 0 and the last ends at `bptt`.
        Segment 0 always carries a shift of 0.
    """
    bs,bptt = x.shape
    # min(randint, bs-2) is 0 or negative when bs <= 2, which used to crash in
    # `bptt // s`; always keep at least one (unshifted) segment.
    s = max(1, min(random.randint(1, max_cls), bs-2))

    min_seq_len = bptt // s

    # Distinct non-zero shifts in [1, bs-1]. A single-row batch has nothing to
    # shift to, and np.random.choice cannot sample from an empty population.
    shifts = (np.random.choice(bs-1, s, replace=False)+1).tolist() if bs > 1 else []
    bs_shift = [0] + shifts
    # Each segment length is min_seq_len jittered by at most min_seq_len/s.
    row_shift = [int(min_seq_len + random.randint(-min_seq_len, min_seq_len)//s) for i in range(s)]

    accum = 0
    ranges = []
    for i in range(s):
        # The last segment always runs to the end of the sequence.
        end = accum + row_shift[i] if i < (s-1) else bptt
        ranges.append((i, bs_shift[i], accum, end))
        accum = end
    return ranges

def next_sentence_tfm(b, max_cls=4):
    """Batch transform for the next-sentence task.

    Column segments chosen by `next_sentence_ranges` are replaced with the same
    columns taken from a copy of the batch rolled along dim 0, and every
    position is labelled with the index of the segment it belongs to.

    Returns `((x_new, TaskType.NextSent.value), (y_new, z))` where `z` holds the
    per-position segment index (0 for the untouched first segment).
    """
    src, tgt = b
    mixed_src, mixed_tgt = src.clone(), tgt.clone()
    seg_labels = torch.zeros_like(src)
    for seg_idx, row_shift, start, stop in next_sentence_ranges(src, tgt, max_cls):
        if seg_idx != 0:
            cols = slice(start, stop)
            # Roll the rows, then copy only this segment's columns.
            mixed_src[:, cols] = torch.roll(src, shifts=row_shift, dims=0)[:, cols]
            mixed_tgt[:, cols] = torch.roll(tgt, shifts=row_shift, dims=0)[:, cols]
            seg_labels[:, cols] = seg_idx
    return (mixed_src, TaskType.NextSent.value), (mixed_tgt, seg_labels)

def mask_tfm(b, word_range=vocab.npenc_range, pad_idx=vocab.pad_idx, mask_idx=vocab.mask_idx, p=0.2, double=False, mask_last=False):
    """BERT-style masking transform. Modifies `x` and `y` IN PLACE and returns them.

    One uniform random draw is made per position of `x`:
      * draw <= 0.8*p          -> `x` is set to `mask_idx`
      * 0.8*p < draw <= 0.9*p  -> `x` is set to a random token from
                                  `torch.randint(*word_range)` (assumes
                                  `word_range` is a (low, high) pair)
      * draw > p               -> `y` is set to `pad_idx`
    Positions where `x < word_range[0]` get their draw forced to 1.0, so they
    are never masked or replaced.

    `p` is the selection probability. `mask_last` forces the draw to 0.0 for the
    last index along the first dimension. `double` is accepted but not used.
    """
    x, y = b
    mask_cut, swap_cut = p*.8, p*.9
    draws = torch.rand(x.shape, device=x.device)
    draws[x < word_range[0]] = 1.0
    if mask_last:
        draws[-1] = 0.0
    # Only selected positions keep a target.
    y[draws > p] = pad_idx
    # 80% of the selected positions become the mask token ...
    x[draws <= mask_cut] = mask_idx
    # ... and 10% become a random token.
    swap = (draws > mask_cut) & (draws <= swap_cut)
    n_swap = swap.sum().item()
    x[swap] = torch.randint(*word_range, [n_swap], device=x.device)
    return x, y

In [16]:
# Data for the masked-token + next-sentence task.
# y_offset=0 is passed so targets are not shifted relative to inputs — TODO confirm
# against load_music_data. The transforms are presumably applied in list order:
# next_sentence_tfm expects the (x, y) pair that mask_tfm returns.
path = Path('../../data/midi/v15/piano_duet/')
dl_tfms = [mask_tfm, next_sentence_tfm]
ns_data = load_music_data(path, cache_name='tmp/sample', vocab=vocab, y_offset=0, dl_tfms=dl_tfms, **config)

## S2S Dataloading

In [17]:
def avg_tempo(t, sep_idx=0):
    """Return an `'mt<N>'` token name for a 2-column array.

    Sums column 1 over the rows whose column 0 equals `sep_idx`, divides by the
    total number of rows, rounds, and floors the result at 4.
    """
    sep_rows = t[t[:, 0] == sep_idx]
    avg = sep_rows[:, 1].sum() / t.shape[0]
    tempo = max(round(avg), 4)
    return 'mt' + str(int(tempo))

In [18]:
class S2SPreloader(Callback):
    """Dataset wrapper that turns a (melody, chord) item into fixed-length id arrays.

    `__getitem__` prepends a two-token header to each part, converts both to a
    single stream, applies a random transpose, and pads/truncates with
    `pad_idx`: `x` to `bptt` entries and `y` to `bptt + 1` entries, with `y`
    additionally left-padded by `y_offset`.
    """
    def __init__(self, dataset:LabelList, bptt:int=512, y_offset=1, **kwargs):
        # y_offset = extra padding for translation
        self.dataset,self.bptt = dataset,bptt
        self.vocab = vocab
        self.y_offset = y_offset
        self.single_tfm = partial(to_single_stream, vocab=vocab)
        self.transpose_tfm = partial(rand_transpose, note_range=vocab.note_range, rand_range=(0,12))
    
    def __getitem__(self, k:int):
        item,_ = self.dataset[k]
        x,y = item
        
        # Header = sequence-type token followed by an average-tempo token.
        melody_meta = np.array([self.vocab.stoi[MSEQ], self.vocab.stoi[avg_tempo(x)]]) # pad should be average notes - tempo
        chord_meta = np.array([self.vocab.stoi[CSEQ], self.vocab.stoi[avg_tempo(y)]])
        x = self.single_tfm(x, start_seq=melody_meta)
        y = self.single_tfm(y, start_seq=chord_meta)
        
        x,y = self.transpose_tfm((x,y))
        
        pad_idx = self.vocab.pad_idx
        x = np.pad(x, (0,max(0,self.bptt-len(x))), 'constant', constant_values=pad_idx)[:self.bptt]
        # Pad so the target is always exactly bptt+1 long after slicing. The previous
        # right padding (bptt - len(y)) gave only bptt entries for short items when
        # y_offset == 0, producing ragged batches. For y_offset >= 1 the result is
        # identical to before.
        right_pad = max(0, self.bptt + 1 - self.y_offset - len(y))
        y = np.pad(y, (self.y_offset, right_pad), 'constant', constant_values=pad_idx)[:self.bptt+1]
        return x, y
    
    def __len__(self):
        return len(self.dataset)

In [19]:
# preloader itself contains all the transforms
def mask_s2s_tfm(b, word_range=vocab.npenc_range, pad_idx=vocab.pad_idx, 
             mask_idx=vocab.mask_idx, p=0.1, double=False, mask_last=False):
    """Batch transform for the translation (seq2seq) task.

    Returns `((x, TaskType.Translate.value, y_s2s[:, :-1]), (y_mask, y_s2s[:, 1:]))`,
    i.e. the decoder input is the target without its last column and the
    decoder target is the target without its first column.

    mask_tfm writes into the tensor it receives and returns it, so `x` and
    `x_mask` are the same (already masked) tensor; returning `x` therefore
    feeds the masked input to the model.

    NOTE(review): none of this function's keyword parameters are forwarded to
    mask_tfm, which runs with its own defaults (e.g. p=0.2, not the p=0.1
    declared here) — confirm whether that is intended.
    """
    x,y_s2s = b
    # y_mask starts as an unmasked copy of x and is padded out by mask_tfm.
    x_mask,y_mask = mask_tfm((x,x.clone()))
    return (x,TaskType.Translate.value,y_s2s[:,:-1]),(y_mask,y_s2s[:,1:])

In [20]:
# Data for the translation (seq2seq) task, built through S2SPreloader.
# Note: the `dl_tfms` variable assigned here is not what is passed to `load`;
# an equivalent literal list is passed instead.
dl_tfms = [mask_s2s_tfm]

path = Path('../../data/midi/v15/s2s_encode/')
s2s_data = MusicDataBunch.load(path, bs=config['bs'], cache_name='tmp/sample', 
                           preloader_cls=S2SPreloader, dl_tfms=[mask_s2s_tfm], y_offset=1,
                           shuffle_dl=True)


### Load NextWord dataset (Language Model)

In [21]:
def nw_tfm(b):
    """Batch transform for the next-word (language model) task.

    Returns `((x_mask, TaskType.NextWord.value, x), (y_mask, y_nw))`.

    NOTE(review): mask_tfm writes into the tensor it receives and returns it,
    so `x_mask` and `x` are the same (masked) tensor here — confirm whether the
    third input element was meant to be the unmasked sequence.
    """
    x,y_nw = b
    # y_mask starts as an unmasked copy of x and is padded out by mask_tfm.
    x_mask,y_mask = mask_tfm((x,x.clone()))
    return (x_mask,TaskType.NextWord.value,x),(y_mask,y_nw) 
    
# Data for the next-word task: same source folder as the next-sentence data,
# but loaded with y_offset=1 and the nw_tfm batch transform.
path = Path('../../data/midi/v15/piano_duet/')

dl_tfms = [nw_tfm]
nw_data = load_music_data(path, cache_name='tmp/sample', vocab=vocab, dl_tfms=dl_tfms, y_offset=1, **config)


## Rotating Data Between Tasks

In [22]:
class BertTrainer(LearnerCallback):
    "`Callback` that swaps the learner's data between the three task datasets after every epoch."
    def __init__(self, learn:Learner, ns_data, s2s_data, nw_data):
        super().__init__(learn)
        self.ns_data, self.s2s_data, self.nw_data = ns_data, s2s_data, nw_data
        self.count = 0
    
    def on_epoch_end(self, last_metrics, **kwargs):
        "Pick the next dataset in the cycle next-sentence -> translate -> next-word."
        rotation = (
            ('Switching to next sentence data', self.ns_data),
            ('Switching to translate data', self.s2s_data),
            ('Switching to next word data', self.nw_data),
        )
        message, next_data = rotation[self.count % 3]
        print(message)
        self.learn.data = next_data
        self.count += 1

## LMNP

In [23]:
# m_len = 0
# x_len = 16 # bptt
# seq_len = m_len+x_len
# torch.triu(torch.ones(x_len, seq_len), diagonal=m_len).byte()[None,None].cpu().numpy()
# torch.triu(torch.ones(x_len, seq_len), diagonal=m_len+1).byte()[None,None].cpu().numpy()

In [24]:
import torch.nn as nn

In [25]:
class TransformerEmbedding(nn.Module):
    "Token embedding with dropout, plus a positional encoding returned separately."
    def __init__(self, vocab_sz:int, emb_sz:int, inp_p:float=0.):
        super().__init__()
        self.emb_sz = emb_sz
        self.embed = embedding(vocab_sz, emb_sz)
        self.pos_enc = PositionalEncoding(emb_sz)
        self.drop = nn.Dropout(inp_p)
    
    def forward(self, inp):
        "Return `(dropout(embed(inp)), pos_enc(positions))` for positions `0..inp.size(1)-1`."
        seq_len = inp.size(1)
        positions = torch.arange(0, seq_len, device=inp.device).float()
        tok_emb = self.drop(self.embed(inp))
        return tok_emb, self.pos_enc(positions)

In [26]:
class MusicTransformer(nn.Module):
    """Stack of relative-attention `DecoderLayer`s used as the shared encoder.

    Follows the TransformerXL layout (https://arxiv.org/abs/1901.02860) with
    shared `u`/`v` bias parameters, but always calls the layers with `mem=None`.
    When `self.mask` is truthy a causal mask from `lm_mask` is applied.
    """
    def __init__(self, embed:nn.Module, n_layers:int, n_heads:int, d_model:int, d_head:int, d_inner:int, 
                 resid_p:float=0., attn_p:float=0., ff_p:float=0., bias:bool=False, scale:bool=True,
                 act:Activation=Activation.ReLU, double_drop:bool=True, attn_cls:Callable=MultiHeadRelativeAttention,
                 learned_pos_enc:bool=False, mask:bool=True, **kwargs):
        super().__init__()
        self.encoder = embed
        # Shared attention biases; the singleton dim is needed by the matmul attention.
        self.u = nn.Parameter(torch.Tensor(n_heads, 1, d_head))
        self.v = nn.Parameter(torch.Tensor(n_heads, 1, d_head))
        self.n_layers,self.d_model,self.mask = n_layers,d_model,mask
        blocks = []
        for _ in range(n_layers):
            blocks.append(DecoderLayer(n_heads, d_model, d_head, d_inner, resid_p=resid_p, attn_p=attn_p,
                                       ff_p=ff_p, bias=bias, scale=scale, act=act, double_drop=double_drop,
                                       attn_cls=attn_cls))
        self.layers = nn.ModuleList(blocks)
        
        for shared_bias in (self.u, self.v):
            nn.init.normal_(shared_bias, 0., 0.02)
    
    def forward(self, x):
        "Embed `x` (2-D ids) and run it through every layer; returns the last `x_len` positions."
        _, x_len = x.size()
        hidden, pos_enc = self.encoder(x)

        if self.mask:
            mask = lm_mask(x_len, x.device)
        else:
            mask = None
        
        for layer in self.layers:
            hidden = layer(hidden, r=pos_enc, u=self.u, v=self.v, mask=mask, mem=None)
        return hidden[:,-x_len:]

In [27]:
class BertHead(nn.Module):
    """Shared encoder with three heads, selected per batch by `task_type`.

    `forward` always returns the masked-token head output and the task type;
    for NextSent it adds the `ns_decoder` output, for Translate and NextWord
    it adds the `s2s_decoder` output computed from the encoding and `y`.
    """
    def __init__(self, encoder, mask_decoder, ns_decoder, s2s_decoder):
        super().__init__()
        self.encoder = encoder
        self.mask_decoder = mask_decoder
        self.ns_decoder = ns_decoder
        self.s2s_decoder = s2s_decoder
        
    def forward(self, x, task_type=None, y=None):
        # Mask the encoder only for next word (so the decoder can't cheat).
        self.encoder.mask = task_type == TaskType.NextWord.value
        x_enc = self.encoder(x)
        
        if task_type == TaskType.NextSent.value:
            task_head, head_args = self.ns_decoder, (x_enc,)
        elif task_type == TaskType.Translate.value or task_type == TaskType.NextWord.value:
            # Next word reuses the translation decoder.
            task_head, head_args = self.s2s_decoder, (x_enc, y)
        else:
            return self.mask_decoder(x_enc), task_type
        masked_out = self.mask_decoder(x_enc)
        return masked_out, task_type, task_head(*head_args)
    
    def __getitem__(self, idx):
        "Index into `[encoder, mask_decoder, ns_decoder, s2s_decoder]`."
        children = [self.encoder, self.mask_decoder, self.ns_decoder, self.s2s_decoder]
        return children[idx]
        
    def reset(self):
        "No-op; present so callers that invoke `reset()` on the model keep working."
        pass

In [28]:
def window_mask(x_len, device, m_len=0, size=(1,1)):
    """Build a block-wise upper-triangular byte mask of shape `(1, 1, x_len, m_len + x_len)`.

    `size` is `(win_size, k)`: an upper-triangular matrix with diagonal offset
    `k` is expanded so each entry covers a `win_size` x `win_size` block, then
    cropped to `x_len` x `x_len`. The first `m_len` columns are all zero.
    """
    win_size, k = size
    n_windows = x_len//win_size + 1
    coarse = np.triu(np.ones((n_windows, n_windows)), k=k)
    fine = coarse.repeat(win_size, axis=0).repeat(win_size, axis=1)[:x_len, :x_len]
    mem_cols = np.zeros((x_len, m_len))
    full = np.concatenate((mem_cols, fine), axis=1)
    return torch.tensor(full, device=device).byte()[None, None]
    
def rand_window_mask(x_len,m_len,device,max_size=3,p=0.2,is_eval=False):
    """Return a `window_mask`, occasionally with a random window size.

    The plain `(1, 1)` mask is used when `is_eval` is set, when `m_len == 0`,
    or with probability `1 - p`; otherwise the window size is drawn from
    `1..max_size` and the diagonal offset is 0.
    """
    # Short-circuit order matters: the random draw only happens when needed.
    use_plain = is_eval or m_len == 0 or np.random.rand() >= p
    if use_plain:
        size = (1, 1)
    else:
        size = (np.random.randint(0,max_size)+1, 0)
    return window_mask(x_len, device, m_len, size=size)

In [29]:
def lm_mask(x_len, device):
    "Causal byte mask of shape `(1, 1, x_len, x_len)`: 1 strictly above the diagonal, 0 elsewhere."
    ones = torch.ones((x_len, x_len), device=device)
    strictly_upper = torch.triu(ones, diagonal=1)
    return strictly_upper[None, None].byte()

In [30]:

class KVMultiHeadRelativeAttention(nn.Module):
    """Multi-head attention with relative positional encoding and separate q/k/v inputs.

    Unlike a self-attention layer, the query, key and value tensors are passed
    separately, so the same module can serve as self-attention (`q = k = v`)
    or as encoder-decoder attention (`k = v = encoder output`).
    """
    def __init__(self, n_heads:int, d_model:int, d_head:int=None, resid_p:float=0., attn_p:float=0., bias:bool=True,
                 scale:bool=True):
        super().__init__()
        d_head = ifnone(d_head, d_model//n_heads)
        self.n_heads,self.d_head,self.scale = n_heads,d_head,scale
        
        self.q_wgt = nn.Linear(d_model, n_heads * d_head, bias=bias)
        self.k_wgt = nn.Linear(d_model, n_heads * d_head, bias=bias)
        self.v_wgt = nn.Linear(d_model, n_heads * d_head, bias=bias)
        
        self.out = nn.Linear(n_heads * d_head, d_model, bias=bias)
        self.drop_att,self.drop_res = nn.Dropout(attn_p),nn.Dropout(resid_p)
        self.ln = nn.LayerNorm(d_model)
        self.r_attn = nn.Linear(d_model, n_heads * d_head, bias=bias)
        
    def forward(self, q:Tensor, k:Tensor, v:Tensor, 
                r:Tensor=None, g_u:Tensor=None, g_v:Tensor=None, 
                mask:Tensor=None, **kwargs):
        "Residual connection around the attention output, followed by layer norm."
        return self.ln(q + self.drop_res(self.out(self._apply_attention(q, k, v, r, g_u, g_v, mask=mask, **kwargs))))
    
    def _apply_attention(self, q:Tensor, k:Tensor, v:Tensor, 
                         r:Tensor=None, g_u:Tensor=None, g_v:Tensor=None, 
                         mask:Tensor=None):
        """Compute the attention output, shape `(bs, x_len, n_heads * d_head)`.

        Notation from the paper: `r` is the encoding of relative distances,
        `g_u` / `g_v` are the learnable biases shared between layers, and
        `mask` marks positions that must not be attended to.
        """
        bs,x_len,seq_len = q.size(0),q.size(1),r.size(0)
        wq,wk,wv = self.q_wgt(q),self.k_wgt(k),self.v_wgt(v)
        wq = wq[:,-x_len:]
        wq,wk,wv = map(lambda x:x.view(bs, x.size(1), self.n_heads, self.d_head), (wq,wk,wv))
        # wq: (bs, heads, x_len, d_head); wk: (bs, heads, d_head, k_len); wv: (bs, heads, k_len, d_head)
        wq,wk,wv = wq.permute(0, 2, 1, 3),wk.permute(0, 2, 3, 1),wv.permute(0, 2, 1, 3)
        wkr = self.r_attn(r)
        wkr = wkr.view(seq_len, self.n_heads, self.d_head)
        wkr = wkr.permute(1,2,0)  # (heads, d_head, seq_len)
        #### compute attention score (AC is (a) + (c) and BD is (b) + (d) in the paper)
        AC = torch.matmul(wq+g_u,wk)
        BD = _line_shift(torch.matmul(wq+g_v, wkr))
        # Always define the score; it used to be assigned only when `self.scale`
        # was true, which raised UnboundLocalError for scale=False.
        attn_score = AC + BD
        if self.scale: attn_score = attn_score.mul_(1/(self.d_head ** 0.5))
        if mask is not None: 
            attn_score = attn_score.float().masked_fill(mask, -float('inf')).type_as(attn_score)
        attn_prob = self.drop_att(F.softmax(attn_score, dim=-1))
        attn_vec = torch.matmul(attn_prob, wv)
        return attn_vec.permute(0, 2, 1, 3).contiguous().view(bs, x_len, -1)

In [31]:
class S2SDecoderBlock(nn.Module):
    """One seq2seq decoder block: target self-attention, attention over the encoder output, feed-forward.

    Extra keyword arguments (e.g. `act`, `attn_cls`) are accepted and ignored.
    """
    #Can't use Sequential directly cause more than one input...
    def __init__(self, n_heads:int, d_model:int, d_head:int, d_inner:int, resid_p:float=0., attn_p:float=0., ff_p:float=0.,
                 bias:bool=True, scale:bool=True, double_drop:bool=True, **kwargs):
        super().__init__()
        attn_kwargs = dict(resid_p=resid_p, attn_p=attn_p, bias=bias, scale=scale)
        self.mha1 = KVMultiHeadRelativeAttention(n_heads, d_model, d_head, **attn_kwargs)
        self.mha2 = KVMultiHeadRelativeAttention(n_heads, d_model, d_head, **attn_kwargs)
        self.ff   = feed_forward(d_model, d_inner, ff_p=ff_p, double_drop=double_drop)
    
    def forward(self, targ:Tensor, enc:Tensor, 
                r=None, u=None, v=None,
                mask_in:Tensor=None, mask_out:Tensor=None): 
        # Self-attention over the target, masked with `mask_out`.
        self_attended = self.mha1(targ, targ, targ, r, u, v, mask=mask_out)
        # Queries from the target, keys/values from the encoder output.
        cross_attended = self.mha2(self_attended, enc, enc, r, u, v, mask=mask_in)
        return self.ff(cross_attended)

In [32]:
class S2SDecoder(nn.Module):
    """Seq2seq decoder: embeds the target, runs `S2SDecoderBlock`s against the encoder output, then projects to the vocabulary.

    The target side always uses a causal mask from `lm_mask`; no mask is passed
    for the attention over the encoder output. The output head shares its
    weight with `embed.embed`.
    """
    def __init__(self, embed:nn.Module, n_hid:int, vocab_sz:int, n_layers:int, n_heads:int, d_model:int, d_head:int, d_inner:int, 
                 resid_p:float=0., attn_p:float=0., ff_p:float=0., bias:bool=False, scale:bool=True,
                 act:Activation=Activation.ReLU, double_drop:bool=True, attn_cls:Callable=MultiHeadRelativeAttention,
                 learned_pos_enc:bool=False, mask:bool=True, **kwargs):
        super().__init__()
        self.encoder = embed
        # Shared attention biases; the singleton dim is needed by the matmul attention.
        self.u = nn.Parameter(torch.Tensor(n_heads, 1, d_head))
        self.v = nn.Parameter(torch.Tensor(n_heads, 1, d_head))
        self.n_layers,self.d_model,self.mask = n_layers,d_model,mask
        blocks = []
        for _ in range(n_layers):
            blocks.append(S2SDecoderBlock(n_heads, d_model, d_head, d_inner, resid_p=resid_p, attn_p=attn_p,
                                          ff_p=ff_p, bias=bias, scale=scale, act=act, double_drop=double_drop,
                                          attn_cls=attn_cls))
        self.layers = nn.ModuleList(blocks)
        self.head = MusicLinearDecoder(d_model, vocab_sz, tie_encoder=embed.embed, **kwargs)
    
        for shared_bias in (self.u, self.v):
            nn.init.normal_(shared_bias, 0., 0.02)
        
    def forward(self, enc, targ):
        "`enc` is the encoder output, `targ` the 2-D target ids; returns the head output for every target position."
        _, targ_len = targ.size()
        
        hidden, pos_enc = self.encoder(targ)
        causal_mask = lm_mask(targ_len, targ.device)
        
        for block in self.layers:
            hidden = block(hidden, enc, mask_out=causal_mask,
                           r=pos_enc, u=self.u, v=self.v)
        return self.head(hidden)

In [33]:

class MusicLinearDecoder(nn.Module):
    "Dropout followed by a linear projection, optionally sharing its weight with `tie_encoder`."
    initrange=0.1

    def __init__(self, n_hid:int, n_out:int, output_p:float, tie_encoder:nn.Module=None, bias:bool=True, **kwargs):
        super().__init__()
        bound = self.initrange
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        self.decoder.weight.data.uniform_(-bound, bound)
        self.output_dp = RNNDropout(output_p)
        if bias:
            self.decoder.bias.data.zero_()
        if tie_encoder:
            # Weight tying: the projection reuses the given module's weight.
            self.decoder.weight = tie_encoder.weight

    def forward(self, input:Tuple[Tensor,Tensor])->Tuple[Tensor,Tensor,Tensor]:
        "Apply dropout, then the linear projection, to `input`."
        return self.decoder(self.output_dp(input))


In [34]:
def get_music_model(vocab_sz:int, config:dict=None, drop_mult:float=1.):
    """Build the multi-task `BertHead` model described by `config`.

    Args:
        vocab_sz: vocabulary size for the embedding and the token heads.
        config: hyper-parameter dict; every key ending in `_p` is treated as a
            dropout probability. The caller's dict is left unmodified.
        drop_mult: factor applied to every `*_p` entry.

    Returns:
        A `BertHead` (shared encoder, masked-token head, 4-way next-sentence
        head, seq2seq decoder) after `init_transformer` has been applied.
    """
    # Scale dropout on a shallow copy: doing it in place mutated the caller's
    # dict, so calling this twice with drop_mult != 1 compounded the scaling.
    config = config.copy()
    for k in config.keys(): 
        if k.endswith('_p'): config[k] *= drop_mult
    output_p,out_bias = map(config.get, ['output_p', 'out_bias'])
    n_hid = config['d_model']
    # One embedding is shared by the encoder, the seq2seq decoder and the tied heads.
    embed = TransformerEmbedding(vocab_sz, n_hid, inp_p=config['embed_p'])
    encoder = MusicTransformer(embed=embed, **config)
    mask_decoder = MusicLinearDecoder(n_hid, vocab_sz, output_p, tie_encoder=embed.embed, bias=out_bias)
    ns_decoder = MusicLinearDecoder(n_hid, 4, output_p, tie_encoder=None, bias=out_bias)
    s2s_decoder = S2SDecoder(embed, n_hid, vocab_sz, **config)
    model = BertHead(encoder, mask_decoder, ns_decoder, s2s_decoder)
    return model.apply(init_transformer)


def music_model_learner(data:DataBunch, config:dict=None, drop_mult:float=1., pretrained:bool=False,
                        pretrained_fnames:OptStrTuple=None, **learn_kwargs) -> 'LanguageLearner':
    """Create a `MusicLearner` around the model built by `get_music_model`.

    If `pretrained` is set, weights are downloaded and loaded and the learner is
    frozen; if `pretrained_fnames` is given, `(weights, vocab)` files are loaded
    from `learn.path/learn.model_dir` and the learner is frozen.
    """
    weight_exts = ['pth', 'pkl']
    model = get_music_model(config['vocab_size'], config=config, drop_mult=drop_mult)
    learn = MusicLearner(data, model, split_func=tfmerXL_lm_split, 
                         bos_idx=config['bos_idx'], sep_idx=config['sep_idx'],
                         **learn_kwargs)
    
    if pretrained:
        # NOTE(review): `meta` is not defined in any visible cell — confirm it is
        # provided by an import, otherwise pretrained=True raises NameError.
        if 'url' in meta:
            model_path = untar_data(meta['url'], data=False)
            fnames = [list(model_path.glob(f'*.{ext}'))[0] for ext in weight_exts]
            learn.load_pretrained(*fnames)
            learn.freeze()
        else:
            warn("There are no pretrained weights for that architecture yet!")
            return learn
    if pretrained_fnames is not None:
        model_dir = learn.path/learn.model_dir
        fnames = [model_dir/f'{fn}.{ext}' for fn,ext in zip(pretrained_fnames, weight_exts)]
        learn.load_pretrained(*fnames)
        learn.freeze()
    return learn

## Load

In [35]:
# Start from the translation data; BertTrainer swaps `learn.data` after each epoch.
# A copy of the config is passed so the notebook-level dict is not modified.
learn = music_model_learner(s2s_data, config.copy())

Sep_idx: 8


In [36]:
class BertLoss():
    """Multi-task loss: masked-token loss plus the loss of the task-specific head.

    `input` is the 3-tuple `(mask_output, task_type, task_output)` returned by
    the model, `target` is the masked-token target and `target_2` is the target
    for the task head.
    """
    def __init__(self, mask_loss, sent_loss, s2s_loss):
        self.mask_loss = mask_loss
        self.sent_loss = sent_loss
        self.s2s_loss = s2s_loss
        
    def __call__(self, input:Tensor, target:Tensor, target_2:Tensor, **kwargs)->Rank0Tensor:
        x_mask, task_type, x_task = input
        m = self.mask_loss(x_mask, target, **kwargs)
        
        if task_type == TaskType.NextSent.value:
            s = self.sent_loss(x_task, target_2, **kwargs)
        elif task_type == TaskType.Translate.value or task_type == TaskType.NextWord.value:
            # Next word goes through the same decoder as translation, so it uses the
            # same loss. It previously fell into the `else` branch and contributed
            # nothing, leaving the decoder untrained on next-word batches.
            s = self.s2s_loss(x_task, target_2, **kwargs)
        else: s = 0

        return m + s

In [37]:
# learn.callbacks = [BertTrainer(learn, alpha=2, beta=1)]
# Replace the learner's callbacks with the dataset-rotating trainer.
learn.callbacks = [BertTrainer(learn, ns_data, s2s_data, nw_data)]

In [38]:
# Argument order is (mask_loss, sent_loss, s2s_loss). The mask and seq2seq losses
# ignore pad targets; the next-sentence loss has no ignore_index.
learn.loss_func = BertLoss(CrossEntropyFlat(ignore_index=vocab.pad_idx), CrossEntropyFlat(), CrossEntropyFlat(ignore_index=vocab.pad_idx))

In [39]:
def acc_ignore_pad(input:Tensor, targ:Tensor, pad_idx)->Rank0Tensor:
    "Accuracy of `input.argmax(dim=-1)` against `targ`, counting only positions where `targ != pad_idx`."
    n_rows = targ.shape[0]
    preds = input.argmax(dim=-1).view(n_rows, -1)
    labels = targ.view(n_rows, -1)
    keep = labels != pad_idx
    hits = preds[keep] == labels[keep]
    return hits.float().mean()

def mask_acc(input:Tensor, t1:Tensor, t2:Tensor)->Rank0Tensor:
    "Masked-token accuracy: first model output against `t1`, ignoring pad targets. `t2` is unused."
    mask_output = input[0]
    return acc_ignore_pad(mask_output, t1, vocab.pad_idx)

def s2s_acc(input:Tensor, t1:Tensor, t2:Tensor)->Rank0Tensor:
    "Translation accuracy against `t2` (pad ignored); `tensor(0)` for batches of any other task."
    _, task_type, task_output = input
    if task_type == TaskType.Translate.value:
        return acc_ignore_pad(task_output, t2, vocab.pad_idx)
    return torch.tensor(0)

def nw_acc(input:Tensor, t1:Tensor, t2:Tensor)->Rank0Tensor:
    "Next-word accuracy against `t2` (pad ignored); `tensor(0)` for batches of any other task."
    _, task_type, task_output = input
    if task_type == TaskType.NextWord.value:
        return acc_ignore_pad(task_output, t2, vocab.pad_idx)
    return torch.tensor(0)

def ns_acc(input:Tensor, t1:Tensor, t2:Tensor)->Rank0Tensor:
    "Next-sentence accuracy of the last model output against `t2`; `tensor(0)` for batches of any other task."
    _, task_type, task_output = input
    if task_type == TaskType.NextSent.value:
        return accuracy(task_output, t2)
    return torch.tensor(0)

In [40]:
# One metric per task; each task metric returns tensor(0) on batches of another task.
learn.metrics = [mask_acc, ns_acc, s2s_acc, nw_acc]

In [44]:
# Sanity-check the untrained model. The recorded output shows the BertTrainer
# callback firing here as well and switching the learner to the next-sentence data.
learn.validate()

Switching to next sentence data


[5.561759, tensor(0.0982), tensor(0), tensor(0), tensor(0.0001)]

In [50]:
# cur_idx = 0
# a,b = None, None
# for i,(xb,yb) in progress_bar(enumerate(iter(data.train_dl)), total=len(data.train_dl)):
#     cur_idx = i
#     a,b = xb,yb
#     if i == 273: break
#     learn.model(*xb)
# cur_idx
# a[0].min(), a[0].max(), a[2].min(), a[2].max()

In [51]:
# learn.lr_find()
# learn.recorder.plot()

In [43]:
# Inspect one batch of whichever dataset the learner currently holds.
learn.data.one_batch()

([tensor([[ 17,   8, 143,  ...,  71, 143,   4],
          [151,   8, 143,  ...,  67, 143,  67],
          [  4,   8, 143,  ...,   4, 143,  69],
          [151,   8, 143,  ...,  70, 143,  70]]), 2],
 [tensor([[151,   1,   1,  ...,   1,   1,  71],
          [  1,   1,   1,  ...,   1,   1,  67],
          [151,   1,   1,  ...,  69,   1,   1],
          [  1,   1,   1,  ...,   1,   1,   1]]),
  tensor([[0, 0, 0,  ..., 1, 1, 1],
          [0, 0, 0,  ..., 1, 1, 1],
          [0, 0, 0,  ..., 1, 1, 1],
          [0, 0, 0,  ..., 1, 1, 1]])])

In [45]:
# Train for three epochs; the callback changes the dataset after each one.
# NOTE(review): the recorded run stops with a CUDA out-of-memory error right after
# "Switching to next word data", so the third epoch never completed.
learn.fit_one_cycle(3, 1e-4)

epoch,train_loss,valid_loss,mask_acc,ns_acc,s2s_acc,nw_acc,time
0,3.157253,2.958767,0.53483,0.523118,0.0,0.0,03:19
1,5.328435,4.981651,0.521317,0.0,0.180515,0.0,00:40


Switching to translate data
Switching to next word data


RuntimeError: CUDA out of memory. Tried to allocate 130.00 MiB (GPU 0; 15.75 GiB total capacity; 14.83 GiB already allocated; 36.88 MiB free; 179.20 MiB cached)