In [None]:
#default_exp experimental

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
from fastai.text.all import *
from fastai.callback import *

from transformers_sandbox.all import *

# Experimental blocks and features
> A place where things are developed before moving to the relevant modules

In [None]:
#export
class CharLMConfig(ConfigBase):
    "Default hyperparameters for quick char-level LM experiments built on `TransformerLM`"
    _model = TransformerLM
    # Default values; the same mapping is handed to `update_sig` below.
    _d = dict(
        vocab_sz=256,
        d_model=512,
        n_layers=6,
        n_heads=8,
        d_ff=4096,
        attn_dropout=0.1,
        ff_dropout=0.1,
        emb_dropout=0.1,
        tie_weights=True,
        causal=True,
        pos_enc='absolute',
        max_seq_len=512,
        axial_shape=None,
        axial_emb_dims=None,
        pad_idx=None,
        prenorm=False,
        attn_bias=False,
        shared_qk=False,
    )
    @update_sig(_d)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

## FixUp init

### Transformer w/o LayerNorm

In [None]:
#export
class TransformerEncoderBlockNLN(Module):
    """
    Basic transformer encoder block without normalization layers.

    Chains two sublayers, each wrapped in `Residual`: multi-head `Attention`
    followed by a position-wise `FeedForward`.

    `prenorm` is accepted in the signature but is not used by this block.
    """
    def __init__(self,
                 d_model:int,
                 n_heads:int = 8,
                 d_ff:int = None,
                 attn_dropout:float = 0.1,
                 ff_dropout:float = 0.1,
                 causal:bool = False,
                 attn_bias:bool = False,
                 prenorm:bool=False,
                 shared_qk:bool=False):
        store_attr('attn_dropout') # TODO: maybe a separate `attn_post_dropout` argument
        # Attention is built before the feed-forward net; keep this order so
        # parameter creation (and RNG consumption) stays the same.
        attention = Attention(d_model, n_heads=n_heads, causal=causal, dropout=attn_dropout,
                              bias=attn_bias, shared_qk=shared_qk)
        self.attn = Residual(attention)
        feed_forward = FeedForward(d_model, d_ff=d_ff, dropout=ff_dropout)
        self.ff = Residual(feed_forward)

    def forward(self, x, mask=None):
        "Apply attention (with the optional `mask`), then the feed-forward sublayer"
        attended = self.attn(x, mask=mask)
        return self.ff(attended)

In [None]:
# Shape check: the block must map [bs, sl, d] -> [bs, sl, d]
bs, sl, d = 4, 128, 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockNLN(d)
out = m(x)
assert out.shape == (bs, sl, d)
out.shape

torch.Size([4, 128, 64])

In [None]:
#export
class TransformerEncoderNLN(Module):
    """
    Stack of `n_layers` `TransformerEncoderBlockNLN` blocks applied in sequence.

    `final_norm`, when given, is called as `final_norm(d_model)` and the
    resulting module is applied to the output of the last block.
    """
    def __init__(self,
                 d_model,
                 n_layers=6,
                 n_heads=8,
                 d_ff=None,
                 ff_dropout=0.1,
                 attn_dropout=0.1,
                 attn_bias=False,
                 causal=False,
                 prenorm=False,
                 shared_qk:bool=False,
                 final_norm=None):
        store_attr('d_model')
        # Every block receives the same hyperparameters.
        block_kwargs = dict(causal=causal, d_ff=d_ff, attn_dropout=attn_dropout,
                            ff_dropout=ff_dropout, prenorm=prenorm,
                            attn_bias=attn_bias, shared_qk=shared_qk)
        self.layers = nn.ModuleList(
            [TransformerEncoderBlockNLN(d_model, n_heads, **block_kwargs) for _ in range(n_layers)])
        self.norm = final_norm(d_model) if final_norm is not None else None

    def forward(self, x, mask=None):
        "Run `x` through every block, passing `mask` to each, then the optional final norm"
        for block in self.layers:
            x = block(x, mask=mask)
        return x if self.norm is None else self.norm(x)

In [None]:
# Shape check for a 2-layer stack; reuses `bs`, `sl`, `d` from the previous test cell
x = torch.randn(bs, sl, d)
m = TransformerEncoderNLN(d, n_layers=2)
out = m(x)
assert out.shape == (bs, sl, d)
out.shape

torch.Size([4, 128, 64])

In [None]:
#hide
m

TransformerEncoderNLN(
  (layers): ModuleList(
    (0): TransformerEncoderBlockNLN(
      (attn): Residual(
        (sublayer): Attention(
          (in_proj): AttnInProjV2(
            (to_q): Linear(in_features=64, out_features=64, bias=False)
            (to_kv): Linear(in_features=64, out_features=128, bias=False)
          )
          (attn): ScaledDotProdAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (out_proj): Linear(in_features=64, out_features=64, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (ff): Residual(
        (sublayer): FeedForward(
          (net): Sequential(
            (0): Linear(in_features=64, out_features=256, bias=True)
            (1): GELU()
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=256, out_features=64, bias=True)
            (4): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (1): TransformerEncoderBlockNLN(
     

In [None]:
#export
class TransformerLMNLN(Module, LMMixin):
    """
    Transformer language model built on `TransformerEncoderNLN`
    (encoder blocks without normalization; no final norm is passed either).

    Parameters:
        * vocab_sz: int - vocabulary size
        * d_model: int - inner dimension of the model
        * n_layers: int (default: 6)
        * n_heads: int (default: 8)
        * d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
        * attn_dropout: float - attention dropout
        * ff_dropout: float - feed-forward dropout
        * emb_dropout: float - embedding dropout
        * tie_weights: bool - if True the output projection reuses `self.emb.emb.weight`
        * causal: bool (default: True) - forwarded to the encoder's attention layers
        * pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
        * max_seq_len: int (default: 512)
        * axial_shape: tuple - [optional] should be factors of max_seq_len
        * axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model
        * pad_idx: int - padding token id; only stored on the instance by this class
        * prenorm: bool - forwarded to the encoder (the NLN blocks in this file do not use it)
        * attn_bias: bool - whether to allow biases in attention projection layers
        * shared_qk: bool - forwarded to the attention layers
    Inputs:
        * x - input ids, shape [bs, sl]
        * mask - optional boolean mask, shape [bs, sl]
    Returns:
        * logits - target token logits, shape [bs, sl, vocab_sz]
    """
    def __init__(self,
                 vocab_sz:int,
                 d_model:int,
                 n_layers:int=6,
                 n_heads:int=8,
                 d_ff:int=None,
                 attn_dropout:float=0.1,
                 ff_dropout:float=0.1,
                 emb_dropout:float=0.1,
                 tie_weights:bool=True,
                 causal:bool=True,
                 pos_enc:str='absolute',
                 max_seq_len:int=512,
                 axial_shape:tuple=None,
                 axial_emb_dims:tuple=None,
                 pad_idx:int=None,
                 prenorm:bool=False,
                 attn_bias:bool=False,
                 shared_qk:bool=False):
        # Must run first: stores every constructor argument on `self`.
        store_attr()
        emb_kwargs = dict(dropout=emb_dropout, pos_enc=pos_enc,
                          axial_shape=axial_shape, axial_emb_dims=axial_emb_dims)
        self.emb = TransformerEmbedding(vocab_sz, d_model, max_seq_len, **emb_kwargs)
        encoder_kwargs = dict(causal=causal, d_ff=d_ff, attn_dropout=attn_dropout,
                              ff_dropout=ff_dropout, prenorm=prenorm, attn_bias=attn_bias,
                              shared_qk=shared_qk, final_norm=None)
        self.encoder = TransformerEncoderNLN(d_model, n_layers, n_heads, **encoder_kwargs)
        self.proj = nn.Linear(d_model, vocab_sz)
        if tie_weights:
            # Share one parameter between the token embedding and the output projection.
            self.proj.weight = self.emb.emb.weight

    def forward(self, x, mask=None):
        "Embed token ids, encode them and project to vocabulary logits"
        embedded = self.emb(x)
        encoded = self.encoder(embedded, mask=mask)
        return self.proj(encoded)

In [None]:
# Shape check: token ids [bs, sl] -> logits [bs, sl, vocab_sz]
bs, sl, d = 4, 128, 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMNLN(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert out.shape == (bs, sl, vocab_sz)

### init function

In [None]:
#export
def fixup_init(model):
    """
    Applies FixUp initialization to LM (proto ver).

    Rescales, in place, the weights of every layer in `model.encoder.layers`
    and zeroes the output projection `model.proj`. Each encoder layer counts
    as two residual blocks (attention + feed-forward).

    NOTE(review): when `model.proj.weight` is tied to the embedding weight,
    zeroing it also zeroes the embedding - confirm this is intended.
    """
    n_blocks = 2 * len(model.encoder.layers)
    for layer in model.encoder.layers:
        attention = layer.attn.sublayer
        ff_net = layer.ff.sublayer.net

        # Attention: shrink the input projections, zero the output projection.
        attention.in_proj.to_q.weight.data.mul_(n_blocks ** (-1/2))
        attention.in_proj.to_kv.weight.data.mul_(n_blocks ** (-1/4))
        attention.out_proj.weight.data.mul_(0.)

        # Feed-forward: index 0 is the first linear layer, index 3 the second.
        first_fc, second_fc = ff_net[0], ff_net[3]
        first_fc.weight.data.mul_(n_blocks ** -0.5)
        first_fc.bias.data.zero_()
        second_fc.weight.data.zero_()
        second_fc.bias.data.zero_()

    head = model.proj
    head.weight.data.zero_()
    head.bias.data.zero_()

### Scales and Shifts

In [None]:
#export
class Scale(Module):
    "Multiplies the input by a single learnable scalar, initialised to `scale`"
    def __init__(self, scale=1.):
        initial = torch.ones(1) * scale
        self.scale = torch.nn.Parameter(initial)

    def forward(self, x):
        scaled = x * self.scale
        return scaled

class Shift(Module):
    "Adds a single learnable scalar bias, initialised to zero, to the input"
    def __init__(self):
        initial = torch.zeros(1)
        self.bias = torch.nn.Parameter(initial)

    def forward(self, x):
        shifted = x + self.bias
        return shifted

class ShiftScale(Module):
    """
    Wraps `sublayer` with a learnable scalar `Shift` before it and a learnable
    scalar `Scale` after it.

    Parameters:
        * sublayer - module to wrap; called as `sublayer(x, **kwargs)`
        * scale: float (default: 1.) - initial value of the output `Scale`
    """
    def __init__(self, sublayer, scale=1.):
        self.sublayer = sublayer
        self.shift = Shift()
        # Forward `scale` to `Scale`: previously the argument was accepted but
        # ignored, so the multiplier always started at 1.
        self.scale = Scale(scale)

    def forward(self, x, **kwargs):
        "Shift the input, apply the wrapped sublayer (forwarding `kwargs`), then scale the result"
        x = self.shift(x)
        x = self.sublayer(x, **kwargs)
        return self.scale(x)


In [None]:
#export
class FeedForwardFixup(Module):
    """
    FeedForward with shifts and scale for FixUp.

    The net is a named `nn.Sequential`:
    shift1 -> fc1 -> shift2 -> act (GELU) -> dropout1 -> shift3 -> fc2 -> dropout2 -> scale.
    The names `fc1`/`fc2` are relied upon by `fixup_init2`.

    Parameters:
        * d_model: int - input and output dimension
        * d_ff: int - hidden dimension, if None defaults to 4*d_model
        * dropout: float (default: 0.) - probability used by both dropout layers

    This cell is flagged `#export` because the exported
    `TransformerEncoderBlockNLN2` instantiates this class.
    """
    def __init__(self, d_model:int, d_ff:int=None, dropout:float=0.):
        d_ff = default(d_ff, 4 * d_model)
        layers = OrderedDict(
            [('shift1',Shift()),
            ('fc1',nn.Linear(d_model, d_ff)),
            ('shift2',Shift()),
            ('act',nn.GELU()),
            ('dropout1',nn.Dropout(dropout)),
            ('shift3',Shift()),
            ('fc2',nn.Linear(d_ff, d_model)),
            ('dropout2',nn.Dropout(dropout)),
            ('scale',Scale())])
        self.net = nn.Sequential(layers)
        self._init()

    def forward(self, x):
        return self.net(x)

    def _init(self):
        "Xavier-uniform init for every parameter with more than one dimension"
        # Biases and the scalar Shift/Scale parameters are 1-d and keep their defaults.
        for p in self.parameters():
            if p.dim() > 1: nn.init.xavier_uniform_(p)

In [None]:
#export
class TransformerEncoderBlockNLN2(Module):
    """
    Basic transformer encoder block without normalization layers, with FixUp
    shifts and scales.

    Chains two `Residual` sublayers: `Attention` wrapped in `ShiftScale`,
    followed by `FeedForwardFixup`.

    `prenorm` is accepted in the signature but is not used by this block.
    """
    def __init__(self,
                 d_model:int,
                 n_heads:int = 8,
                 d_ff:int = None,
                 attn_dropout:float = 0.1,
                 ff_dropout:float = 0.1,
                 causal:bool = False,
                 attn_bias:bool = False,
                 prenorm:bool=False,
                 shared_qk:bool=False):
        store_attr('attn_dropout') # TODO: maybe a separate `attn_post_dropout` argument
        # Attention is built before the feed-forward net; keep this order so
        # parameter creation (and RNG consumption) stays the same.
        attention = Attention(d_model, n_heads=n_heads, causal=causal, dropout=attn_dropout,
                              bias=attn_bias, shared_qk=shared_qk)
        self.attn = Residual(ShiftScale(attention))
        feed_forward = FeedForwardFixup(d_model, d_ff=d_ff, dropout=ff_dropout)
        self.ff = Residual(feed_forward)

    def forward(self, x, mask=None):
        "Apply attention (with the optional `mask`), then the feed-forward sublayer"
        attended = self.attn(x, mask=mask)
        return self.ff(attended)


In [None]:
# Shape check: the block must map [bs, sl, d] -> [bs, sl, d]
bs, sl, d = 4, 128, 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockNLN2(d)
out = m(x)
assert out.shape == (bs, sl, d)
out.shape

torch.Size([4, 128, 64])

In [None]:
#export
class TransformerEncoderNLN2(Module):
    """
    Stack of `n_layers` `TransformerEncoderBlockNLN2` blocks applied in sequence.

    `final_norm`, when given, is called as `final_norm(d_model)` and the
    resulting module is applied to the output of the last block.
    """
    def __init__(self,
                 d_model,
                 n_layers=6,
                 n_heads=8,
                 d_ff=None,
                 ff_dropout=0.1,
                 attn_dropout=0.1,
                 attn_bias=False,
                 causal=False,
                 prenorm=False,
                 shared_qk:bool=False,
                 final_norm=None):
        store_attr('d_model')
        # Every block receives the same hyperparameters.
        block_kwargs = dict(causal=causal, d_ff=d_ff, attn_dropout=attn_dropout,
                            ff_dropout=ff_dropout, prenorm=prenorm,
                            attn_bias=attn_bias, shared_qk=shared_qk)
        self.layers = nn.ModuleList(
            [TransformerEncoderBlockNLN2(d_model, n_heads, **block_kwargs) for _ in range(n_layers)])
        self.norm = final_norm(d_model) if final_norm is not None else None

    def forward(self, x, mask=None):
        "Run `x` through every block, passing `mask` to each, then the optional final norm"
        for block in self.layers:
            x = block(x, mask=mask)
        return x if self.norm is None else self.norm(x)

In [None]:
# Shape check for a 2-layer stack; reuses `bs`, `sl`, `d` from the previous test cell
x = torch.randn(bs, sl, d)
m = TransformerEncoderNLN2(d, n_layers=2)
out = m(x)
assert out.shape == (bs, sl, d)
out.shape

torch.Size([4, 128, 64])

In [None]:
#hide
m

TransformerEncoderNLN2(
  (layers): ModuleList(
    (0): TransformerEncoderBlockNLN2(
      (attn): Residual(
        (sublayer): ShiftScale(
          (sublayer): Attention(
            (in_proj): AttnInProjV2(
              (to_q): Linear(in_features=64, out_features=64, bias=False)
              (to_kv): Linear(in_features=64, out_features=128, bias=False)
            )
            (attn): ScaledDotProdAttention(
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (out_proj): Linear(in_features=64, out_features=64, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (shift): Shift()
          (scale): Scale()
        )
      )
      (ff): Residual(
        (sublayer): FeedForwardFixup(
          (net): Sequential(
            (shift1): Shift()
            (fc1): Linear(in_features=64, out_features=256, bias=True)
            (shift2): Shift()
            (act): GELU()
            (dropout1): Dropout(p=0.1, inplace=False

In [None]:
#export
class TransformerLMNLN2(Module, LMMixin):
    """
    Transformer language model built on `TransformerEncoderNLN2`
    (encoder blocks without normalization, with FixUp shifts/scales; no final
    norm is passed to the encoder).

    Parameters:
        * vocab_sz: int - vocabulary size
        * d_model: int - inner dimension of the model
        * n_layers: int (default: 6)
        * n_heads: int (default: 8)
        * d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
        * attn_dropout: float - attention dropout
        * ff_dropout: float - feed-forward dropout
        * emb_dropout: float - embedding dropout
        * tie_weights: bool - if True the output projection reuses `self.emb.emb.weight`
        * causal: bool (default: True) - forwarded to the encoder's attention layers
        * pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
        * max_seq_len: int (default: 512)
        * axial_shape: tuple - [optional] should be factors of max_seq_len
        * axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model
        * pad_idx: int - padding token id; only stored on the instance by this class
        * prenorm: bool - forwarded to the encoder (the NLN2 blocks in this file do not use it)
        * attn_bias: bool - whether to allow biases in attention projection layers
        * shared_qk: bool - forwarded to the attention layers
    Inputs:
        * x - input ids, shape [bs, sl]
        * mask - optional boolean mask, shape [bs, sl]
    Returns:
        * logits - target token logits, shape [bs, sl, vocab_sz]
    """
    def __init__(self,
                 vocab_sz:int,
                 d_model:int,
                 n_layers:int=6,
                 n_heads:int=8,
                 d_ff:int=None,
                 attn_dropout:float=0.1,
                 ff_dropout:float=0.1,
                 emb_dropout:float=0.1,
                 tie_weights:bool=True,
                 causal:bool=True,
                 pos_enc:str='absolute',
                 max_seq_len:int=512,
                 axial_shape:tuple=None,
                 axial_emb_dims:tuple=None,
                 pad_idx:int=None,
                 prenorm:bool=False,
                 attn_bias:bool=False,
                 shared_qk:bool=False):
        # Must run first: stores every constructor argument on `self`.
        store_attr()
        emb_kwargs = dict(dropout=emb_dropout, pos_enc=pos_enc,
                          axial_shape=axial_shape, axial_emb_dims=axial_emb_dims)
        self.emb = TransformerEmbedding(vocab_sz, d_model, max_seq_len, **emb_kwargs)
        encoder_kwargs = dict(causal=causal, d_ff=d_ff, attn_dropout=attn_dropout,
                              ff_dropout=ff_dropout, prenorm=prenorm, attn_bias=attn_bias,
                              shared_qk=shared_qk, final_norm=None)
        self.encoder = TransformerEncoderNLN2(d_model, n_layers, n_heads, **encoder_kwargs)
        self.proj = nn.Linear(d_model, vocab_sz)
        if tie_weights:
            # Share one parameter between the token embedding and the output projection.
            self.proj.weight = self.emb.emb.weight

    def forward(self, x, mask=None):
        "Embed token ids, encode them and project to vocabulary logits"
        embedded = self.emb(x)
        encoded = self.encoder(embedded, mask=mask)
        return self.proj(encoded)

In [None]:
# Shape check: token ids [bs, sl] -> logits [bs, sl, vocab_sz]
bs, sl, d = 4, 128, 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMNLN2(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert out.shape == (bs, sl, vocab_sz)

In [None]:
#export
def fixup_init2(model):
    """
    Applies FixUp initialization to LM (proto ver).

    Variant for models whose attention is wrapped in `ShiftScale` (hence the
    extra `.sublayer` hop) and whose feed-forward net exposes `fc1`/`fc2` by
    name. Rescales the encoder weights in place and zeroes `model.proj`.

    NOTE(review): when `model.proj.weight` is tied to the embedding weight,
    zeroing it also zeroes the embedding - confirm this is intended.
    """
    n_blocks = 2 * len(model.encoder.layers)
    for layer in model.encoder.layers:
        attention = layer.attn.sublayer.sublayer
        ff_net = layer.ff.sublayer.net

        # TODO: open question - is -1/6 right here, or should it be -1/2 ; -1/4?
        attention.in_proj.to_q.weight.data.mul_(n_blocks ** (-1/6))
        attention.in_proj.to_kv.weight.data.mul_(n_blocks ** (-1/6))
        attention.out_proj.weight.data.mul_(0.)

        ff_net.fc1.weight.data.mul_(n_blocks ** -0.5)
        ff_net.fc1.bias.data.zero_()
        ff_net.fc2.weight.data.zero_()
        ff_net.fc2.bias.data.zero_()

    head = model.proj
    head.weight.data.zero_()
    head.bias.data.zero_()

## ADMIN init

### setup

In [None]:
#export
class AdminResidual(Module):
    """
    Residual connection whose skip branch is multiplied element-wise by a
    learnable vector `w` of size `d_model` (initialised to ones):
    `x * w + sublayer(x)`.
    """
    def __init__(self, sublayer, d_model):
        self.sublayer = sublayer
        self.w = torch.nn.Parameter(torch.ones(d_model))

    def forward(self, x, *args, **kwargs):
        "Extra positional and keyword arguments are forwarded to `sublayer`"
        skip = x * self.w
        return skip + self.sublayer(x, *args, **kwargs)


In [None]:
#export
class TransformerEncoderBlockAdmin(Module):
    """
    Basic transformer encoder block for ADMIN initialization.

    Chains multi-head `Attention` and a position-wise `FeedForward`, each
    wrapped in an `AdminResidual` and then in `PostNorm`.

    `prenorm` is accepted in the signature but is not used by this block.
    """
    def __init__(self,
                 d_model:int,
                 n_heads:int = 8,
                 d_ff:int = None,
                 attn_dropout:float = 0.1,
                 ff_dropout:float = 0.1,
                 causal:bool = False,
                 attn_bias:bool = False,
                 prenorm:bool=False,
                 shared_qk:bool=False):
        store_attr('attn_dropout') # TODO: maybe a separate `attn_post_dropout` argument

        # Attention branch is built first; keep this order so parameter
        # creation (and RNG consumption) stays the same.
        attention = Attention(d_model, n_heads=n_heads, causal=causal, dropout=attn_dropout,
                              bias=attn_bias, shared_qk=shared_qk)
        self.attn = PostNorm(d_model, AdminResidual(attention, d_model))
        feed_forward = FeedForward(d_model, d_ff=d_ff, dropout=ff_dropout)
        self.ff = PostNorm(d_model, AdminResidual(feed_forward, d_model))

    def forward(self, x, mask=None):
        "Apply attention (with the optional `mask`), then the feed-forward sublayer"
        attended = self.attn(x, mask=mask)
        return self.ff(attended)

In [None]:
# Shape check: the block must map [bs, sl, d] -> [bs, sl, d]
bs, sl, d = 4, 128, 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockAdmin(d)
out = m(x)
assert out.shape == (bs, sl, d)
out.shape

torch.Size([4, 128, 64])

In [None]:
#export
class TransformerEncoderAdmin(Module):
    """
    Stack of `n_layers` `TransformerEncoderBlockAdmin` blocks applied in sequence.

    `final_norm`, when given, is called as `final_norm(d_model)` and the
    resulting module is applied to the output of the last block.
    """
    def __init__(self,
                 d_model,
                 n_layers=6,
                 n_heads=8,
                 d_ff=None,
                 ff_dropout=0.1,
                 attn_dropout=0.1,
                 attn_bias=False,
                 causal=False,
                 prenorm=False,
                 shared_qk:bool=False,
                 final_norm=None):
        store_attr('d_model')
        # Every block receives the same hyperparameters.
        block_kwargs = dict(causal=causal, d_ff=d_ff, attn_dropout=attn_dropout,
                            ff_dropout=ff_dropout, prenorm=prenorm,
                            attn_bias=attn_bias, shared_qk=shared_qk)
        self.layers = nn.ModuleList(
            [TransformerEncoderBlockAdmin(d_model, n_heads, **block_kwargs) for _ in range(n_layers)])
        self.norm = final_norm(d_model) if final_norm is not None else None

    def forward(self, x, mask=None):
        "Run `x` through every block, passing `mask` to each, then the optional final norm"
        for block in self.layers:
            x = block(x, mask=mask)
        return x if self.norm is None else self.norm(x)

In [None]:
#export
class TransformerLMAdmin(Module, LMMixin):
    """
    Transformer language model built on `TransformerEncoderAdmin`
    (post-norm blocks with `AdminResidual` connections; no final norm is
    passed to the encoder).

    Parameters:
        * vocab_sz: int - vocabulary size
        * d_model: int - inner dimension of the model
        * n_layers: int (default: 6)
        * n_heads: int (default: 8)
        * d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
        * attn_dropout: float - attention dropout
        * ff_dropout: float - feed-forward dropout
        * emb_dropout: float - embedding dropout
        * tie_weights: bool - if True the output projection reuses `self.emb.emb.weight`
        * causal: bool (default: True) - forwarded to the encoder's attention layers
        * pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
        * max_seq_len: int (default: 512)
        * axial_shape: tuple - [optional] should be factors of max_seq_len
        * axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model
        * pad_idx: int - padding token id; only stored on the instance by this class
        * prenorm: bool - forwarded to the encoder (the Admin blocks in this file always use `PostNorm`)
        * attn_bias: bool - whether to allow biases in attention projection layers
        * shared_qk: bool - forwarded to the attention layers
    Inputs:
        * x - input ids, shape [bs, sl]
        * mask - optional boolean mask, shape [bs, sl]
    Returns:
        * logits - target token logits, shape [bs, sl, vocab_sz]
    """
    def __init__(self,
                 vocab_sz:int,
                 d_model:int,
                 n_layers:int=6,
                 n_heads:int=8,
                 d_ff:int=None,
                 attn_dropout:float=0.1,
                 ff_dropout:float=0.1,
                 emb_dropout:float=0.1,
                 tie_weights:bool=True,
                 causal:bool=True,
                 pos_enc:str='absolute',
                 max_seq_len:int=512,
                 axial_shape:tuple=None,
                 axial_emb_dims:tuple=None,
                 pad_idx:int=None,
                 prenorm:bool=False,
                 attn_bias:bool=False,
                 shared_qk:bool=False):
        # Must run first: stores every constructor argument on `self`.
        store_attr()
        emb_kwargs = dict(dropout=emb_dropout, pos_enc=pos_enc,
                          axial_shape=axial_shape, axial_emb_dims=axial_emb_dims)
        self.emb = TransformerEmbedding(vocab_sz, d_model, max_seq_len, **emb_kwargs)
        encoder_kwargs = dict(causal=causal, d_ff=d_ff, attn_dropout=attn_dropout,
                              ff_dropout=ff_dropout, prenorm=prenorm, attn_bias=attn_bias,
                              shared_qk=shared_qk, final_norm=None)
        self.encoder = TransformerEncoderAdmin(d_model, n_layers, n_heads, **encoder_kwargs)
        self.proj = nn.Linear(d_model, vocab_sz)
        if tie_weights:
            # Share one parameter between the token embedding and the output projection.
            self.proj.weight = self.emb.emb.weight

    def forward(self, x, mask=None):
        "Embed token ids, encode them and project to vocabulary logits"
        embedded = self.emb(x)
        encoded = self.encoder(embedded, mask=mask)
        return self.proj(encoded)


### profiling

In [None]:
#export
class BreakFitCallback(Callback):
    """
    Cancels fit after one batch before weight update.

    Each hook raises the next cancellation exception:
    `before_step` clears the gradients and raises `CancelStepException`,
    `after_step` raises `CancelBatchException`, and `after_batch` prints a
    message and raises `CancelFitException`.
    """
    # The docstring must be the first statement of the class body; it used to
    # follow `order`, which left the class without a `__doc__`.
    order=-1
    def before_step(self):
        # Drop the gradients, then skip the step.
        self.model.zero_grad(set_to_none=True)
        raise CancelStepException
    def after_step(self):
        raise CancelBatchException
    def after_batch(self):
        print('Fit canceled')
        raise CancelFitException

In [None]:
#export
def res_submodules(model):
    """
    Return the `sublayer` of every `AdminResidual` found in `model`.

    Parameters:
        * model - module to search; traversed with `model.modules()`
    Returns:
        * list of wrapped sublayers, in `model.modules()` order
    """
    # Use the `model` argument: the previous version read the notebook-global
    # `learn.model`, ignoring whatever the caller passed in.
    return [m.sublayer for m in model.modules() if isinstance(m, AdminResidual)]

def res_modules(model):
    """
    Return every `AdminResidual` module found in `model`.

    Parameters:
        * model - module to search; traversed with `model.modules()`
    Returns:
        * list of `AdminResidual` instances, in `model.modules()` order
    """
    # Use the `model` argument: the previous version read the notebook-global
    # `learn.model`, ignoring whatever the caller passed in.
    return [m for m in model.modules() if isinstance(m, AdminResidual)]

In [None]:
#skip
#...
# config = CharLMConfig(d_model=512, n_layers=6, max_seq_len=512,
#                       pad_idx=pad_id)

# learn = Learner(dls, TransformerLMAdmin.from_config(config),
#                 loss_func=CrossEntropyLossFlat(ignore_index=pad_id),
#                 cbs = [GradientClip(1.0),
#                        SaveModelCallback(with_opt=True)],
#                 metrics=[accuracy, perplexity, bpc]).to_fp16()
# learn.add_cb(ActivationStats(modules=res_submodules(learn.model)))
# len(learn.activation_stats.modules)

In [None]:
# with learn.added_cbs(BreakFitCallback()), learn.removed_cbs(SaveModelCallback):
#     learn.fit(1, 1e-3)

In [None]:
# learn.activation_stats.stats[0]

In [None]:
#export
def variances(learn):
    """
    Return an array with the squared `'std'` entry of each record in
    `learn.activation_stats.stats[0]`.
    """
    first_records = learn.activation_stats.stats[0]
    squared = [record['std'] ** 2 for record in first_records]
    return np.array(squared)

# variances(learn)

In [None]:
#export
def _init_scales(vars):
    return np.sqrt(np.cumsum(vars))
# scales = _init_scales(variances(learn))
# scales

### initialization

In [None]:
#export
def admin_init(model, scales):
    """
    Multiply, in place, the `w` of each module returned by `res_modules(model)`
    by the matching entry of `scales`. Pairing stops at the shorter sequence.
    """
    for residual, factor in zip(res_modules(model), scales):
        residual.w.data *= factor

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_layers.ipynb.
Converted 02_attention.ipynb.
Converted 03_transformer.ipynb.
Converted 04_reformer.ipynb.
Converted 05_tokenizers.ipynb.
Converted 06_data.ipynb.
Converted 07_metrics.ipynb.
Converted 08_optimizers.ipynb.
Converted 09_tracking.ipynb.
Converted 10_config.ipynb.
Converted 40_experimental.ipynb.
Converted index.ipynb.
