# MiniBERT + Rowlang

In [1]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt


### 1. Encoder Layers

In [2]:
class EncoderLayer:
    def __init__(self, head):
        pass

### 2. MultiHead Attention

In [139]:
from functools import wraps
from abc import ABC, abstractmethod
from collections import namedtuple


def graph_def(f):
    '''Cache layer (w self.outputs) and autoscope (w self.name)'''
    @wraps(f)
    def wrapper(self, *args, **kwargs):
        if self.outputs is None:
            with tf.variable_scope(self.name):
                self.outputs = f(self, *args, **kwargs)
        return self.outputs
    return wrapper

class Layer(ABC):
    def __init__(self, name):
        self.name = name
        self.outputs = None
        
    @abstractmethod 
    def on(self, *args, **kwargs):
        pass
    
class LinearLayer(Layer):
    def __init__(self, out_dim, name):
        super(LinearLayer, self).__init__(name)
        self.out_dim = out_dim
    
    @graph_def
    def on(self, X):
        return tf.layers.dense(X, self.out_dim, activation=None, name=self.name)

class ScaledDotProdAttentionLayer(Layer):
    def __init__(self, scale, name):
        super(ScaledDotProdAttentionLayer, self).__init__(name)
        self.scale = scale
    @graph_def
    def on(self, Q, K, V):
        '''
        Q: queries [ minibatch x queries x dim_k]
        K: keys    [ minibatch x keys x dim_k]
        V: values  [ minibatch x keys x dim_v]
        '''
        dot = tf.einsum('mqd,mkd->mqk', Q, K, name='dot')            
        scores = tf.nn.softmax(tf.scalar_mul(self.scale, dot), name='scores') 
        A = tf.einsum('mqk,mkd->mqd', scores, V, name='a')
        return A
    
class MultiHeadAttention(Layer):
    def __init__(self, h, d_model, dropout=0.1, name="multihead"):
        '''Implement the multiheaded self attention
        '''
        super(MultiHeadAttention, self).__init__(name)
        self.h = h
        self.d_k = d_model // h
        self.d_model = d_model
        self.attentions = []
        self.heads = []
        Head = namedtuple("Head", ["to_q", "to_k", "to_v"])
        
        for i in range(h):
            q = LinearLayer(self.d_k, "q")
            k = LinearLayer(self.d_k, "k")
            v = LinearLayer(self.d_k, "v")
            self.heads.append(Head(q, k, v))
            
        
        self.A = None
        self.O = None

    @graph_def
    def on(self, X):
        for h in range(self.heads):
            with tf.variable_scope("h{}".format(i)):
                q = h.to_q.on(X)                
                k = h.to_k.on(X)                
                v = h.to_v.on(X)
                scale = 1 / np.sqrt(self.d_k)
                a = ScaledDotProdAttentionLayer(scale, "attn").on(q,k,v)
                self.attentions.append(a)

        self.A = tf.concat(self.attentions, axis=-1, name="A")
        self.O = LinearLayer(self.d_model, "O").on(self.A)
        return self.O
            
                
        

In [140]:
class LayerNormLayer(Layer):
    def __init__(self, name="layernorm"):
        super(LayerNormLayer, self).__init__(name)
        self._eps = 1e-6 # for numerical stability
    
    @graph_def
    def on(self, X):
        """
        X: [minibatch x seq x dims]
        """
        self.mean, self.std = tf.nn.moments(X, axes=-1, keep_dims=True)
        return (X - self.mean)/ (self.std + self._eps)
        
class DropoutLayer(Layer):
    def __init__(self, dropout, name="dropout"):
        super(DropoutLayer, self).__init__(name)
        self.dropout = dropout
    
    @graph_def
    def on(self, X):
        return tf.nn.dropout(X, 1 - self.dropout, name='dropped') 

class EncoderSubLayer(Layer):
    def __init__(self, dropout, sublayer, name, *args, **kwargs):
        super(EncoderSubLayer, self).__init__(name)
        self.dropout = dropout
        self.sublayer = sublayer(*args, **kwargs)
    
    @graph_def
    def on(self, X):
        X_n = LayerNormLayer().on(X)
        return X + DropoutLayer(self.dropout).on(self.sublayer.on(X_n))
    

        
        

In [141]:
tf.reset_default_graph()

DATA_POINTS = 4
SEQ = 3
MODEL_DIM = 5
HEADS = 3
DROPOUT = 0.1
X = tf.placeholder(tf.float32, shape=[None, None, MODEL_DIM])
# with tf.variable_scope("multihead/"):
#     Y = tf.layers.dense(X, 30, use_bias=False)


mha = MultiHeadAttention(HEADS, MODEL_DIM )
ES = EncoderSubLayer(DROPOUT, MultiHeadAttention, "self_attn", 
                     HEADS, MODEL_DIM)

Y = ES.on(X)
Z = ES.sublayer.on(None)


init = tf.group(tf.global_variables_initializer(),
                tf.local_variables_initializer())    
with tf.Session() as sess:
    sess.run(init)
    x_d = np.random.random((DATA_POINTS, SEQ, MODEL_DIM))
    print(x_d)
    y = (sess.run(Y, feed_dict={X:x_d}))
    z = (sess.run(Z, feed_dict={X:x_d}))
    print(z)
    file_writer = tf.summary.FileWriter('./logdir', sess.graph)
    
print(y.shape)
y

TypeError: 'list' object cannot be interpreted as an integer