In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.optimizers import Optimizer
gpu = tf.config.experimental.get_visible_devices('GPU')[0]
tf.config.experimental.set_memory_growth(gpu, enable = True)

In [2]:
tf.__version__

'2.2.0'

In [3]:
import tensorflow_probability as tfp

In [4]:
class AdamCustom(Optimizer):
    def __init__(self, learning_rate = 0.001, beta1= 0.9, beta2= 0.999, use_locking = False, 
                 name = 'Adam_Custom', decay = False, **kwargs):
        super(AdamCustom, self).__init__(name, **kwargs)
        self._name = name
        self._lr = learning_rate
        
        self._beta1 = tf.constant(beta1, dtype = tf.float32)
        self._beta2 = tf.constant(beta2, dtype = tf.float32)
        # tensor versions of the constructor arguments, created in _prepare
        self._lr_t = None
        self._beta1_t = tf.Variable(0, dtype = tf.float32, trainable = False)
        self._beta2_t = tf.Variable(0, dtype = tf.float32, trainable = False)
        self._decay = decay
    def _create_slots(self, var_list):
        # create slots for the first and second moments
        for v in var_list:
            self.add_slot(v, slot_name = 'v', initializer = 'zeros')
            self.add_slot(v, slot_name = 'm', initializer = 'zeros')
    def _resource_apply_dense(self, grad, var, **kw_args):
        var_device, var_dtype = var.device,var.dtype.base_dtype
        lr_t = tf.cast(self._lr, var_dtype)
        beta1 = tf.cast(self._beta1, var_dtype)
        beta2 = tf.cast(self._beta2, var_dtype)
        beta1_t = tf.cast(self._beta1_t, var_dtype)
        beta2_t = tf.cast(self._beta2_t, var_dtype)
        eps = 1e-7
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        m_t = m.assign(beta2 * m + (1- beta2) * grad**2)
        v_t = v.assign(beta1 * v + (1 -beta1) * grad)
        m_refined = m_t /(1 - beta2_t)
        v_refined = v_t /(1 - beta1_t)
        # decay learning rate
        if self._decay:
            step = tf.cast(self.iterations, dtype = tf.float32)
            lr_t = lr_t/tf.sqrt(step+1)
        var_delta = lr_t * v_refined / (tf.sqrt(m_refined) + eps)
        var_update = var.assign_sub(var_delta)
        #print(var_delta.numpy(), m_t.numpy(), v_t.numpy(), m.numpy(), v.numpy())
        self._beta1_t.assign(self._beta1_t * self._beta1)
        self._beta2_t.assign(self._beta2_t * self._beta2)
        return var_update
    def _resource_apply_sparse(self, grad, var):
        raise NotImplementedError('Sparse gradient updates are not supported')   
    def get_config(self, ):
        config = {'name': self._name}
        if self.clipnorm is not None:
            config['clipnorm'] = self.clipnorm
        if self.clipvalur is not None:
            config['clipvalue'] = self.clipvalue
        return config


In [18]:
class AMSCustom(Optimizer):
    def __init__(self, learning_rate = 0.001, beta1= 0.9, beta2= 0.999, use_locking = False, 
                 name = 'Adam_Custom', decay = False, **kwargs):
        super(AMSCustom, self).__init__(name, **kwargs)
        self._name = name
        self._lr = learning_rate
        
        self._beta1 = tf.constant(beta1, dtype = tf.float32)
        self._beta2 = tf.constant(beta2, dtype = tf.float32)
        # tensor versions of the constructor arguments, created in _prepare
        self._lr_t = None
        self._beta1_t = tf.Variable(0, dtype = tf.float32, trainable = False)
        self._beta2_t = tf.Variable(0, dtype = tf.float32, trainable = False)
        self._decay = decay
    def _create_slots(self, var_list):
        # create slots for the first and second moments
        for v in var_list:
            self.add_slot(v, slot_name = 'v', initializer = 'zeros')
            self.add_slot(v, slot_name = 'm', initializer = 'zeros')
            self.add_slot(v, slot_name = 'v_hat')
    def _resource_apply_dense(self, grad, var, **kw_args):
        var_device, var_dtype = var.device,var.dtype.base_dtype
        lr_t = tf.cast(self._lr, var_dtype)
        if self._decay:
            step = tf.cast(self.iterations, dtype = tf.float32)
            lr_t = lr_t/tf.sqrt(step+1)
        beta1 = tf.cast(self._beta1, var_dtype)
        beta2 = tf.cast(self._beta2, var_dtype)
        beta1_t = tf.cast(self._beta1_t, var_dtype)
        beta2_t = tf.cast(self._beta2_t, var_dtype)
        eps = 1e-7
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        v_hat = self.get_slot(var, 'v_hat')
        m_t = m.assign(beta2 * m + (1- beta2) * grad**2)
        v_t = v.assign(beta1 * v + (1 -beta1) * grad)
        m_refined = m_t /(1 - beta2_t)
        v_refined = v_t /(1 - beta1_t)
        # decay learning rate
        m_refined = v_hat.assign(tf.maximum(m_refined, v_hat))
        var_delta = lr_t * v_refined / (tf.sqrt(m_refined) + eps)
        var_update = var.assign_sub(var_delta)
        #print(var_delta.numpy(), m_t.numpy(), v_t.numpy(), m.numpy(), v.numpy())
        self._beta1_t.assign(self._beta1_t * self._beta1)
        self._beta2_t.assign(self._beta2_t * self._beta2)
        return var_update
    def _resource_apply_sparse(self, grad, var):
        raise NotImplementedError('Sparse gradient updates are not supported')   
    def get_config(self, ):
        config = {'name': self._name}
        if self.clipnorm is not None:
            config['clipnorm'] = self.clipnorm
        if self.clipvalur is not None:
            config['clipvalue'] = self.clipvalue
        return config


In [24]:
bernoulli = tfp.distributions.Bernoulli(probs = 0.01, dtype = tf.float32)
def test_optimizer(optimizer_name, iterations = 500000, learning_rate = 0.0001,decay = True):
    x = tf.Variable(0, dtype = tf.float32)
    r = bernoulli.sample()
    if optimizer_name == 'adam':
        optimizer = AdamCustom(learning_rate = 0.001, decay = decay)
    elif optimizer_name == 'ams':
        optimizer = AMSCustom(learning_rate = 0.001, decay = decay)
    results = []
    quan = iterations //10
    for i in range(iterations):
        with tf.GradientTape() as tape:
            tape.watch(x)
            loss = (1010 *r -10*(1-r))*x
        gradient = tape.gradient(loss, x)
        optimizer.apply_gradients([(gradient, x)])
        if  i % quan ==0:
            print('current x,:{}, loss:{}'.format(x.numpy(), loss.numpy()))
        results.append(x.numpy())
        x.assign(tf.clip_by_value(x, -1, 1))
    return results
    

In [25]:
ret1 = test_optimizer('adam',decay = True)

current x,:0.0031622983515262604, loss:-0.0
current x,:0.5610617399215698, loss:-5.610572814941406
current x,:0.7462995648384094, loss:-7.462964057922363
current x,:0.8884552717208862, loss:-8.884527206420898
current x,:1.0000022649765015, loss:-10.0
current x,:1.0000020265579224, loss:-10.0
current x,:1.0000017881393433, loss:-10.0
current x,:1.0000016689300537, loss:-10.0
current x,:1.0000015497207642, loss:-10.0
current x,:1.0000015497207642, loss:-10.0


In [26]:
ret2 = test_optimizer('ams',decay = True)

current x,:0.0031622983515262604, loss:-0.0
current x,:0.5610617399215698, loss:-5.610572814941406
current x,:0.7462995648384094, loss:-7.462964057922363
current x,:0.8884552717208862, loss:-8.884527206420898
current x,:1.0000022649765015, loss:-10.0
current x,:1.0000020265579224, loss:-10.0
current x,:1.0000017881393433, loss:-10.0
current x,:1.0000016689300537, loss:-10.0
current x,:1.0000015497207642, loss:-10.0
current x,:1.0000015497207642, loss:-10.0


In [19]:
optimizer = AMSCustom(learning_rate = 0.1, decay = True)
x = tf.Variable(100, trainable = True,dtype = tf.float32)
y = tf.Variable(10, trainable = True, dtype = tf.float32)
for i in range(100):
    with tf.GradientTape() as tape:
        tape.watch([x,y])
        loss = x**2 - 2*x + 10 + y**2
    gradient = tape.gradient(loss, [x,y])
    _ = optimizer.apply_gradients(zip(gradient, [x, y]))
    print(loss.numpy(), x.numpy(), y.numpy())

9910.001 99.68377 9.68377
9841.262 99.3833 9.383567
9776.325 99.09756 9.09849
9714.912 98.825584 8.827688
9656.771 98.56649 8.570356
9601.672 98.31945 8.325733
9549.394 98.083694 8.093101
9499.742 97.85851 7.8717804
9452.537 97.64325 7.6611323
9407.611 97.43729 7.460552
9364.81 97.24006 7.2694707
9323.993 97.05103 7.087351
9285.031 96.86971 6.9136868
9247.802 96.69564 6.748002
9212.19 96.5284 6.589848
9178.101 96.36758 6.438802
9145.433 96.21281 6.294465
9114.099 96.06375 6.156464
9084.019 95.92008 6.024446
9055.115 95.7815 5.8980794
9027.319 95.64773 5.777053
9000.566 95.5185 5.6610723
8974.795 95.39357 5.549862
8949.946 95.27271 5.4431624
8925.973 95.155716 5.3407297
8902.822 95.04237 5.242334
8880.45 94.9325 5.1477594
8858.813 94.82593 5.0568027
8837.876 94.72248 4.969273
8817.596 94.62201 4.884991
8797.943 94.52437 4.8037877
8778.884 94.42943 4.7255034
8760.389 94.33705 4.649989
8742.428 94.24712 4.5771036
8724.977 94.15953 4.506715
8708.009 94.074165 4.4386973
8691.503 93.99093 4.

In [8]:
x = tf.Variable(100, trainable = True,dtype = tf.float32)
y = tf.Variable(10, trainable = True, dtype = tf.float32)
optimizer1 = tf.keras.optimizers.Adam(learning_rate = 1)
for i in range(100):
    with tf.GradientTape() as tape:
        tape.watch([x,y])
        loss = x**2 - 2*x + 10 - 1/(y**2 + 10)
    gradient = tape.gradient(loss, [x,y])
    _ = optimizer1.apply_gradients(zip(gradient, [x, y]))
    print(loss.numpy(), x.numpy(), y.numpy())

9809.992 98.99997 9.00188
9612.983 98.00024 8.005105
9418.032 97.00098 7.0124235
9225.172 96.00237 6.0268927
9034.43 95.004616 5.0515857
8845.84 94.0079 4.088662
8659.432 93.01241 3.1377559
8475.234 92.01836 2.1934059
8293.273 91.02592 1.2416033
8113.5796 90.035286 0.26432884
7936.183 89.04666 -0.67231077
7761.118 88.06023 -1.3123233
7588.3975 87.07617 -1.6103289
7418.0273 86.09468 -1.6441551
7250.026 85.11594 -1.4788241
7084.408 84.14012 -1.1618159
6921.192 83.167404 -0.7347004
6760.387 82.19797 -0.24562538
6602.01 81.23198 0.2383686
6446.071 80.2696 0.6370436
6292.5737 79.311005 0.89363766
6141.5205 78.35634 0.9932953
5992.9116 77.40576 0.9478514
5846.749 76.45943 0.7807756
5703.0312 75.51747 0.52250916
5561.756 74.58004 0.21155402
5422.923 73.64727 -0.10465741
5286.526 72.71929 -0.37578142
5152.558 71.796234 -0.5620746
5021.01 70.87821 -0.6440052
4891.8687 69.96535 -0.6212798
4765.123 69.057755 -0.50749856
4640.7603 68.15555 -0.32635933
4518.768 67.25883 -0.10963862
4399.1323 66.367

In [9]:
import tensorflow as tf
tf.__version__

'2.2.0'

In [10]:
import six
import abc
@six.add_metaclass(abc.ABCMeta)
class Bass():
    @abc.abstractmethod
    def whatever(self,):
        raise NotImplementedError
class SubClass(Bass):
    def __init__(self,):
        super(SubClass, self).__init__()
    def whatever(self,):
        print('whatever')

In [11]:
import tensorflow as tf

In [12]:
a.whatever()

NameError: name 'a' is not defined

In [None]:
class SubClass1(Bass):
    def __init__(self, ):
        super(SubClass1, self).__init__()
    def test(self):
        pass

In [None]:
c = SubClass1()

In [None]:
b._A__name

In [None]:
dir(b)