In [93]:
import numpy as np

In [5]:
# We don't actually need the function, we just need its gradient
def grad(t, C=15):
    if x % 3 == 1:
        return C
    else:
        return -1

In [39]:
# We look at value in [-1, 1], because the function can always be optimized better
def project(x):
    if -1 <= x <= 1:
        return x
    elif x <= -1:
        return -1
    else:
        return 1

In [43]:
num_iterations = 10000
C = 15

## Stochastic gradient descent 
### converges to optimal value of -1

In [72]:
x = 0
lr = 0.0001
for t in range(num_iterations):
    g_t = grad(t, C)
    x = project(x - lr * g_t)

In [73]:
x

-0.9998

## Adam
### With specific values of beta_1 and beta_2
### converges to sub-optimal point of 1

Note, that with default values beta_1 = 0.9 and beta_2 = 0.999 Adam converges to -1 for this exact example with C = 15.
Large values of epsilon (>100) also gets Adam to converge to optimal point -1

In [102]:
m_t = 0
v_t = 0
epsilon = 1e-7
x = 0
beta_1 = 0.0
beta_2 = 1 / (1 + C * C )
step_size = 0.1
v_hat = 0

In [100]:
for t in range(num_iterations):
    
    g_t = grad(t, C)
    m_t = beta_1 * m_t + (1 - beta_1) * g_t
    v_t = beta_2 * v_t + (1 - beta_2) * (g_t * g_t)
    m_hat = m_t / (1 - np.power(beta_1, t + 1))
    v_hat = v_t / (1 - np.power(beta_2, t + 1))
    
    x = project(x - step_size * m_hat / (v_hat + epsilon))

In [101]:
x

1

## AMSGRAD converges to optimal point
## without the need to modify hyper-parameters

In [108]:
m_t = 0
v_t = 0
epsilon = 1e-7
x = 0
beta_1 = 0.0
beta_2 = 1 / (1 + C * C )
step_size = 0.1
v_hat = 0

In [109]:
for t in range(num_iterations):
    
    g_t = grad(t, C)
    m_t = beta_1 * m_t + (1 - beta_1) * g_t
    v_t = beta_2 * v_t + (1 - beta_2) * (g_t * g_t)
    v_hat = np.maximum(v_hat, v_t)
    
    x = project(x - step_size * m_t / (v_hat + epsilon))

In [110]:
x

-0.9991071782072143