In [1]:
# Optimizers

# Task 1: Adam Optimizer Overview
# Given parameter ( \theta = 0.5 ), use Adam to adaptively update this parameter. Assume ( m_t = 0, v_t = 0, \beta_1 = 0.9, \beta_2 = 0.999, \epsilon = 10^{-8} ).

# Task 2: Exponential Moving Averages (EMA)
# Calculate EMA over gradients with (\beta_1 = 0.9) for (g_1 = -0.5) and (g_2 = 0.2).
    
# Task 3: Adam Update Rule
# Perform an Adam update for ( t=2 ), where (g_1 = -0.5), (g_2 = 0.2), with given ( \beta_1 ) and ( \beta_2 ).
import numpy as np

# Title: Optimizers

# Task 1: Adam Optimizer Overview
# Given parameter θ = 0.5, use Adam to adaptively update this parameter.
# Assume: m_t = 0, v_t = 0, β_1 = 0.9, β_2 = 0.999, ε = 10^-8

def adam_optimizer(theta, m_t, v_t, g_t, beta_1=0.9, beta_2=0.999, epsilon=1e-8, t=1, learning_rate=0.001):
    # Update biased first moment estimate
    m_t = beta_1 * m_t + (1 - beta_1) * g_t
    
    # Update biased second moment estimate
    v_t = beta_2 * v_t + (1 - beta_2) * (g_t ** 2)
    
    # Correct bias in first moment estimate
    m_hat = m_t / (1 - beta_1 ** t)
    
    # Correct bias in second moment estimate
    v_hat = v_t / (1 - beta_2 ** t)
    
    # Update parameter θ using Adam formula
    theta_new = theta - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
    
    return theta_new, m_t, v_t

# Initial values
theta_initial = 0.5
m_t_initial = 0
v_t_initial = 0
g_t = -0.5  # Gradient at time t
learning_rate = 0.001

# Perform one step of Adam update for t=1
theta_updated, m_t_updated, v_t_updated = adam_optimizer(theta_initial, m_t_initial, v_t_initial, g_t, t=1, learning_rate=learning_rate)
print(f"Updated θ using Adam: {theta_updated}")

# Task 2: Exponential Moving Averages (EMA)
# Calculate EMA over gradients with β_1 = 0.9 for g_1 = -0.5 and g_2 = 0.2

def exponential_moving_average(g1, g2, beta_1=0.9):
    # EMA for two gradients (g_1 and g_2)
    ema_1 = beta_1 * g1 + (1 - beta_1) * g1  # EMA for first gradient
    ema_2 = beta_1 * g2 + (1 - beta_1) * g2  # EMA for second gradient
    
    return ema_1, ema_2

g_1 = -0.5
g_2 = 0.2
ema_1, ema_2 = exponential_moving_average(g_1, g_2, beta_1=0.9)
print(f"EMA for g1: {ema_1}, EMA for g2: {ema_2}")

# Task 3: Adam Update Rule
# Perform an Adam update for t=2, where g_1 = -0.5, g_2 = 0.2 with given β_1 and β_2.

# Perform one more step of Adam update for t=2
theta_updated_t2, m_t_updated_t2, v_t_updated_t2 = adam_optimizer(theta_updated, m_t_updated, v_t_updated, g_t, t=2, learning_rate=learning_rate)
print(f"Updated θ after second Adam step (t=2): {theta_updated_t2}")


Updated θ using Adam: 0.50099999998
EMA for g1: -0.5, EMA for g2: 0.2
Updated θ after second Adam step (t=2): 0.50199999996
