# Cart Pole with Actor Critic

In [2]:
import os

import gym
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten

In [18]:
def create_model(state_space, action_space, actor_lr, critic_lr):
    """Build and compile the actor and critic networks.

    Both models read the same input tensor, and each one is a single Dense
    layer applied directly to the state (there are no hidden layers).

    Args:
        state_space: Number of features in a state vector (an int), or an
            already-formed shape tuple.
        action_space: Number of discrete actions; size of the softmax output.
        actor_lr: Learning rate for the actor's RMSprop optimizer.
        critic_lr: Learning rate for the critic's RMSprop optimizer.

    Returns:
        A tuple ``(actor, critic)`` of compiled Keras models.
    """
    # Give Keras an explicit shape tuple rather than a bare int, while still
    # accepting a tuple/list from callers that already pass one.
    if np.ndim(state_space) == 0:
        input_shape = (int(state_space),)
    else:
        input_shape = tuple(state_space)
    x = Input(shape=input_shape, name='input_state')

    # Actor learns the policy: a probability for each action.
    action = Dense(action_space, activation="softmax",
                   kernel_initializer='he_uniform', name='policy')(x)
    actor = Model(inputs=x, outputs=action, name='actor')
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    actor.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=actor_lr))

    # Critic learns the state-value function: one linear output.
    value = Dense(1, kernel_initializer='he_uniform', name='values')(x)
    critic = Model(inputs=x, outputs=value, name='critic')
    critic.compile(loss='mse', optimizer=RMSprop(learning_rate=critic_lr))

    actor.summary()
    critic.summary()
    return actor, critic

def next_action(actor, state, action_space):
    """Sample an action index from the actor's predicted policy.

    Args:
        actor: Object whose ``predict(state, batch_size=1)`` returns the
            action probabilities for ``state``.
        state: Input forwarded unchanged to ``actor.predict``.
        action_space: Number of actions; an index in ``range(action_space)``
            is drawn.

    Returns:
        The sampled action index.
    """
    probabilities = actor.predict(state, batch_size=1)
    probabilities = probabilities.flatten()
    # Draw a single index, weighted by the policy probabilities.
    sampled = np.random.choice(action_space, 1, p=probabilities)
    return sampled[0]

#### Hyper-parameters

In [4]:
# Training settings.
n_episodes = 200                      # number of episodes to train for
gamma = 0.99                          # discount factor for future rewards
actor_lr, critic_lr = 0.001, 0.005    # optimizer learning rates

# Set to True to draw the environment on every step.
render = False

# Environment and the sizes derived from it.
env = gym.make('CartPole-v1')
state_space = env.observation_space.shape[0]
action_space = env.action_space.n

#### A2C Algorithm

In [17]:
# Build the compiled actor (policy) and critic (value) models.
actor, critic = create_model(
    state_space=state_space,
    action_space=action_space,
    actor_lr=actor_lr,
    critic_lr=critic_lr,
)

ValueError: 'P(s,a)/' is not a valid scope name

In [None]:
# Per-episode bookkeeping, used afterwards to plot the learning curve.
scores = []
episodes = []

for e in range(n_episodes):
    done = False
    score = 0
    # The models are fed one state at a time, so add a leading batch axis:
    # (state_space,) -> (1, state_space). This matches the (1, ...) targets
    # passed to fit() below.
    state = np.reshape(env.reset(), [1, state_space])

    while not done:
        if render:
            env.render()
        action = next_action(actor, state, action_space)
        # NOTE: step() is unpacked as a 4-tuple here (obs, reward, done, info).
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_space])
        # Penalize an episode that ends early. An episode that ends with the
        # running score at 499 is treated as having reached the maximum and
        # keeps its normal reward.
        reward = reward if not done or score == 499 else -100

        # One-sample training targets for the critic and the actor.
        target = np.zeros((1, 1))
        advantages = np.zeros((1, action_space))

        # Index down to scalars so plain numbers (not size-1 arrays) are
        # written into the target matrices.
        value = critic.predict(state)[0][0]
        if done:
            # A finished episode has no future return, so skip the bootstrap.
            td_target = reward
        else:
            next_value = critic.predict(next_state)[0][0]
            td_target = reward + gamma * next_value

        # Only the taken action gets a non-zero advantage (the TD error).
        advantages[0][action] = td_target - value
        target[0][0] = td_target

        actor.fit(state, advantages, epochs=1, verbose=0)
        critic.fit(state, target, epochs=1, verbose=0)

        # Update the score and advance to the next state.
        score += reward
        state = next_state
        if done:
            # Undo the -100 penalty so the logged score reflects steps survived.
            score = score if score == 500.0 else score + 100
            scores.append(score)
            episodes.append(e)

    # Parentheses are required: `e+1 % 50` parses as `e + (1 % 50)`, which
    # would never equal 0, so no checkpoint would ever be written.
    if (e + 1) % 50 == 0:
        # Make sure the checkpoint directory exists before saving into it.
        os.makedirs("./save_model", exist_ok=True)
        actor.save_weights("./save_model/cartpole_actor.h5")
        critic.save_weights("./save_model/cartpole_critic.h5")
        print(f"episode: {e+1}/{n_episodes}, score: {score}")

In [None]:
# Plot the per-episode scores collected by the training loop.
plt.plot(scores)