In [1]:
import gym as gym




In [2]:
env = gym.make("CartPole-v1")
obs = env.reset()
obs

array([ 0.02535299,  0.04150639, -0.00878413,  0.00702753], dtype=float32)

In [3]:
env.render()

True

In [4]:
env.action_space

Discrete(2)

In [5]:
action = 1
obs, reward, done, info = env.step(action)
obs

array([ 0.02618312,  0.23675321, -0.00864358, -0.28841388], dtype=float32)

In [6]:
reward

1.0

In [7]:
done

False

In [8]:
info

{}

In [9]:
def basic_policy(obs):
    angle = obs[2]
    return 0 if angle < 0 else 1

totals =[]
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(200):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

In [10]:
import numpy as np

In [11]:
np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

(42.368, 8.94810460376945, 24.0, 68.0)

In [41]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [42]:
n_inputs=4 # == env.observation_space.shape[0]

model = Sequential([
    Dense(5, activation ="elu", input_shape = [n_inputs]),
    Dense(1, activation ="sigmoid")
])

In [43]:
def play_one_step(env, obs, model, loss_fn):
    with tf.GradientTape() as tape:
        left_proba = model(obs[np.newaxis])
        action = (tf.random.uniform([1,1]) > left_proba)
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, info = env.step(int(action[0,0].numpy()))
    return obs, reward, done, grads

In [44]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads =[]
    for episode in range(n_episodes):
        current_rewards=[]
        current_grads=[]
        obs=env.reset()
        for step in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_grads)
        all_grads.append(current_grads)
    return all_rewards, all_grads

In [96]:
def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for step in range(len(rewards)-2, -1, -1):
        discounted[step] += discounted[step+1] * discount_factor
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards-reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]


In [97]:
discount_rewards([10,0,-50],discount_factor=0.8)

array([-22, -40, -50])

In [98]:
discount_and_normalize_rewards([[10,0, -50], [10,20]], discount_factor = 0.80)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

In [99]:
n_iter = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.95

In [100]:
optimizer = keras.optimizers.Adam(lr=0.01)
loss_fn = keras.losses.binary_crossentropy

In [101]:
for iter in range(n_iter):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes_per_update, n_max_steps, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)
    
    all_mean_grads =[]
    for var_index in range(len(model.trainable_variables)):
        means_grads = tf.reduce_mean(
        [final_reward * all_grads[episode_index][step][var_index]
         for episode_index, final_rwards in enumerate(all_final_rewards)
            for step, final_reward in enumerate(final_rewards)], axis =0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

InvalidArgumentError: Incompatible shapes: [4,5] vs. [5,1] [Op:AddV2]

In [102]:
env = gym.make("CartPole-v0")
input_shape=[4]
n_outputs= 2

model = Sequential([
    Dense(32, activation="elu", input_shape=input_shape),
    Dense(32, activation="elu"),
    Dense(n_outputs)
])

  logger.warn(


In [103]:
def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand()<epsilon:
        return np.random.randint(2)
    else:
        Q_values = model.predict(state[np.newaxis])
        return np.argmax(Q_values[0])

In [104]:
from collections import deque
replay_buffer = deque(maxlen=2000)

In [105]:
def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    states, actions, rewards, next_states, dones = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(5)
    ]
    return states, actions, rewards, next_states, dones

In [107]:
def play_one_step(env, state, epsilon):
    action= epsilon_greedy_policy(state, epsilon)
    next_state, reward, done, info = env.step(action)
    replay_buffer.append((state,action,reward,next_state,done))
    return next_state,reward,done,info

In [108]:
batch_size=32
discount_factor=0.95
optimizer = keras.optimizers.Adam(lr=1e-3)
loss_fn = keras.losses.mean_squared_error

In [111]:
def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    next_Q_values = model.predict(next_states)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    target_Q_values = (rewards + (1-dones) * discount_factor * max_next_Q_values)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values*mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [112]:
for episode in range(600):
    obs = env.reset()
    for step in range(200):
        epsilon = max(1-episode/500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break
    if episode>50:
        training_step(batch_size)