This code solves Continuous Mountain Car only if the initial,
essentially random policy reaches the goal at the top of the hill
(reward +100) within the first several episodes. Otherwise, it falls
into the bad local optimum of stopping at the bottom of the hill.
In that case, restart training until the reward is found.
This exploration process could be automated, but it is not
in this version.

In [9]:
import tensorflow as tf
import numpy as np
import gym  #requires OpenAI gym installed
# Create the continuous-action Mountain Car environment used by every
# later step (action bounds, state sampling and the training loop).
env = gym.envs.make("MountainCarContinuous-v0")

# Start from an empty default graph so re-running this cell does not
# stack a second copy of the networks on top of the first.
tf.reset_default_graph()

# Observations have two components; the placeholder takes a batch of them
# with an unspecified batch size.
input_dims = 2
state_placeholder = tf.placeholder(tf.float32, [None, input_dims])

def value_function(state):
    """Build the critic network that estimates the state value V(s).

    The network has two ELU hidden layers of 400 units each followed by a
    single linear output unit. All variables are created inside the
    "value_network" variable scope.

    Args:
        state: float tensor of states, one row per state (the script passes
            ``state_placeholder``).

    Returns:
        The output tensor ``V`` of the final dense layer (one unit per row).
    """
    n_hidden1 = 400
    n_hidden2 = 400
    n_outputs = 1

    with tf.variable_scope("value_network"):
        init_xavier = tf.contrib.layers.xavier_initializer()

        # The initializer has to be passed by keyword: the fourth positional
        # parameter of tf.layers.dense is use_bias, not kernel_initializer.
        hidden1 = tf.layers.dense(
            state, n_hidden1, activation=tf.nn.elu,
            kernel_initializer=init_xavier)
        hidden2 = tf.layers.dense(
            hidden1, n_hidden2, activation=tf.nn.elu,
            kernel_initializer=init_xavier)
        # Linear output layer (no activation) so V is unbounded.
        V = tf.layers.dense(
            hidden2, n_outputs, activation=None,
            kernel_initializer=init_xavier)
    return V


def policy_network(state):
    """Build the actor network: a Gaussian policy over a single action.

    Two ELU hidden layers of 40 units feed two separate linear heads, one
    for the mean and one for the (pre-softplus) standard deviation of a
    normal distribution. All variables are created inside the
    "policy_network" variable scope.

    Args:
        state: float tensor of states, one row per state (the script passes
            ``state_placeholder``).

    Returns:
        A tuple ``(action_tf_var, norm_dist)`` where ``action_tf_var`` is a
        sampled action clipped to the bounds of the module-level ``env``
        action space, and ``norm_dist`` is the underlying Normal
        distribution (used by the caller to compute action probabilities).
    """
    n_hidden1 = 40
    n_hidden2 = 40
    n_outputs = 1

    with tf.variable_scope("policy_network"):
        init_xavier = tf.contrib.layers.xavier_initializer()

        # The initializer has to be passed by keyword: the fourth positional
        # parameter of tf.layers.dense is use_bias, not kernel_initializer.
        hidden1 = tf.layers.dense(
            state, n_hidden1, activation=tf.nn.elu,
            kernel_initializer=init_xavier)
        hidden2 = tf.layers.dense(
            hidden1, n_hidden2, activation=tf.nn.elu,
            kernel_initializer=init_xavier)
        mu = tf.layers.dense(
            hidden2, n_outputs, activation=None,
            kernel_initializer=init_xavier)
        sigma = tf.layers.dense(
            hidden2, n_outputs, activation=None,
            kernel_initializer=init_xavier)
        # softplus keeps sigma positive; the small offset keeps it away
        # from exactly zero.
        sigma = tf.nn.softplus(sigma) + 1e-5
        norm_dist = tf.contrib.distributions.Normal(mu, sigma)
        # sample(1) adds a leading sample axis, which is squeezed away again.
        action_tf_var = tf.squeeze(norm_dist.sample(1), axis=0)
        # Keep the sampled action inside the environment's action bounds.
        action_tf_var = tf.clip_by_value(
            action_tf_var, env.action_space.low[0],
            env.action_space.high[0])
    return action_tf_var, norm_dist

################################################################
#sample from state space for state normalization
import sklearn
import sklearn.preprocessing
                                    
# Draw random observations from the environment's observation space and
# fit a StandardScaler on them; the fitted scaler is what scale_state()
# uses to normalize states before they are fed to the networks.
state_space_samples = np.array(
    [env.observation_space.sample() for _ in range(10000)])
scaler = sklearn.preprocessing.StandardScaler().fit(state_space_samples)

#function to normalize states
def scale_state(state):
    """Normalize one environment state with the fitted module-level scaler.

    Args:
        state: a single state; requires input shape (2,).

    Returns:
        The scaled state wrapped as a one-row batch, shape (1, 2).
    """
    return scaler.transform([state])
###################################################################

# Learning rates for the two Adam optimizers below; the actor step size is
# much smaller than the critic's.
lr_actor = 0.00002  #set learning rates
lr_critic = 0.001

# define required placeholders
# action_placeholder: the action that was actually taken in the environment
# delta_placeholder:  the TD error used to weight the actor loss
# target_placeholder: the TD target the critic is regressed towards
action_placeholder = tf.placeholder(tf.float32)
delta_placeholder = tf.placeholder(tf.float32)
target_placeholder = tf.placeholder(tf.float32)

# Instantiate both networks on the shared state placeholder.
action_tf_var, norm_dist = policy_network(state_placeholder)
V = value_function(state_placeholder)

# define actor (policy) loss function
# Negative log-probability of the taken action under the current policy,
# weighted by the fed TD error; 1e-5 keeps the log argument above zero.
loss_actor = -tf.log(norm_dist.prob(action_placeholder) + 1e-5) * delta_placeholder
training_op_actor = tf.train.AdamOptimizer(
    lr_actor, name='actor_optimizer').minimize(loss_actor)

# define critic (state-value) loss function
# Mean squared difference between the (squeezed) value estimate and the
# fed TD target.
loss_critic = tf.reduce_mean(tf.squared_difference(
                             tf.squeeze(V), target_placeholder))
training_op_critic = tf.train.AdamOptimizer(
        lr_critic, name='critic_optimizer').minimize(loss_critic)
################################################################
#Training loop
# One-step actor-critic training: after every environment step the actor is
# updated with the TD error and the critic is regressed towards the TD target.
gamma = 0.99        #discount factor
num_episodes = 300

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    episode_history = []    # cumulative reward of every finished episode
    for episode in range(num_episodes):
        #receive initial state from E
        state = env.reset()   # state.shape -> (2,)
        reward_total = 0
        steps = 0
        done = False
        while not done:
            # Normalize the current state once per step; the same array is
            # reused for action sampling, the TD error and both updates.
            scaled_state = scale_state(state)

            #Sample action according to current policy
            #action.shape = (1,1)
            action = sess.run(action_tf_var, feed_dict={
                state_placeholder: scaled_state})
            #Execute action and observe reward & next state from E
            # next_state shape=(2,)
            #env.step() requires input shape = (1,)
            next_state, reward, done, _ = env.step(
                np.squeeze(action, axis=0))
            steps += 1
            reward_total += reward
            #V_of_next_state.shape=(1,1)
            V_of_next_state = sess.run(V, feed_dict={
                state_placeholder: scale_state(next_state)})
            #Set TD Target
            #target = r + gamma * V(next_state)
            # NOTE(review): the target bootstraps from V(next_state) even
            # when done is True; confirm whether terminal transitions
            # should use target = reward instead.
            target = reward + gamma * np.squeeze(V_of_next_state)

            # td_error = target - V(s)
            #needed to feed delta_placeholder in actor training
            td_error = target - np.squeeze(sess.run(V, feed_dict={
                state_placeholder: scaled_state}))

            #Update actor by minimizing loss (Actor training)
            _, loss_actor_val = sess.run(
                [training_op_actor, loss_actor],
                feed_dict={action_placeholder: np.squeeze(action),
                           state_placeholder: scaled_state,
                           delta_placeholder: td_error})
            #Update critic by minimizing loss (Critic training)
            _, loss_critic_val = sess.run(
                [training_op_critic, loss_critic],
                feed_dict={state_placeholder: scaled_state,
                           target_placeholder: target})

            state = next_state
            #end while
        episode_history.append(reward_total)
        print("Episode: {}, Number of Steps : {}, Cumulative reward: {:0.2f}".format(
            episode, steps, reward_total))

        # The average is taken over the last 100 episodes, so 100 finished
        # episodes are enough to evaluate the "solved" criterion.
        mean_last_100 = np.mean(episode_history[-100:])
        if len(episode_history) >= 100 and mean_last_100 > 90:
            print("****************Solved***************")
            print("Mean cumulative reward over 100 episodes:{:0.2f}" .format(
                mean_last_100))
            
            

Episode: 0, Number of Steps : 698, Cumulative reward: 71.48
Episode: 1, Number of Steps : 632, Cumulative reward: 73.92
Episode: 2, Number of Steps : 688, Cumulative reward: 70.95
Episode: 3, Number of Steps : 290, Cumulative reward: 88.38
Episode: 4, Number of Steps : 999, Cumulative reward: -42.01
Episode: 5, Number of Steps : 979, Cumulative reward: 58.99
Episode: 6, Number of Steps : 999, Cumulative reward: -39.12
Episode: 7, Number of Steps : 604, Cumulative reward: 79.44
Episode: 8, Number of Steps : 854, Cumulative reward: 65.54
Episode: 9, Number of Steps : 999, Cumulative reward: -36.22
Episode: 10, Number of Steps : 776, Cumulative reward: 71.42
Episode: 11, Number of Steps : 999, Cumulative reward: -38.01
Episode: 12, Number of Steps : 826, Cumulative reward: 69.56
Episode: 13, Number of Steps : 999, Cumulative reward: -35.81
Episode: 14, Number of Steps : 999, Cumulative reward: -38.05
Episode: 15, Number of Steps : 843, Cumulative reward: 70.65
Episode: 16, Number of Steps

Episode: 128, Number of Steps : 186, Cumulative reward: 91.21
****************Solved***************
Mean cumulative reward over 100 episodes:90.55
Episode: 129, Number of Steps : 121, Cumulative reward: 93.82
****************Solved***************
Mean cumulative reward over 100 episodes:90.64
Episode: 130, Number of Steps : 119, Cumulative reward: 94.29
****************Solved***************
Mean cumulative reward over 100 episodes:90.77
Episode: 131, Number of Steps : 189, Cumulative reward: 91.90
****************Solved***************
Mean cumulative reward over 100 episodes:90.82
Episode: 132, Number of Steps : 163, Cumulative reward: 90.12
****************Solved***************
Mean cumulative reward over 100 episodes:90.94
Episode: 133, Number of Steps : 182, Cumulative reward: 89.53
****************Solved***************
Mean cumulative reward over 100 episodes:90.93
Episode: 134, Number of Steps : 287, Cumulative reward: 88.27
****************Solved***************
Mean cumulative re

Episode: 185, Number of Steps : 237, Cumulative reward: 90.82
****************Solved***************
Mean cumulative reward over 100 episodes:91.73
Episode: 186, Number of Steps : 148, Cumulative reward: 93.54
****************Solved***************
Mean cumulative reward over 100 episodes:91.76
Episode: 187, Number of Steps : 149, Cumulative reward: 92.21
****************Solved***************
Mean cumulative reward over 100 episodes:91.77
Episode: 188, Number of Steps : 195, Cumulative reward: 91.74
****************Solved***************
Mean cumulative reward over 100 episodes:91.76
Episode: 189, Number of Steps : 98, Cumulative reward: 94.95
****************Solved***************
Mean cumulative reward over 100 episodes:91.82
Episode: 190, Number of Steps : 155, Cumulative reward: 91.74
****************Solved***************
Mean cumulative reward over 100 episodes:91.80
Episode: 191, Number of Steps : 214, Cumulative reward: 89.73
****************Solved***************
Mean cumulative rew

Episode: 241, Number of Steps : 134, Cumulative reward: 92.89
****************Solved***************
Mean cumulative reward over 100 episodes:92.36
Episode: 242, Number of Steps : 168, Cumulative reward: 93.51
****************Solved***************
Mean cumulative reward over 100 episodes:92.39
Episode: 243, Number of Steps : 117, Cumulative reward: 94.78
****************Solved***************
Mean cumulative reward over 100 episodes:92.44
Episode: 244, Number of Steps : 154, Cumulative reward: 93.75
****************Solved***************
Mean cumulative reward over 100 episodes:92.44
Episode: 245, Number of Steps : 184, Cumulative reward: 91.18
****************Solved***************
Mean cumulative reward over 100 episodes:92.47
Episode: 246, Number of Steps : 147, Cumulative reward: 91.68
****************Solved***************
Mean cumulative reward over 100 episodes:92.48
Episode: 247, Number of Steps : 140, Cumulative reward: 93.00
****************Solved***************
Mean cumulative re

Episode: 297, Number of Steps : 112, Cumulative reward: 94.08
****************Solved***************
Mean cumulative reward over 100 episodes:92.80
Episode: 298, Number of Steps : 112, Cumulative reward: 94.46
****************Solved***************
Mean cumulative reward over 100 episodes:92.81
Episode: 299, Number of Steps : 110, Cumulative reward: 94.04
****************Solved***************
Mean cumulative reward over 100 episodes:92.81
