In [1]:
# Import Required Packages
import numpy as np
import gym
import matplotlib.pyplot as plt

In [2]:
# Create the MountainCar-v0 environment and reset it to an initial state
env = gym.make('MountainCar-v0')   # instantiate the environment
env.reset()  # reset returns the initial state, which is shown as this cell's output

array([-0.41522133,  0.        ], dtype=float32)

**state**: an array of the form `[position, velocity]`

**Initial state**:
* _velocity_ : The starting velocity of the car is always set to 0
* _position_ : The starting position of the car is drawn uniformly at random from [-0.6, -0.4]

In [3]:
# Hyperparameters for Q-learning (passed to QLearning when training is run)
epsilon = 0.8        # initial exploration rate: a random action is taken with this probability
discount = 0.9       # discount factor applied to the best next-state Q-value
learning_rate = 0.2  # step size that scales each Q-value update

In [4]:
# Initialize variables to track rewards (both are updated by track_rewards)
reward_list = []      # total reward of each episode in the current 100-episode window
ave_reward_list = []  # mean reward of every completed 100-episode window

In [5]:
# Determine the size of the discretized state space: the observation range is
# scaled by 10 (position) and 100 (velocity), rounded to integers, and one is
# added per dimension so the largest rounded index still fits in the table.
states = (
    np.round(
        (env.observation_space.high - env.observation_space.low) * np.array([10, 100]),
        0,
    ).astype(int)
    + 1
)
states

array([19, 15])

In [6]:
# Initialize the Q table with one uniformly random value in [-1, 1) for every
# (position bin, velocity bin, action) combination.
q_table = np.random.uniform(-1, 1, (*states, env.action_space.n))
q_table

array([[[ 0.01136081, -0.37160969, -0.52371139],
        [ 0.60761938, -0.84503824, -0.384975  ],
        [ 0.26284387, -0.62960674, -0.57927812],
        [ 0.43828555, -0.4547618 , -0.82263434],
        [ 0.70874527,  0.08789341, -0.99119476],
        [ 0.74765469, -0.40527107, -0.55156547],
        [ 0.75567794, -0.41518232, -0.89388357],
        [-0.91189531, -0.18573058,  0.38716615],
        [ 0.12588328, -0.40022499, -0.84063606],
        [ 0.84205399,  0.83313816,  0.1377025 ],
        [ 0.98253416, -0.06548899, -0.07837875],
        [-0.67758104,  0.53000298, -0.42262475],
        [ 0.16536157, -0.47355564,  0.3497478 ],
        [ 0.66450351,  0.74626508, -0.82946621],
        [ 0.17646908,  0.95122002, -0.79095855]],

       [[ 0.44083635, -0.73196522,  0.20144742],
        [-0.84700297,  0.87298492,  0.54195529],
        [-0.07821677, -0.01637786, -0.42044839],
        [ 0.24483821, -0.9009427 , -0.99833524],
        [ 0.79649867, -0.32444287, -0.74831354],
        [ 0.392786

In [7]:
def track_rewards(total_reward,i):
    """Record one episode's reward and report the average every 100 episodes.

    Appends ``total_reward`` to the global ``reward_list``. When ``i + 1`` is a
    multiple of 100, the mean of ``reward_list`` is appended to the global
    ``ave_reward_list``, ``reward_list`` is replaced by a new empty list, and
    the average is printed.

    Args:
        total_reward: Sum of rewards collected during the episode.
        i: Zero-based index of the episode that just finished.
    """
    global reward_list, ave_reward_list

    reward_list.append(total_reward)

    # Nothing more to do until a full window of 100 episodes is complete.
    if (i + 1) % 100 != 0:
        return

    ave_reward = np.mean(reward_list)
    ave_reward_list.append(ave_reward)
    reward_list = []  # start a fresh window
    print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

In [9]:
def QLearning(epsilon,discount,learning_rate,episodes):
    """Train the global ``q_table`` on the global ``env`` with tabular Q-learning.

    Each episode resets ``env``, then repeatedly picks an action with an
    epsilon-greedy rule, steps the environment, and updates ``q_table`` in
    place until the environment reports ``done``. After every episode epsilon
    is reduced by ``epsilon / episodes`` (computed from the initial epsilon)
    while it is still positive, and the episode's total reward is passed to
    ``track_rewards``.

    Args:
        epsilon: Initial probability of choosing a random action.
        discount: Factor applied to the best Q-value of the next state.
        learning_rate: Factor applied to each temporal-difference update.
        episodes: Number of episodes to run.

    Returns:
        None. Results are written to the global ``q_table`` and, through
        ``track_rewards``, to the global reward lists.
    """
    bin_scale = np.array([10, 100])
    eps_step = epsilon / episodes

    def to_bins(obs):
        # Shift by the lower observation bound, scale, and round to integer indices.
        return np.round((obs - env.observation_space.low) * bin_scale, 0).astype(int)

    for episode in range(episodes):
        episode_return = 0
        cell = to_bins(env.reset())  # discretized initial state
        finished = False

        while not finished:
            # Epsilon-greedy: exploit with probability 1 - epsilon, otherwise explore.
            if np.random.random() < 1 - epsilon:
                action = np.argmax(q_table[cell[0], cell[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Take the action; env.step is unpacked as (state, reward, done, info).
            obs, reward, finished, _ = env.step(action)
            next_cell = to_bins(obs)

            if finished and obs[0] >= 0.5:
                # Episode ended with position >= 0.5: store the reward directly,
                # without bootstrapping from the next state.
                q_table[cell[0], cell[1], action] = reward
            else:
                # Standard Q-learning step towards reward + discounted best next value.
                td_target = reward + discount * np.max(q_table[next_cell[0], next_cell[1]])
                td_error = td_target - q_table[cell[0], cell[1], action]
                q_table[cell[0], cell[1], action] += learning_rate * td_error

            episode_return += reward
            cell = next_cell

        # Decay epsilon
        if epsilon > 0:
            epsilon -= eps_step
        track_rewards(episode_return, episode)
            


In [11]:
def best_episode_render(policy):
    """Play one episode on the global ``env`` following ``policy``, rendering each step.

    Args:
        policy: Table indexed as ``policy[position_bin][velocity_bin]`` that
            gives the action to take in each discretized state.
    """
    obs = env.reset()
    finished = False
    while not finished:
        # Map the continuous observation onto its discretized grid cell.
        offset = obs - env.observation_space.low
        pos_bin, vel_bin = np.round(offset * np.array([10, 100]), 0).astype(int)
        action = policy[pos_bin][vel_bin]

        env.render()
        # Advance the environment one step; only the new observation and the
        # done flag are needed here.
        obs, _, finished, _ = env.step(action)
   

In [12]:
# Run the Q-learning algorithm for 5000 episodes with the hyperparameters defined earlier
QLearning(epsilon=epsilon, discount=discount, learning_rate=learning_rate, episodes=5000)

Episode 100 Average Reward: -200.0
Episode 200 Average Reward: -200.0
Episode 300 Average Reward: -200.0
Episode 400 Average Reward: -200.0
Episode 500 Average Reward: -200.0
Episode 600 Average Reward: -200.0
Episode 700 Average Reward: -200.0
Episode 800 Average Reward: -200.0
Episode 900 Average Reward: -200.0
Episode 1000 Average Reward: -200.0
Episode 1100 Average Reward: -200.0
Episode 1200 Average Reward: -200.0
Episode 1300 Average Reward: -200.0
Episode 1400 Average Reward: -200.0
Episode 1500 Average Reward: -200.0
Episode 1600 Average Reward: -200.0
Episode 1700 Average Reward: -200.0
Episode 1800 Average Reward: -200.0
Episode 1900 Average Reward: -200.0
Episode 2000 Average Reward: -200.0
Episode 2100 Average Reward: -200.0
Episode 2200 Average Reward: -200.0
Episode 2300 Average Reward: -200.0
Episode 2400 Average Reward: -200.0
Episode 2500 Average Reward: -200.0
Episode 2600 Average Reward: -200.0
Episode 2700 Average Reward: -200.0
Episode 2800 Average Reward: -199.74


In [16]:
# Derive the greedy policy (highest-valued action for every discretized state)
# from the Q table, replay one rendered episode with it, then display the policy.
policy = q_table.argmax(axis=2)
best_episode_render(policy)
policy

array([[0, 0, 1, 0, 1, 2, 2, 0, 2, 1, 0, 1, 2, 1, 1],
       [0, 0, 1, 1, 0, 1, 0, 2, 2, 2, 2, 0, 2, 0, 2],
       [2, 0, 0, 2, 1, 2, 0, 2, 2, 2, 2, 2, 0, 2, 1],
       [0, 0, 0, 1, 0, 0, 1, 2, 2, 2, 0, 2, 2, 2, 2],
       [2, 1, 0, 1, 0, 2, 2, 2, 2, 2, 1, 2, 0, 0, 2],
       [2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 2, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2],
       [2, 2, 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 2, 2, 0],
       [0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 2, 1, 2, 1],
       [1, 2, 1, 0, 2, 0, 2, 1, 1, 2, 2, 0, 1, 1, 1],
       [2, 2, 1, 0, 0, 0, 1, 0, 1, 2, 2, 2, 1, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 0, 0, 2],
       [1, 1, 0, 1, 2, 0, 0, 1, 1, 2, 2, 2, 0, 2, 1],
       [1, 0, 2, 2, 1, 0, 2, 2, 2, 2, 2, 1, 2, 0, 1],
       [2, 2, 1, 0, 1, 0, 0, 0, 2, 2, 2, 1, 2, 0, 2],
       [1, 1, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 0, 2, 1],
       [1, 1, 2, 0, 1, 1, 0, 2, 2, 1, 0, 0, 1, 2, 2],
       [0, 2, 2, 1, 1, 0, 0,

In [None]:
# Close the environment once rendering is finished so that resources it
# acquired (such as any window opened by env.render()) are released.
env.close()

In [None]:
# Plot the average reward of each 100-episode window against the episode count,
# save the figure to disk, and display it.
fig, ax = plt.subplots()
episode_marks = np.arange(1, len(ave_reward_list) + 1) * 100
ax.plot(episode_marks, ave_reward_list)
ax.set_xlabel('Episodes')
ax.set_ylabel('Average Reward')
ax.set_title('Average Reward vs Episodes')
fig.savefig('Average rewards.jpg')
plt.show()
plt.close()