In [None]:
# Import Required Packages
import numpy as np
import gym
import matplotlib.pyplot as plt

In [None]:
# Import and initialize Mountain Car Environment
# `env` is a module-level object; QLearning and best_episode_render read it as a global.
env = gym.make('MountainCar-v0')   # Instantiate the 'MountainCar-v0' environment
env.reset() # Reset the environment to an initial state (also the cell's last expression)

**state**: an array of the form `[position, velocity]`

**Initial state**:
* _velocity_ : The starting velocity of the car is always 0
* _position_ : The starting position of the car is drawn uniformly at random from [-0.6, -0.4]

In [None]:
# Q-learning hyperparameters
episodes = 5000  # how many training episodes to run
# initial exploration rate, discount factor, and update step size
epsilon, discount, learning_rate = 0.8, 0.9, 0.2

# Per-episode amount subtracted from epsilon (linear decay to zero over all episodes)
reduction = epsilon / episodes


In [None]:
# Reward bookkeeping: rewards of the current window, every recorded reward,
# and the per-window averages (three separate, initially empty lists)
reward_list, all_reward_list, ave_reward_list = [], [], []

In [None]:
# Number of discrete bins per observation dimension: the observation range is
# scaled by 10 (first dimension) and 100 (second dimension), rounded, plus one
# so that both end points get their own bin.
states = np.round(
    (env.observation_space.high - env.observation_space.low) * np.array([10, 100]),
    0,
).astype(int) + 1

In [None]:
# Q table with one entry per (first-dimension bin, second-dimension bin, action),
# filled with uniform random values in [-1, 1)
q_table = np.random.uniform(-1, 1, (states[0], states[1], env.action_space.n))

In [None]:
def track_rewards(total_reward,i,reward_list,all_reward_list,ave_reward_list):
    """Record one episode's total reward and summarise every 100 episodes.

    Args:
        total_reward: Total reward collected in the episode that just ended.
        i: Zero-based index of that episode.
        reward_list: Rewards of the current 100-episode window; appended to on
            every call and emptied in place whenever a window completes.
        all_reward_list: Receives the rewards of each completed window.
        ave_reward_list: Receives the mean reward of each completed window.

    Returns:
        None. The three lists are mutated in place, and the window average is
        printed each time ``(i + 1)`` is a multiple of 100.
    """
    reward_list.append(total_reward)
    if (i + 1) % 100 == 0:
        ave_reward = np.mean(reward_list)
        ave_reward_list.append(ave_reward)
        all_reward_list.extend(reward_list)
        # Empty the caller's list in place. Rebinding the local name
        # (``reward_list = []``) would leave the caller's list untouched, so
        # later averages would cover every episode so far and earlier rewards
        # would be copied into all_reward_list again.
        reward_list.clear()
        print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

In [None]:
def QLearning(epsilon,discount,learning_rate):
    """Train the module-level ``q_table`` on ``env`` with tabular Q-learning.

    Runs ``episodes`` (module-level) episodes. Actions are picked
    epsilon-greedily, and epsilon is reduced by a fixed step after every
    episode so that it reaches zero after the last one. Each episode's total
    reward is passed to ``track_rewards`` together with the module-level
    reward lists.

    Args:
        epsilon: Initial probability of taking a random (exploratory) action.
        discount: Factor applied to the best Q value of the next state.
        learning_rate: Step size of each Q value update.

    Returns:
        None. ``q_table`` and the reward lists are updated in place.

    Note:
        The code uses ``env.reset()`` as returning the observation and
        ``env.step()`` as returning a 4-tuple ``(obs, reward, done, info)``.
    """
    obs_low = env.observation_space.low
    bin_scale = np.array([10, 100])

    def discretize(observation):
        """Map a continuous observation to integer indices into q_table."""
        return np.round((observation - obs_low) * bin_scale, 0).astype(int)

    # Derive the decay step from the epsilon that was actually passed in, so a
    # call with a different starting value still decays to zero over the run.
    decay_step = epsilon / episodes if episodes else 0.0

    for i in range(episodes):
        done = False  # episode is not over yet
        total_reward = 0
        state_adj = discretize(env.reset())

        while not done:
            # Epsilon-greedy: exploit with probability 1 - epsilon, else explore
            if np.random.random() < 1 - epsilon:
                action = np.argmax(q_table[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Advance the environment and discretize the resulting state
            state2, reward, done, info = env.step(action)
            state2_adj = discretize(state2)

            if done and state2[0] >= 0.5:
                # Episode ended with position >= 0.5: there is no next state
                # to bootstrap from, so store the reward itself.
                q_table[state_adj[0], state_adj[1], action] = reward
            else:
                # Standard Q-learning update towards reward + discounted best next value
                delta = learning_rate * (
                    reward
                    + discount * np.max(q_table[state2_adj[0], state2_adj[1]])
                    - q_table[state_adj[0], state_adj[1], action]
                )
                q_table[state_adj[0], state_adj[1], action] += delta

            total_reward += reward
            state_adj = state2_adj

        # Decay epsilon, clamping at zero so float error cannot push it negative
        if epsilon > 0:
            epsilon = max(epsilon - decay_step, 0.0)

        track_rewards(total_reward,i,reward_list,all_reward_list,ave_reward_list)

In [None]:
def best_episode_render(policy):
    """Play a single rendered episode, taking every action from ``policy``.

    Args:
        policy: Lookup indexed as ``policy[first_bin][second_bin]`` that yields
            the action for a discretized observation.

    Returns:
        None. The module-level ``env`` is reset, rendered before each step,
        and stepped until it reports ``done``.
    """
    bin_scale = np.array([10, 100])
    observation = env.reset()
    done = False
    while not done:
        # Discretize the current observation into table indices
        shifted = observation - env.observation_space.low
        bins = np.round(shifted * bin_scale, 0).astype(int)
        action = policy[bins[0]][bins[1]]

        # Draw the current frame, then advance the environment by one step
        env.render()
        observation, _reward, done, _ = env.step(action)
   

In [None]:
# Train the agent: run the Q-learning loop with the configured hyperparameters
QLearning(epsilon=epsilon, discount=discount, learning_rate=learning_rate)

In [None]:
# Greedy policy: highest-valued action for every discretized state, then replay it on screen
policy = q_table.argmax(axis=2)
best_episode_render(policy)

In [None]:
# Close the environment now that training and rendering are finished
env.close()

In [None]:
# Plot the per-window average rewards against the episode count
# (one point per 100 episodes: x = 100, 200, ...)
plt.plot(100 * np.arange(1, len(ave_reward_list) + 1), ave_reward_list)
plt.title('Average Reward vs Episodes')
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.savefig('Average rewards.jpg')
plt.show()
plt.close()