# Vanilla (average based) On-Policy Monte Carlo Control

In [1]:
import gym 
import numpy as np 
import time

### Environment: OpenAI Taxi-v3

For the purposes of this assignment, we use Taxi-v3 from OpenAI's Gym repository. This environment, its states, actions and goals are detailed on the following web page: https://www.gymlibrary.dev/environments/toy_text/taxi/ 

In [2]:
env = gym.make("Taxi-v3", render_mode = "human")
env.reset()
env.render()

In [3]:
state_space = env.observation_space
action_space = env.action_space

print("We have {} action space and {} state space".format(action_space, state_space))

We have Discrete(6) action space and Discrete(500) state space


### On-policy Monte Carlo Control 

In [4]:
#This function returns an array of action probabilities for a given state (a polic) 
#this policy is designed to be epsilon-greedy in relation to the state action value function Q 
def policy_fn(Q, num_actions, e, state):
    action_probabilities = np.ones(num_actions) * (e/num_actions)
    highest_action_value = np.argmax(Q[state])
    action_probabilities[highest_action_value] += 1 - e
    
    return action_probabilities

In [5]:
num_episodes = 50
e = 0.5
gamma = 1

In [6]:
Q = np.ones((state_space.n, action_space.n))
returns = [[[]]*action_space.n for i in range(state_space.n)]
pi = np.zeros((state_space.n, action_space.n))

epochs_per_episode = []
for i in range(num_episodes):
    
    episode = []
    state = env.reset()
    cumulative_reward = 0 
    epoch = 0
    terminated = False 
    while not terminated: 
        
        epoch+=1
        print("Episode: {} Epoch: {}".format(i, epoch))
        if type(state)==tuple: 
            state = state[0]
        
        pi[state] = policy_fn(Q, action_space.n, e, state)
        action = np.random.choice(np.arange(action_space.n), p = pi[state])
        next_state, reward, terminated, truncated, step_dict = env.step(action)
        episode.append((state, action, reward))
        
        #print(episode[(state, action)])
        if terminated: 
            epochs_per_episode.append(epoch)
            break 
        state = next_state
        
    if i % 10: 
        e-=0.1    
    visited = []
    return_till_now = 0 
    returns = {}
    states_in_episode = []
    for k, (state, action, reward) in enumerate(episode): 
        if state not in states_in_episode: 
            states_in_episode.append(state)
        if (state, action) in visited: 
            continue
        else: 
            visited.append((state, action))
            G = sum([r*(gamma**j) for j, (s, a, r) in enumerate(episode)])
            if (state, action) in returns.keys(): 
                returns[(state, action)].append(G)
            else: 
                returns[(state, action)] = [G]
            Q[state][action] = sum(returns[(state, action)])/len(returns[(state, action)])
            
    for state in states_in_episode: 
        pi[state] = policy_fn(Q, action_space.n, e, state)

Episode: 0 Epoch: 1


  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0 Epoch: 2
Episode: 0 Epoch: 3
Episode: 0 Epoch: 4
Episode: 0 Epoch: 5
Episode: 0 Epoch: 6
Episode: 0 Epoch: 7
Episode: 0 Epoch: 8
Episode: 0 Epoch: 9
Episode: 0 Epoch: 10
Episode: 0 Epoch: 11


In [None]:
pi

array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.15, 0.15, 0.25, 0.15, 0.15, 0.15]])

In [None]:
Q

array([[ 1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,
         1.000e+00],
       [ 1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,
         1.000e+00],
       [ 1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,
         1.000e+00],
       ...,
       [ 1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,
         1.000e+00],
       [ 1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,  1.000e+00,
         1.000e+00],
       [-2.324e+03, -2.324e+03,  1.000e+00, -2.324e+03,  1.000e+00,
        -2.324e+03]])

## Visualizing the learned policy

In [None]:
state_start = env.reset()
i = 0
e = 0.03 #make policy less exploratory while testing 
for i in range(1000): 
    if type(state)==tuple: 
        state = state[0]
    
    env.render()
    #time.sleep(0.03)
    pi[state] = policy_fn(Q, action_space.n, e, state)
    action = np.random.choice(np.arange(action_space.n), p = pi[state])
    next_state, reward, terminated, truncated, step_dict = env.step(action)
    print("At step {}, reward = {} and termination: {}".format(i, reward, terminated))
    
    #print(episode[(state, action)])
    if terminated: 
        break 
    state = next_state

### Results 
Due to the extremely stochastic nature of the average based monte carlo algorithm during the first few episodes, runtime per episode was too high to keep running. However, this led to some learnings: As the size of the state and action spaces grows, the number of epochs required to reach a terminal state grows rapidly. Since this method learns offline (ie. not updating its values during interaction) the learning through the first few episodes is extremely slow. In the next notebook I will implement a temporal difference learning model that learns online and compare these two results. 