# Monte Carlo Methods for prediction and control

In [1]:
import gym 
import numpy as np 

### Environment: OpenAI Taxi-v3

For the purposes of this assignment, we use Taxi-v3 from OpenAI's Gym repository. This environment, its states, actions and goals are detailed on the following web page: https://www.gymlibrary.dev/environments/toy_text/taxi/ 

In [2]:
env = gym.make("Taxi-v2", render_mode="rgb_array")
env.reset()
env.render()
state_space = env.observation_space
action_space = env.action_space

print("We have {} action space and {} state space".format(action_space, state_space))

We have Discrete(6) action space and Discrete(500) state space


### On-policy Monte Carlo Control 

In [3]:
#This function returns an array of action probabilities for a given state (a polic) 
#this policy is designed to be epsilon-greedy in relation to the state action value function Q 
def policy_fn(Q, num_actions, e, state):
    action_probabilities = np.ones(num_actions) * (e/num_actions)
    highest_action_value = np.argmax(Q[state])
    action_probabilities[highest_action_value] += 1 - e
    
    return action_probabilities

In [4]:
num_episodes = 100
e = 0.1
gamma = 0.2

In [5]:
Q = np.zeros((state_space.n, action_space.n))
returns = [[[]]*action_space.n for i in range(state_space.n)]
pi = np.zeros((state_space.n, action_space.n))

for i in range(1, num_episodes+1):
    
    episode = []
    state = env.reset()
    cumulative_reward = 0 
    
    for time in range(1000): 
        
        if type(state)==tuple: 
            state = state[0]
        
        pi[state] = policy_fn(Q, action_space.n, e, state)
        action = np.random.choice(np.arange(action_space.n), p = pi[state])
        next_state, reward, terminated, truncated, step_dict = env.step(action)
        #env.render()
        episode.append((state, action, reward))
        
        #print(episode[(state, action)])
        if terminated: 
            break 
        state = next_state
        
        
    visited = []
    return_till_now = 0 
    returns = {}
    states_in_episode = []
    for i, (state, action, reward) in enumerate(episode): 
        if state not in states_in_episode: 
            states_in_episode.append(state)
        if (state, action) in visited: 
            continue
        else: 
            visited.append((state, action))
            G = sum([r*(gamma**j) for j, (s, a, r) in enumerate(episode)])
            if (state, action) in returns.keys(): 
                returns[(state, action)].append(G)
            else: 
                returns[(state, action)] = [G]
            Q[state][action] = sum(returns[(state, action)])/len(returns[(state, action)])
            
    for state in states_in_episode: 
        pi[state] = policy_fn(Q, action_space.n, e, state)

  if not isinstance(terminated, (bool, np.bool8)):


In [6]:
print(pi)

[[0.         0.         0.         0.         0.         0.        ]
 [0.01666667 0.01666667 0.91666667 0.01666667 0.01666667 0.01666667]
 [0.01666667 0.91666667 0.01666667 0.01666667 0.01666667 0.01666667]
 ...
 [0.01666667 0.91666667 0.01666667 0.01666667 0.01666667 0.01666667]
 [0.01666667 0.01666667 0.01666667 0.01666667 0.91666667 0.01666667]
 [0.         0.         0.         0.         0.         0.        ]]


In [7]:
Q

array([[  0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
          0.      ],
       [ -1.25288 ,  -1.25288 ,   0.      ,   0.      ,  -1.25288 ,
         -1.25288 ],
       [ -1.250576,   0.      ,   0.      ,   0.      ,   0.      ,
          0.      ],
       ...,
       [ -1.25    ,   0.      ,   0.      ,   0.      ,  -1.25    ,
         -1.25    ],
       [-12.428   , -12.428   , -12.428   , -12.428   ,  -1.25    ,
        -12.428   ],
       [  0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
          0.      ]])

## Visualizing the learned policy

In [8]:
state_start = env.reset()
for time in range(1000): 
    
    if type(state)==tuple: 
        state = state[0]
    
    env.render()
    pi[state] = policy_fn(Q, action_space.n, e, state)
    action = np.random.choice(np.arange(action_space.n), p = pi[state])
    next_state, reward, terminated, truncated, step_dict = env.step(action)
    
    
    #print(episode[(state, action)])
    if terminated: 
        break 
    state = next_state