# Monte Carlo Methods for prediction and control

In [77]:
import gym 
import numpy as np 

### Environment: OpenAI Taxi-v3

For the purposes of this assignment, we use Taxi-v3 from OpenAI's Gym repository. This environment, its states, actions and goals are detailed on the following web page: https://www.gymlibrary.dev/environments/toy_text/taxi/ 

In [78]:
env = gym.make('Taxi-v3')
state_space = env.observation_space
action_space = env.action_space

print("We have {} action space and {} state space".format(action_space, state_space))

We have Discrete(6) action space and Discrete(500) state space


### On-policy Monte Carlo Control 

In [79]:
#we select pi to be some arbitrary e soft stochastic policy 
def policy_fn(Q, num_actions, e, state):
    if type(state)==tuple: 
        state = state[0]
    action_probabilities = np.ones(num_actions) * (e/num_actions)
    highest_action_value = np.argmax(Q[state])
    action_probabilities[highest_action_value] += 1 - e
    
    return action_probabilities

In [80]:
num_episodes = 1000
e = 0.01

In [101]:
Q = np.zeros((state_space.n, action_space.n))
returns = [[[]]*action_space.n for i in range(state_space.n)]

for i in range(1, num_episodes+1):
    
    episode = {}
    state = env.reset()
    cumulative_reward = 0 
    
    for time in range(100): 
        
        pi = policy_fn(Q, action_space.n, e, state)
        action = np.random.choice(np.arange(action_space.n), p = pi)
        next_state, reward, terminated, truncated, step_dict = env.step(action)
        cumulative_reward += reward
        
        if type(state)==tuple: 
            state = state[0]
        
        if (state, action) in episode.keys(): 
            episode[(state, action)].append(reward)
        else: 
            episode[(state, action)] = [reward]
        
        #print(episode[(state, action)])
        if terminated: 
            break 
        state = next_state
        
    visited = []
    return_till_now = 0 
    for i, (state, action) in enumerate(episode): 
        return_till_now += episode[(state, action)][0]
        print("{}, {}".format(return_till_now, episode[(state, action)][0]))
        episode[(state, action)].pop(0)
        if (state, action) in visited: 
            continue
    break
            
            
            
    

-1, -1
-2, -1
-3, -1
-4, -1
