# Monte Carlo Methods for prediction and control

In [55]:
import gym 
import numpy as np 
import time

### Environment: OpenAI Taxi-v3

For the purposes of this assignment, we use Taxi-v3 from OpenAI's Gym repository. This environment, its states, actions and goals are detailed on the following web page: https://www.gymlibrary.dev/environments/toy_text/taxi/ 

In [56]:
# Create the Taxi-v3 environment that every later cell interacts with.
# NOTE(review): render_mode="human" presumably draws a window on every step,
# which would make the training cell slow -- confirm against the Gym docs and
# consider a separate non-rendering environment for training.
env = gym.make("Taxi-v3", render_mode = "human")

In [57]:
# Keep handles on the environment's action/observation space descriptors;
# later cells read their `.n` attribute to size the Q and policy tables.
action_space = env.action_space
state_space = env.observation_space

print(
    "We have {} action space and {} state space".format(action_space, state_space)
)

We have Discrete(6) action space and Discrete(500) state space


### On-policy Monte Carlo Control 

In [58]:
def policy_fn(Q, num_actions, e, state):
    """Return epsilon-greedy action probabilities for a single state.

    Parameters
    ----------
    Q : indexable by state, each entry holding one value per action
        State-action value estimates; ``Q[state]`` is passed to ``np.argmax``.
    num_actions : int
        Number of available actions (length of the returned array).
    e : float
        Exploration rate epsilon. Each action receives ``e / num_actions``
        probability; the greedy action receives an additional ``1 - e``.
    state : int
        Index of the state whose action distribution is requested.

    Returns
    -------
    numpy.ndarray
        Array of length ``num_actions`` with the action probabilities.
        Ties between equally valued actions go to the lowest action index,
        because ``np.argmax`` returns the first maximum.
    """
    greedy_action = np.argmax(Q[state])

    # Start from the uniform exploration mass, then give the remaining
    # probability to the currently best-valued action.
    probs = np.full(num_actions, e / num_actions)
    probs[greedy_action] += 1 - e
    return probs

In [59]:
# Hyperparameters read by the training and visualisation cells.
gamma = 0.3         # discount factor applied to rewards when computing returns
e = 0.5             # epsilon: exploration rate passed to policy_fn
num_episodes = 100  # number of episodes generated during training

In [None]:
# On-policy first-visit Monte Carlo control with an epsilon-greedy policy.
#
# Q       : state-action value estimates, shape (n_states, n_actions)
# pi      : action probabilities per state, shape (n_states, n_actions);
#           rows of states that were never visited stay all-zero
# returns : (state, action) -> list of first-visit returns observed so far.
#           It must persist across episodes so that Q is the average over
#           every episode, not just the most recent one.
Q = np.ones((state_space.n, action_space.n))
pi = np.zeros((state_space.n, action_space.n))
returns = {}

for episode_idx in range(1, num_episodes + 1):

    # ---- 1. Generate one episode with the current epsilon-greedy policy ----
    episode = []
    reset_result = env.reset()
    # Newer Gym versions return (observation, info); older ones return only
    # the observation.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    for _step in range(10000):  # hard cap as a safety net
        pi[state] = policy_fn(Q, action_space.n, e, state)
        action = np.random.choice(np.arange(action_space.n), p=pi[state])
        next_state, reward, terminated, truncated, step_dict = env.step(action)
        episode.append((state, action, reward))

        # Stop on a real terminal state *or* when the environment's time
        # limit cuts the episode short; stepping past either is invalid.
        if terminated or truncated:
            break
        state = next_state

    # ---- 2. Discounted return following each time step ----
    # Walk the episode backwards so that G_t = r_t + gamma * G_{t+1}.
    returns_from_step = [0.0] * len(episode)
    G = 0.0
    for t in range(len(episode) - 1, -1, -1):
        G = episode[t][2] + gamma * G
        returns_from_step[t] = G

    # ---- 3. First-visit policy evaluation ----
    visited = set()
    for t, (state, action, reward) in enumerate(episode):
        if (state, action) in visited:
            continue  # only the first occurrence in the episode counts
        visited.add((state, action))
        returns.setdefault((state, action), []).append(returns_from_step[t])
        Q[state][action] = np.mean(returns[(state, action)])

    # ---- 4. Policy improvement for every state seen in this episode ----
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for state in dict.fromkeys(s for s, _a, _r in episode):
        pi[state] = policy_fn(Q, action_space.n, e, state)

In [None]:
# Show the policy table: one row per state, one column per action.
# Rows are all-zero for states that were never written during training,
# since pi starts as zeros and is only filled in for visited states.
# NOTE(review): the stored output below shows 0.9167 / 0.0167, which matches
# e = 0.1 rather than the e = 0.5 set earlier -- it may be stale; re-run to confirm.
pi

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       ...,
       [0.91666667, 0.01666667, 0.01666667, 0.01666667, 0.01666667,
        0.01666667],
       [0.01666667, 0.01666667, 0.91666667, 0.01666667, 0.01666667,
        0.01666667],
       [0.91666667, 0.01666667, 0.01666667, 0.01666667, 0.01666667,
        0.01666667]])

In [None]:
# Show the state-action value table: one row per state, one column per action.
# NOTE(review): the stored output below shows 0.0 for some rows although the
# training cell initialises Q with ones -- the output may be stale; re-run to confirm.
Q

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       ...,
       [ -1.25      ,  -1.25      ,  -1.25      ,  -1.25      ,
         -1.25      , -10.25      ],
       [ -1.25000018,  -3.05000004,  -1.25      ,  -1.25000018,
         -1.25000018,  -1.25      ],
       [ -1.25      ,  -1.25      ,  -1.25      ,  -1.25      ,
         -1.25      ,  -1.25      ]])

## Visualizing the learned policy

In [None]:
# Roll out one episode with the learned epsilon-greedy policy and render it.
state_start = env.reset()
# Start from the freshly reset environment's observation. Newer Gym versions
# return (observation, info); older ones return only the observation.
state = state_start[0] if isinstance(state_start, tuple) else state_start

for step in range(1000):
    env.render()
    #time.sleep(0.03)

    # Recompute the row on the fly: states never seen in training still have
    # an all-zero row in pi, which is not a valid probability distribution.
    pi[state] = policy_fn(Q, action_space.n, e, state)
    action = np.random.choice(np.arange(action_space.n), p=pi[state])
    next_state, reward, terminated, truncated, step_dict = env.step(action)
    print("At step {}, reward = {} and termination: {}".format(step, reward, terminated))

    # Stop on a terminal state or when the environment's time limit is hit.
    if terminated or truncated:
        break
    state = next_state

At step 0, reward = -1 and termination: False
At step 1, reward = -1 and termination: False
At step 2, reward = -1 and termination: False
At step 3, reward = -1 and termination: False
At step 4, reward = -1 and termination: False
At step 5, reward = -1 and termination: False
At step 6, reward = -1 and termination: False
At step 7, reward = -1 and termination: False
At step 8, reward = -1 and termination: False
At step 9, reward = -1 and termination: False
At step 10, reward = -1 and termination: False
At step 11, reward = -1 and termination: False
At step 12, reward = -10 and termination: False
At step 13, reward = -10 and termination: False
At step 14, reward = -10 and termination: False
At step 15, reward = -10 and termination: False
At step 16, reward = -10 and termination: False
At step 17, reward = -10 and termination: False
At step 18, reward = -10 and termination: False
At step 19, reward = -10 and termination: False
At step 20, reward = -10 and termination: False
At step 21, re

: 

### Notes 
The findings here are fairly simple: the epsilon-greedy policy learned by the on-policy MC control algorithm is not good enough, as it fails to get us into a terminal state. This could be because of the size of the state-action space and the difficulty of learning the objective. During training, we stop iterating after a fixed number of steps. This means we rarely, if ever, encounter a terminal state, making it nearly impossible for the policy to pick up on it. Temporal Difference learning should be somewhat better at learning from incomplete episodes.