Monte Carlo methods are ways of solving the reinforcement learning problem based on averaging sample returns.  We assume experience is divided into episodes, and that all episodes eventually terminate no matter what actions are selected. Only on the completion of an episode are value estimates and policies changed. Monte Carlo methods can thus be incremental in an episode-by-episode sense, but not in a step-by-step (online) sense. The term “Monte Carlo” is often used more broadly for any estimation method whose operation involves a significant random component.

![Monte Carlo Prediction](images/pi_30k.webp)

![Monte Carlo Prediction](images/mc-pred.png)

![Cycle](images/mcc-cycle.png)

![MCC](images/mcc-alg.png)

In [27]:
import itertools

from collections import deque

import random
import gym
import torch
from torch.utils.data import DataLoader
import numpy as np

from AI_agents.Environments.gym_problem import GymProblem
from AI_agents.Search.best_first_search import a_star

from IL.dataset import ImitationLearningDataset
from IL.evaluation import evaluate_policy
from IL.ipython_vis import animate_policy
from IL.model import MLP
from IL.training import train_torch_classifier_sgd
import AI_agents.Search.utils as utils


# Initialize the Taxi-v3 environment. Taking `.env` unwraps the outermost
# wrapper returned by gym.make (presumably the TimeLimit wrapper, so gym itself
# no longer truncates episodes -- TODO confirm for the installed gym version).
env = gym.make("Taxi-v3").env
env.reset()

PASSENGER_IN_TAXI = 4  # passenger idx when in taxi
locs = env.unwrapped.locs  # environment locations

# Seed value handed to env.seed() and evaluate_policy() further down.
seed = 42

In [28]:
??env

Type:            OrderEnforcing
String form:     <OrderEnforcing<TaxiEnv<Taxi-v3>>>
File:            c:\users\shyur\anaconda3\envs\fstma-tut03\lib\site-packages\gym\wrappers\order_enforcing.py
Source:         
class OrderEnforcing(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self._has_reset = False

    def step(self, action):
        assert self._has_reset, "Cannot call env.step() before calling reset()"
[0m        

In [29]:
class TaxiMonteCarloPolicy:
    """Behaviour policy that returns a randomly chosen applicable action."""

    def __init__(self):
        # Plan container; __call__ never reads or writes it, but the attribute
        # is kept so the object's shape stays the same.
        self.cur_plan = deque()

    def __call__(self, obs):
        """Pick one of the actions applicable in the state encoded by `obs`.

        A fresh GymProblem is built around the global `env` on every call and
        asked which actions apply to a root node wrapping `obs`; one of them
        is then drawn with `random.choice`.
        """
        problem = GymProblem(env, env.unwrapped.s)
        root = utils.Node(utils.State(obs, False), None, None, 0)
        candidates = list(problem.get_applicable_actions(root))
        return random.choice(candidates)
    
# Random behaviour policy instance: used below to collect trajectories and as
# the evaluation baseline.
helicopter_policy = TaxiMonteCarloPolicy()

In [30]:
# This code will run forever until it is interrupted
#animate_policy(env, helicopter_policy)

In [31]:
# trajectory struct
class Trajectory:
    """One episode stored as three parallel lists.

    `observations[i]` is the observation the agent acted on at step i,
    `actions[i]` the action it took and `rewards[i]` the reward received
    for that action.

    Args:
        observations: existing list of observations to wrap, or None to
            start with an empty one.
        actions: existing list of actions, or None for an empty one.
        rewards: existing list of rewards, or None for an empty one.
    """

    def __init__(self, observations=None, actions=None, rewards=None):
        # Compare against None explicitly: `x or []` would silently swap a
        # caller-supplied empty list for a fresh one (so later add_step calls
        # would not be visible through the caller's list) and would raise on
        # array-like arguments whose truth value is ambiguous.
        self.observations = [] if observations is None else observations
        self.actions = [] if actions is None else actions
        self.rewards = [] if rewards is None else rewards

    def add_step(self, observation, action, reward):
        """Append one (observation, action, reward) step to the episode."""
        self.observations.append(observation)
        self.actions.append(action)
        self.rewards.append(reward)

    def __str__(self):
        # Only (observation, action) pairs are shown; rewards are left out.
        return 'trajectory: ' + str(list(zip(self.observations, self.actions)))

    def __repr__(self):
        return str(self)

In [32]:
def get_trajectory(policy, max_trajectory_length=float('inf')):
    """Roll out `policy` in the global `env` and record the episode.

    The environment is reset, then `policy(obs)` is queried and applied step
    by step. Each recorded step pairs the observation the policy acted on
    with the chosen action and the reward returned by `env.step`.

    Args:
        policy: callable mapping an observation to an action.
        max_trajectory_length: stop after this many steps even if the
            episode has not finished (useful for policies that may never
            terminate). At least one step is always taken, because the limit
            is only checked after stepping.

    Returns:
        The Trajectory holding the recorded steps.
    """
    episode = Trajectory()
    current_obs = env.reset()
    steps_taken = 0

    while True:
        chosen = policy(current_obs)
        next_obs, step_reward, finished, _ = env.step(chosen)
        # Store the observation the decision was based on, not the successor.
        episode.add_step(current_obs, chosen, step_reward)
        current_obs = next_obs
        steps_taken += 1

        if finished or steps_taken >= max_trajectory_length:
            return episode

# Sample a single episode with the random policy; the bare name on the last
# line makes the notebook display it.
trajectory = get_trajectory(helicopter_policy)
trajectory

trajectory: [(274, 4), (274, 5), (274, 4), (274, 5), (274, 1), (174, 2), (194, 4), (194, 3), (174, 2), (194, 4), (194, 3), (174, 2), (194, 2), (194, 5), (194, 0), (294, 3), (274, 3), (254, 2), (274, 5), (274, 4), (274, 0), (374, 5), (374, 2), (394, 5), (394, 5), (394, 1), (294, 2), (294, 1), (194, 0), (294, 2), (294, 3), (274, 2), (294, 1), (194, 4), (194, 4), (194, 5), (194, 0), (294, 1), (194, 1), (94, 0), (194, 0), (294, 0), (394, 4), (394, 5), (394, 4), (394, 0), (494, 3), (474, 3), (474, 3), (474, 1), (374, 5), (374, 4), (374, 3), (374, 3), (374, 1), (274, 2), (294, 2), (294, 5), (294, 2), (294, 3), (274, 3), (254, 1), (154, 2), (174, 0), (274, 1), (174, 0), (274, 3), (254, 3), (234, 2), (254, 5), (254, 1), (154, 0), (254, 1), (154, 4), (154, 5), (154, 4), (154, 3), (154, 1), (54, 1), (54, 3), (54, 1), (54, 5), (54, 2), (74, 0), (174, 4), (174, 3), (154, 3), (154, 1), (54, 4), (54, 2), (74, 0), (174, 4), (174, 4), (174, 3), (154, 5), (154, 1), (54, 3), (54, 5), (54, 4), (54, 4), (

In [33]:
def collect_data(policy, num_trajectories, max_trajectory_length=float('inf')):
    """Collect `num_trajectories` episodes by repeatedly rolling out `policy`.

    Args:
        policy: callable mapping an observation to an action.
        num_trajectories: number of episodes to sample.
        max_trajectory_length: per-episode step limit, forwarded unchanged to
            get_trajectory.

    Returns:
        A list with one trajectory per sampled episode, in sampling order.
    """
    return [
        get_trajectory(policy, max_trajectory_length)
        for _ in range(num_trajectories)
    ]

# get the same trajectories every time!
# Seeding the environment alone is not enough: the behaviour policy draws its
# actions with random.choice, so the `random` module must be seeded as well
# for the collected data to be reproducible.
env.seed(seed)
random.seed(seed)

raw_data = collect_data(helicopter_policy, num_trajectories=1000)

In [34]:
from collections import defaultdict

def build_decision_dict(raw_data):
    """Derive a greedy state -> action table from sampled trajectories.

    For every (state, action) occurrence in every trajectory the undiscounted
    return from that step to the end of the episode is recorded (every-visit
    Monte Carlo). The returns are averaged per (state, action), and each state
    is mapped to the action with the highest mean return. Ties go to the
    action that was recorded first for that state.

    Args:
        raw_data: iterable of trajectories exposing parallel `observations`,
            `actions` and `rewards` sequences.

    Returns:
        A plain dict mapping each visited state to its best action. States
        that never appear in `raw_data` are absent, so looking one up raises
        KeyError instead of silently yielding an empty container.
    """
    # returns_by_pair[state][action] -> list of sampled returns
    returns_by_pair = defaultdict(lambda: defaultdict(list))
    for trajectory in raw_data:
        steps = list(zip(trajectory.observations, trajectory.actions, trajectory.rewards))
        return_to_go = 0
        # Walk the episode backwards so the running sum is the return that
        # follows each step.
        for state, action, reward in reversed(steps):
            return_to_go += reward
            returns_by_pair[state][action].append(return_to_go)

    # Build the result in a separate plain dict rather than overwriting the
    # accumulator in place while iterating over it.
    best_action = {}
    for state, returns_by_action in returns_by_pair.items():
        mean_return = {
            action: np.mean(samples)
            for action, samples in returns_by_action.items()
        }
        best_action[state] = max(mean_return, key=mean_return.get)
    return best_action
    

In [35]:
class MCCPolicy:
    """Deterministic policy backed by a state -> action lookup table."""

    def __init__(self, state_action_map):
        # Mapping from observation to the action to take in it.
        self.state_action_map = state_action_map

    def __call__(self, obs):
        """Return the action stored for `obs`; the observation is used as-is."""
        table = self.state_action_map
        return table[obs]

# Create the Monte Carlo control policy: a lookup table built from the
# trajectories collected with the random policy.
policy = MCCPolicy(build_decision_dict(raw_data))

In [9]:
# Baseline: evaluate the random behaviour policy itself over 10000 episodes.
total_reward, mean_reward = evaluate_policy(env, helicopter_policy, num_episodes=10000, seed=seed)
print('Monte Carlo Policy')
print('---------')
print(f'total reward over all episodes: {total_reward}')
print(f'mean reward per episode:        {mean_reward}')

  0%|          | 0/10000 [00:00<?, ?it/s]

Monte Carlo Policy
---------
total reward over all episodes: -983647
mean reward per episode:        -98.3647


In [10]:
# Evaluate the control policy derived from the 1000 random-policy trajectories,
# using the same episode count and seed as the baseline.
total_reward, mean_reward = evaluate_policy(env, policy, num_episodes=10000, seed=seed)
print('Monte Carlo Control Policy')
print('-----------------')
print(f'total reward over all episodes: {total_reward}')
print(f'mean reward per episode:        {mean_reward}')

  0%|          | 0/10000 [00:00<?, ?it/s]

Monte Carlo Control Policy
-----------------
total reward over all episodes: -1445084
mean reward per episode:        -144.5084


In [12]:
# This code will run forever until it is interrupted
# animate_policy(env, policy)

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m:[43m [0m|
+---------+
  (South)
num episodes completed:   688
total rewards:            -100646
mean rewards per episode: -146.29


In [36]:
def get_non_stationary_actions(taxi_prob, obs):
    """Return the applicable actions whose first successor differs from `obs`.

    Args:
        taxi_prob: problem object providing `get_applicable_actions(node)` and
            `get_successors(action, node)`.
        obs: observation (state key) the actions are evaluated from.

    Returns:
        A list of the applicable actions, in the order the problem reports
        them, for which the key of the first successor state is not `obs`.
    """
    root = utils.Node(utils.State(obs, False), None, None, 0)

    def changes_state(candidate):
        # Only the first successor returned for the action is inspected.
        first_successor = taxi_prob.get_successors(candidate, root)[0]
        return first_successor.state.get_key() != obs

    # Materialise the applicable actions before any successors are queried.
    candidates = list(taxi_prob.get_applicable_actions(root))
    return [candidate for candidate in candidates if changes_state(candidate)]

class TaxiMoneCarloNonStationaryPolicy:
    """Random policy restricted to actions that change the observed state."""

    def __init__(self):
        # Plan container; __call__ never reads or writes it, but the attribute
        # is kept so the object's shape stays the same.
        self.cur_plan = deque()

    def __call__(self, obs):
        """Pick a random action among those that move away from `obs`.

        A fresh GymProblem is built around the global `env` on every call;
        candidate actions come from get_non_stationary_actions.
        """
        problem = GymProblem(env, env.unwrapped.s)
        candidates = get_non_stationary_actions(problem, obs)
        return random.choice(candidates)
    
# Behaviour policy instance used below to collect the 20000 training
# trajectories.
nonstationary_policy = TaxiMoneCarloNonStationaryPolicy()

In [None]:
import json

# Seed both the environment and the `random` module: the behaviour policy
# picks its actions with random.choice, so seeding the environment alone does
# not make the collected trajectories reproducible.
env.seed(seed)
random.seed(seed)

# presentation=True loads a previously saved state -> action table instead of
# re-collecting 20000 trajectories.
presentation = False
state_action_map = None
nonstationary_control_policy = None

if presentation:
    with open('trajectories_nonstationary_20k.json', 'r') as fp:
        # JSON object keys are always strings, while the environment reports
        # integer observations. Convert the keys back to int so that the
        # lookups in MCCPolicy do not fail with KeyError.
        state_action_map = {
            int(state): action for state, action in json.load(fp).items()
        }
    nonstationary_control_policy = MCCPolicy(state_action_map)
else:
    raw_data_nonstationary = collect_data(nonstationary_policy, num_trajectories=20000)
    nonstationary_control_policy = MCCPolicy(build_decision_dict(raw_data_nonstationary))


In [18]:
#with open('trajectories_nonstationary_20k.json', 'w') as fp:
#    json.dump(nonstationary_control_policy.state_action_map, fp)

In [25]:
# Number of states that have an entry in the policy's lookup table.
len(nonstationary_control_policy.state_action_map)

400

In [24]:
# Evaluate the control policy learned from the state-changing random policy,
# using the same episode count and seed as the earlier evaluations.
total_reward, mean_reward = evaluate_policy(env, nonstationary_control_policy, num_episodes=10000, seed=seed)
print('Monte Carlo Control Nonstationary Policy')
print('-----------------')
print(f'total reward over all episodes: {total_reward}')
print(f'mean reward per episode:        {mean_reward}')

  0%|          | 0/10000 [00:00<?, ?it/s]

Monte Carlo Control Nonstationary Policy
-----------------
total reward over all episodes: -16084
mean reward per episode:        -1.6084


In [15]:
# This code will run forever until it is interrupted
animate_policy(env, nonstationary_control_policy)

+---------+
|R: | : :[35m[42mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
num episodes completed:   4
total rewards:            44
mean rewards per episode: 11.00
