# On-Policy First-Visit MC Control (for $\epsilon$-soft policies), estimates $\pi\approx\pi_*$

In [2]:
import gymnasium as gym
import numpy as np
from tqdm import tqdm
import time

In [3]:
SEED = 42
np.random.seed(SEED)

In [4]:
def make_greedy_policy(Q):
    def policy(state):
        return np.argmax(Q[state])
    return policy

In [5]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy(state):
        action_probs = np.ones(nA) * (epsilon / nA)
        best_action = np.argmax(Q[state])
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    return policy

In [6]:
def generate_episode(env, policy, max_steps=100):
    episode = []
    state = env.reset()[0]
    done = False
    steps = 0

    while not done and steps < max_steps:
        action_probs = policy(state)
        action = np.random.choice(len(action_probs), p=action_probs)
        next_state, reward, done, _, _ = env.step(action)

        if not done:
            reward -= 0.01  # Step penalty
        elif reward == 0:
            reward = -1.0   # Hole penalty
        else:
            reward = 5.0    # Reaching goal

        episode.append((state, action, reward))
        state = next_state
        steps += 1

    return episode

In [7]:
def on_policy_first_visit_mc_control(env, epsilon_start, num_episodes, gamma):
    nS = env.observation_space.n
    nA = env.action_space.n
    Q = np.zeros((nS, nA))
    returns_count = np.zeros((nS, nA))

    epsilon_min = 0.05
    epsilon_decay = 0.9993  # Slower decay keeps more exploration

    for i in tqdm(range(num_episodes), desc="Training episodes"):
        epsilon = max(epsilon_min, epsilon_start * (epsilon_decay ** i))
        policy = make_epsilon_greedy_policy(Q, epsilon, nA)
        episode = generate_episode(env, policy)

        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            if (state, action) not in visited:
                visited.add((state, action))
                returns_count[state][action] += 1
                Q[state][action] += (1 / returns_count[state][action]) * (G - Q[state][action])

    return make_greedy_policy(Q), Q

In [8]:
custom_map = [
    "SFFHF",
    "HFHFF",
    "FFFHF",
    "FHFFF",
    "HFFFG"
]

# Hyperparameters
n_episodes = 3000
gamma = 0.98
epsilon_start = 1.0

In [9]:
# Train environment
env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=False)
policy, Q = on_policy_first_visit_mc_control(env, epsilon_start, n_episodes, gamma)

Training episodes: 100%|██████████| 3000/3000 [00:01<00:00, 1505.34it/s]


In [None]:
# Save Q-table for later use
np.savez('results/frozenlake-qtable.npz', Q=Q)

In [12]:
env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=False, render_mode="human")
state = env.reset()[0] 
done = False

while not done:
    env.render()
    action = policy(state)
    state, reward, done, truncate, info = env.step(action)
    time.sleep(0.3)
env.close()

  from pkg_resources import resource_stream, resource_exists


In [11]:
# 🎥 Re-run environment to record a successful episode using greedy policy
env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=False, render_mode="rgb_array")
state = env.reset()[0]
done = False

frames = []
max_steps = 100  # To prevent infinite loops in case of error

for _ in range(max_steps):
    frame = env.render()
    frames.append(frame)

    action = policy(state)
    state, reward, done, truncated, _ = env.step(action)

    if done:
        frames.append(env.render())  # Final frame (goal or hole)
        break

env.close()

# ✅ Save as looping GIF (loop=0 means infinite loop)
images = [Image.fromarray(frame) for frame in frames]
images[0].save(
    'frozenlake_agent.gif',
    save_all=True,
    append_images=images[1:],
    duration=100,  # Adjust speed
    loop=0
)

print("GIF saved as frozenlake_agent.gif")


NameError: name 'Image' is not defined

In [32]:









# Create a GIF of the agent following the greedy policy
env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=False, render_mode="rgb_array")
state = env.reset()[0]
done = False

frames = []
while not done:
    frames.append(env.render())
    action = policy(state)
    state, reward, done, truncate, info = env.step(action)

env.close()

# Convert frames to a GIF
images = [Image.fromarray(frame) for frame in frames]
images[0].save('frozenlake_agent.gif', save_all=True, append_images=images[:], duration=100, loop=0)


In [17]:
# 🎥 Re-run environment to record a successful episode using greedy policy
env = gym.make("FrozenLake-v1", desc=custom_map_5x5, is_slippery=False, render_mode="rgb_array")
state = env.reset()[0]
done = False

frames = []
max_steps = 100  # To prevent infinite loops in case of error

for _ in range(max_steps):
    frame = env.render()
    frames.append(frame)

    action = policy(state)
    state, reward, done, truncated, _ = env.step(action)

    if done:
        frames.append(env.render())  # Final frame (goal or hole)
        break

env.close()

# ✅ Save as looping GIF (loop=0 means infinite loop)
images = [Image.fromarray(frame) for frame in frames]
images[0].save(
    'frozenlake_agent.gif',
    save_all=True,
    append_images=images[1:],
    duration=400,  # Adjust speed
    loop=0
)

print("GIF saved as frozenlake_agent.gif")


GIF saved as frozenlake_agent.gif


In [2]:
import gymnasium as gym
import numpy as np
import time
from tqdm import tqdm

In [2]:
def make_greedy_policy(Q, nA):
    def policy(state):
        best_action = max(range(nA), key=lambda a: Q[(state, a)])
        return best_action
    return policy

In [3]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy(state):
        action_probs = np.ones(nA) * (epsilon / nA)
        best_action = max(range(nA), key=lambda a: Q[(state, a)])
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    return policy

In [4]:
def generate_episode(env, policy):

    episode = []
    state = env.reset()[0]
    done = False

    while not done:
        action_probs = policy(state)
        action = np.random.choice(len(action_probs), p=action_probs)
        next_state, reward, done, _, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state

    return episode

In [None]:
def on_policy_first_visit_mc_control(env, epsilon, num_episodes, gamma):

    nS = env.observation_space.n
    nA = env.action_space.n
    Q = np.zeros((nS, nA))
    returns_count = np.zeros((nS, nA))

    for i in tqdm(range(num_episodes), desc="Training episodes"):

        # epsilon = max(epsilon_min, epsilon_start * (epsilon_decay ** i))
        epsilon = max(0.01, epsilon * 0.9999) # annealing epsilon
        policy = make_epsilon_greedy_policy(Q, epsilon, nA)

        episode = generate_episode(env, policy)
        G = 0
        visited = set()

        for t in reversed(range(len(episode))):

            state, action, reward = episode[t]
            G = gamma * G + reward

            if (state, action) not in visited:
                visited.add((state, action))
                returns_count[state][action] += 1
                Q[state][action] += (1/returns_count[state][action]) * (G - Q[state][action])

    return make_greedy_policy(Q, nA), Q

## FrozenLake-v1 8x8

In [14]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import time

# Set seeds for reproducibility
SEED = 42
np.random.seed(SEED)

def make_greedy_policy(Q, nA):
    def policy(state):
        return np.argmax(Q[state])
    return policy

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy(state):
        action_probs = np.ones(nA) * (epsilon / nA)
        best_action = np.argmax(Q[state])
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    return policy

def generate_episode(env, policy, max_steps=100):
    episode = []
    state = env.reset()[0]
    done = False
    steps = 0

    while not done and steps < max_steps:
        action_probs = policy(state)
        action = np.random.choice(len(action_probs), p=action_probs)
        next_state, reward, done, _, _ = env.step(action)

        if not done:
            reward -= 0.01  # Step penalty
        elif reward == 0:
            reward = -1.0   # Hole penalty
        else:
            reward = 5.0    # Reaching goal

        episode.append((state, action, reward))
        state = next_state
        steps += 1

    return episode

def on_policy_first_visit_mc_control(env, epsilon_start, num_episodes, gamma):
    nS = env.observation_space.n
    nA = env.action_space.n
    Q = np.zeros((nS, nA))
    returns_count = np.zeros((nS, nA))

    epsilon_min = 0.05
    epsilon_decay = 0.9993  # Slower decay keeps more exploration

    for i in tqdm(range(num_episodes), desc="Training episodes"):
        epsilon = max(epsilon_min, epsilon_start * (epsilon_decay ** i))
        policy = make_epsilon_greedy_policy(Q, epsilon, nA)
        episode = generate_episode(env, policy)

        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            if (state, action) not in visited:
                visited.add((state, action))
                returns_count[state][action] += 1
                Q[state][action] += (1 / returns_count[state][action]) * (G - Q[state][action])

    return make_greedy_policy(Q, nA), Q

# 🌟 Use the 5x5 custom map
custom_map_5x5 = [
    "SFFHF",
    "HFHFF",
    "FFFHF",
    "FHFFF",
    "HFFFG"
]

# 🚀 Hyperparameters
n_episodes = 3000  # Lowered for speed + good enough convergence
gamma = 0.98
epsilon_start = 1.0

# 🎮 Train environment
env = gym.make("FrozenLake-v1", desc=custom_map_5x5, is_slippery=False)
policy, Q = on_policy_first_visit_mc_control(env, epsilon_start, n_episodes, gamma)


Training episodes: 100%|██████████| 3000/3000 [00:01<00:00, 1500.31it/s]


In [15]:
env = gym.make("FrozenLake-v1", desc=custom_map_5x5, is_slippery=False, render_mode="human")
state = env.reset()[0] 
done = False

while not done:
    env.render()
    action = policy(state)
    state, reward, done, truncate, info = env.step(action)
    time.sleep(0.3)
env.close()

In [7]:
env.close()

In [12]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
from tqdm import tqdm

# Set seed for reproducibility
SEED = 42
np.random.seed(SEED)

def make_greedy_policy(Q, nA):
    def policy(state):
        return np.argmax(Q[state])
    return policy

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy(state):
        action_probs = np.ones(nA) * (epsilon / nA)
        best_action = np.argmax(Q[state])
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    return policy

def generate_episode(env, policy, max_steps=100):
    episode = []
    state = env.reset()[0]
    done = False
    steps = 0

    while not done and steps < max_steps:
        action_probs = policy(state)
        action = np.random.choice(len(action_probs), p=action_probs)
        next_state, reward, done, _, _ = env.step(action)

        episode.append((state, action, reward))
        state = next_state
        steps += 1

    return episode

def on_policy_first_visit_mc_control(env, epsilon_start, num_episodes, gamma):
    nS = env.observation_space.n
    nA = env.action_space.n
    Q = np.zeros((nS, nA))
    returns_count = np.zeros((nS, nA))

    epsilon_min = 0.05
    epsilon_decay = 0.9993

    for i in tqdm(range(num_episodes), desc="Training episodes"):
        epsilon = max(epsilon_min, epsilon_start * (epsilon_decay ** i))
        policy = make_epsilon_greedy_policy(Q, epsilon, nA)
        episode = generate_episode(env, policy)

        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            if (state, action) not in visited:
                visited.add((state, action))
                returns_count[state][action] += 1
                Q[state][action] += (1 / returns_count[state][action]) * (G - Q[state][action])

    return make_greedy_policy(Q, nA), Q

# Hyperparameters
n_episodes = 30000
gamma = 0.98
epsilon_start = 1.0

# Use default FrozenLake 4x4 environment (slippery by default)
env = gym.make("FrozenLake-v1", is_slippery=False)

policy, Q = on_policy_first_visit_mc_control(env, epsilon_start, n_episodes, gamma)


Training episodes: 100%|██████████| 30000/30000 [00:25<00:00, 1190.98it/s]


In [13]:
import time

env = gym.make("FrozenLake-v1", render_mode="human")
state = env.reset()[0]
done = False

while not done:
    env.render()
    action = policy(state)
    state, reward, done, truncated, info = env.step(action)
    time.sleep(0.3)

env.close()


In [None]:
def make_greedy_policy(Q, nA):
    def policy(state):
        best_action = max(range(nA), key=lambda a: Q[(state, a)])
        return best_action
    return policy

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy(state):
        action_probs = np.ones(nA) * (epsilon / nA)
        best_action = max(range(nA), key=lambda a: Q[(state, a)])
        action_probs[best_action] += (1.0 - epsilon)
        return action_probs
    return policy

def generate_episode(env, policy):
    episode = []
    state = env.reset()[0]
    done = False
    while not done:
        action_probs = policy(state)
        action = np.random.choice(len(action_probs), p=action_probs)
        next_state, reward, done, _, _ = env.step(action)

        if not done:
            reward -= 0.01 
        elif reward == 0:
            reward = -1.0 
        else:
            reward = +5.0

        episode.append((state, action, reward))
        state = next_state
    return episode

def on_policy_first_visit_mc_control(env, epsilon_start, num_episodes, gamma):

    nS = env.observation_space.n
    nA = env.action_space.n
    Q = np.zeros((nS, nA))
    returns_count = np.zeros((nS, nA))
    
    epsilon_min = 0.01
    epsilon_decay = 0.9995
    
    for i in tqdm(range(num_episodes), desc="Training episodes"):
 
        epsilon = max(epsilon_min, epsilon_start * (epsilon_decay ** i))
        policy = make_epsilon_greedy_policy(Q, epsilon, nA)
        episode = generate_episode(env, policy)

        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            if (state, action) not in visited:
                visited.add((state, action))
                returns_count[state][action] += 1
                Q[state][action] += (1/returns_count[state][action]) * (G - Q[state][action])
                
    return make_greedy_policy(Q, nA), Q

In [26]:
custom_map = [
    "SFFFFH",
    "HFHFHF",
    "FFFHFF",
    "FHFHFF",
    "FFHFFF",
    "HFFFHG"
]


# Good hyperparameters for custom 6x6 map
n_episodes = 100000  # More episodes for larger state space
gamma = 0.95
epsilon_start = 1.0

env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=False)
policy, Q = on_policy_first_visit_mc_control(env, epsilon_start, n_episodes, gamma)

Training episodes:   9%|▉         | 8951/100000 [04:55<50:03, 30.31it/s]   


KeyboardInterrupt: 

In [25]:
env = gym.make("FrozenLake-v1", desc=custom_map, is_slippery=False, render_mode="human")
state = env.reset()[0] 
done = False

while not done:
    env.render()
    action = policy(state)
    state, reward, done, truncate, info = env.step(action)
    time.sleep(0.3)
env.close()

In [18]:
n_episodes = 50000
gamma = 0.95
epsilon_start = 1.0

env = gym.make("FrozenLake-v1", is_slippery=False)
policy, Q = on_policy_first_visit_mc_control(env, epsilon_start, n_episodes, gamma)

Training episodes: 100%|██████████| 50000/50000 [00:20<00:00, 2487.53it/s]


In [19]:
env = gym.make("FrozenLake-v1", is_slippery=False, render_mode="human")
state = env.reset()[0] 
done = False

while not done:
    env.render()
    action = policy(state)
    state, reward, done, truncate, info = env.step(action)
    time.sleep(0.3)
env.close()

In [None]:
np.savez("results/policy_iteration.npz", Q=Q)

In [42]:
n_episodes = 500000
gamma = 0.99  # Discount factor
epsilon_decay = 0.0001

env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=False)
policy, Q = on_policy_first_visit_mc_control(env, 1, n_episodes, gamma)

Training episodes: 100%|██████████| 500000/500000 [06:51<00:00, 1215.94it/s]


In [43]:
env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=False, render_mode="human")

state = env.reset()[0] 
done = False
step = 0

while not done and step < 25:
    env.render()
    action = policy(state)
    state, reward, done, truncate, info = env.step(action)
    step += 1
    time.sleep(0.3)

env.close()

In [39]:
n_episodes = 500000
gamma = 0.99  # Discount factor
epsilon_decay = 0.0001

env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=True)
policy, Q = on_policy_first_visit_mc_control(env, 1, n_episodes, gamma)

Training episodes: 100%|██████████| 500000/500000 [38:15<00:00, 217.85it/s]  


In [41]:
env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=True, render_mode="human")

state = env.reset()[0] 
done = False
step = 0

while not done and step < 200:
    env.render()
    action = policy(state)
    state, reward, done, truncate, info = env.step(action)
    step += 1
    time.sleep(0.1)

env.close()

In [77]:
import numpy as np

SEED = 42
np.random.seed(SEED)

print("Random numbers with seed 42:")
for _ in range(5):
    print(np.random.rand())


Random numbers with seed 42:
0.3745401188473625
0.9507143064099162
0.7319939418114051
0.5986584841970366
0.15601864044243652


In [75]:
# import numpy as np

print("Random numbers without seed:")
for _ in range(5):
    print(np.random.rand())

Random numbers without seed:
0.6118528947223795
0.13949386065204183
0.29214464853521815
0.3663618432936917
0.45606998421703593


In [84]:
SEED = 42
np.random.seed(SEED)

np.random.rand()
np.random.rand()

np.random.seed(SEED)

np.random.rand()

import numpy as np

np.random.rand()

0.9507143064099162