In [3]:
import numpy as np
import gymnasium as gym
import random

# TASK 1 - Monte Carlo with a random policy

In [4]:
# Initialize the Frozen Lake environment
env = gym.make("FrozenLake-v1", is_slippery=True)

# Parameters
n_episodes = 5000  # Number of episodes
gamma = 0.9        # Discount factor

# Initialize value table to zeros
V = np.zeros(env.observation_space.n)  # State-value function
returns_sum = np.zeros(env.observation_space.n)
returns_count = np.zeros(env.observation_space.n)

# Function to generate an episode following a random policy
def generate_episode(env):
    episode = []
    state, _ = env.reset()  # Only take the state, ignore additional info
    done = False
    while not done:
        action = env.action_space.sample()  # Random action
        next_state, reward, done, _, *_ = env.step(action)  # Extract only necessary info
        episode.append((state, action, reward))
        state = next_state
    return episode

# Monte Carlo with First-Visit approach
for episode in range(n_episodes):
    episode_data = generate_episode(env)
    visited_states = set()  # Track first visits to each state
    G = 0
    
    # Calculate returns and update state-value function
    for t in reversed(range(len(episode_data))):
        state, action, reward = episode_data[t]
        G = gamma * G + reward  # Compute return

        # First-Visit MC: update only if state hasn't been visited in this episode
        if state not in visited_states:
            visited_states.add(state)
            returns_sum[state] += G
            returns_count[state] += 1
            V[state] = returns_sum[state] / returns_count[state]

# Display learned value function
print("Estimated State-Value Function (V):")
print(V.reshape((int(np.sqrt(env.observation_space.n)), -1)))


Estimated State-Value Function (V):
[[0.00497334 0.00432366 0.01007191 0.00332477]
 [0.00717027 0.         0.02241027 0.        ]
 [0.01829068 0.05515168 0.09674542 0.        ]
 [0.         0.13376857 0.3961039  0.        ]]


# TASK 2 - Incremental Monte Carlo

In [5]:
# Initialize the Frozen Lake environment
env = gym.make("FrozenLake-v1", is_slippery=True)

# Parameters
n_episodes = 5000        # Number of episodes
gamma = 0.9              # Discount factor
epsilon = 1.0            # Initial exploration rate
min_epsilon = 0.1        # Minimum exploration rate
epsilon_decay = 0.995    # Decay rate for epsilon
alpha = 0.1              # Learning rate for incremental updates

# Initialize value table to zeros
V = np.zeros(env.observation_space.n)  # State-value function

# Function to choose action using ε-Greedy policy
def choose_action(state):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()  # Explore: random action
    else:
        # Exploit: Choose action with highest state value (for Frozen Lake, random choice is sufficient)
        return env.action_space.sample()  # Actions are random under Monte Carlo without Q-values

# Function to generate an episode following ε-Greedy policy
def generate_episode():
    episode = []
    state, _ = env.reset()
    done = False
    while not done:
        action = choose_action(state)
        next_state, reward, done, _, *_ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
    return episode

# Incremental Monte Carlo
for episode in range(n_episodes):
    episode_data = generate_episode()
    G = 0
    
    # Loop through the episode in reverse for incremental updates
    for t in reversed(range(len(episode_data))):
        state, action, reward = episode_data[t]
        G = gamma * G + reward  # Compute return
        
        # Incremental update of state-value function
        V[state] += alpha * (G - V[state])

    # Decay ε to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Display learned value function
print("Estimated State-Value Function (V):")
print(V.reshape((int(np.sqrt(env.observation_space.n)), -1)))


Estimated State-Value Function (V):
[[1.33863730e-29 1.64760536e-18 6.45602498e-10 2.24356279e-17]
 [3.41484586e-13 0.00000000e+00 1.26014023e-04 0.00000000e+00]
 [1.27773292e-06 4.38329478e-03 2.12302652e-02 0.00000000e+00]
 [0.00000000e+00 1.69336862e-02 2.68238318e-01 0.00000000e+00]]


# TASK 3 - Q-Learning Integration

In [7]:
# Initialize the Frozen Lake environment
env = gym.make("FrozenLake-v1", is_slippery=True)

# Parameters
n_episodes = 5000       # Number of episodes
gamma = 0.9             # Discount factor
alpha = 0.1             # Learning rate
epsilon = 1.0           # Initial exploration rate
min_epsilon = 0.1       # Minimum exploration rate
epsilon_decay = 0.995   # Decay rate for epsilon

# Initialize Q-table to zeros
Q = np.zeros((env.observation_space.n, env.action_space.n))

# Function to choose action using ε-Greedy policy based on Q-values
def choose_action(state):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()  # Explore: random action
    else:
        return np.argmax(Q[state])  # Exploit: action with max Q-value for the state

# Q-Learning algorithm
for episode in range(n_episodes):
    state, _ = env.reset()
    done = False

    while not done:
        # Choose action with ε-Greedy policy
        action = choose_action(state)
        next_state, reward, done, _, *_ = env.step(action)

        # Q-Learning update
        best_next_action = np.argmax(Q[next_state])  # Best action at next state
        Q[state, action] += alpha * (reward + gamma * Q[next_state, best_next_action] - Q[state, action])

        state = next_state  # Move to the next state

    # Decay ε to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

# Display learned Q-table
print("Learned Q-Table:")
print(Q)

# Optional: Derive and display the policy from Q-table
policy = np.argmax(Q, axis=1)
print("Derived Policy (best action for each state):")
print(policy.reshape((int(np.sqrt(env.observation_space.n)), -1)))


Learned Q-Table:
[[0.0480898  0.04769605 0.04912855 0.04742664]
 [0.03517695 0.0347992  0.03896944 0.03997207]
 [0.04108799 0.04104216 0.04170399 0.04021703]
 [0.03063678 0.02490211 0.02457026 0.03341926]
 [0.05757836 0.04816095 0.06772309 0.03771445]
 [0.         0.         0.         0.        ]
 [0.05630977 0.04687008 0.04044596 0.0216042 ]
 [0.         0.         0.         0.        ]
 [0.07329008 0.09572027 0.09492424 0.12856263]
 [0.14161543 0.19515744 0.19742256 0.16016432]
 [0.15775859 0.15681431 0.16307182 0.12599444]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.19117669 0.25327488 0.30759128 0.30780088]
 [0.41920573 0.49072502 0.50110053 0.47353171]
 [0.         0.         0.         0.        ]]
Derived Policy (best action for each state):
[[2 3 2 3]
 [2 0 0 0]
 [3 2 2 0]
 [0 3 2 0]]
