In [29]:
import numpy as np
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

In [30]:
desc=["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]
# env = gym.make('FrozenLake-v1', desc=desc, map_name="5x4", is_slippery=False, render_mode='human')
env = gym.make('FrozenLake-v1', desc=desc, map_name="5x4", is_slippery=False)
observation, info = env.reset()

In [31]:
# Custom rewards
custom_rewards = {
    'S': 0.0,  # Reward for frozen tiles (very small positive reward)
    'F': -0.75,  # Reward for falling in a hole (negative reward)
    'G': 1.0,   # Reward for reaching the goal (the "gift" state)
}

# Map custom rewards to the environment's reward table
env.env.rewards = custom_rewards

In [32]:
# Custom policy to avoid edges
def custom_policy(state):
    if state % 4 == 0:  # Agent is at leftmost column
        return [1, 2, 3]  # Avoid going left
    elif state % 4 == 3:  # Agent is at rightmost column
        return [0, 1, 3]  # Avoid going right
    elif state < 4:  # Agent is at top row
        return [0, 1, 2]  # Avoid going up
    elif state > 15:  # Agent is at bottom row
        return [0, 2, 3]  # Avoid going down
    else:
        return [0, 1, 2, 3]  # All actions are allowed

In [33]:
# Initialize Q-table with zeros
Q = np.random.rand(env.observation_space.n, env.action_space.n) * 0.01

# Hyperparameters
learning_rate = 0.8
discount_factor = 0.95
epsilon = 1.0
max_exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
num_episodes = 1000

In [34]:
for episode in range(num_episodes):
    state_tuple = env.reset()  # State is a tuple
    state = state_tuple[0]  # Extract the integer state value
    done = False

    # Reset state visits count for the new episode
    state_visits = {s: 0 for s in range(env.observation_space.n)}

    while not done:
        # Choose action using epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = np.random.choice(custom_policy(state))  # Custom policy
        else:
            action = np.argmax(Q[state, :])

        print("Episode:", episode)
        print("Action:", action)

        # Take action and observe the next state, reward, done flag, and info
        step_result = env.step(action)

        next_state = step_result[0]  # Extract the next state tuple
        reward = step_result[1]  # Extract the reward
        terminated = step_result[2]  # Extract the done flags
        truncated = step_result[3] # Extract the done flags
        done = terminated or truncated

        # Update state visits count
        state_visits[next_state] += 1

        # Calculate penalty for visiting the same state
        visit_penalty = -0.01 * (2 ** state_visits[next_state])

        # Check if the agent stayed in the same state
        if next_state == state:
            reward = visit_penalty
        else:
            # Check for falling into the ice
            if terminated and reward == 0:
                reward = custom_rewards["F"]  # Penalty for falling into the ice
            elif not terminated:
                reward = custom_rewards["S"]  # Reward for a safe move
            reward += visit_penalty  # Add penalty for repeated visits

        # Update Q-value using SARSA formula
        next_action = np.argmax(Q[next_state, :])
        Q[state, action] = Q[state, action] + learning_rate * (reward + discount_factor * Q[next_state, next_action] - Q[state, action])

        # Decay the exploration rate
        epsilon = max(min_exploration_rate, epsilon * exploration_decay_rate)

        print("Step Result:", step_result)
        print("State Tuple:", state_tuple)
        print("State:", state)
        print("New State:", next_state)
        print("Next Action:", next_action)
        print("Reward:", reward)
        print("Done:", done)
        print("New Q-Value", Q[state, action])
        print("--------NEXT--------")

        state = next_state

Episode: 0
Action: 2
Step Result: (1, 0.0, False, False, {'prob': 1.0})
State Tuple: (0, {'prob': 1})
State: 0
New State: 1
Next Action: 2
Reward: -0.02
Done: False
New Q-Value -0.008581700729438951
--------NEXT--------
Episode: 0
Action: 2
Step Result: (2, 0.0, False, False, {'prob': 1.0})
State Tuple: (0, {'prob': 1})
State: 1
New State: 2
Next Action: 1
Reward: -0.02
Done: False
New Q-Value -0.009939729299168098
--------NEXT--------
Episode: 0
Action: 1
Step Result: (6, 0.0, True, False, {'prob': 1.0})
State Tuple: (0, {'prob': 1})
State: 2
New State: 6
Next Action: 1
Reward: -0.77
Done: True
New Q-Value -0.6074149929355194
--------NEXT--------
Episode: 1
Action: 1
Step Result: (4, 0.0, False, False, {'prob': 1.0})
State Tuple: (0, {'prob': 1})
State: 0
New State: 4
Next Action: 2
Reward: -0.02
Done: False
New Q-Value -0.008322653827990287
--------NEXT--------
Episode: 1
Action: 2
Step Result: (5, 0.0, True, False, {'prob': 1.0})
State Tuple: (0, {'prob': 1})
State: 4
New State: 5
N

KeyboardInterrupt: 

: 

In [None]:
env.close()