In [3]:
#Imports
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import pandas as pd
import numpy as np
import random
import sklearn
import matplotlib.pyplot as plt

In [4]:
#Setup Environment
#This is to setup a specific map
#desc=["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]
#env = gym.make('FrozenLake-v1', desc=desc, map_name="5x5", is_slippery=False, render_mode='human')

#env = gym.make('FrozenLake-v1', desc = generate_random_map(size=5), is_slippery = False, render_mode = 'human')
#observation, info = env.reset()

In [5]:
def create_environment():
    return gym.make('FrozenLake-v1', is_slippery=True)

In [6]:
def initialize_q_table(env):
    return np.zeros((env.observation_space.n, env.action_space.n))

In [7]:
def initialize_exploration_rates():
    return 1.0, 1.0, 0.01, 0.001  # exploration_rate, max_exploration_rate, min_exploration_rate, exploration_decay_rate


In [8]:
def choose_action(state, q_table, exploration_rate, env):
    if random.uniform(0, 1) > exploration_rate:
        action = np.argmax(q_table[state, :])
    else:
        action = env.action_space.sample()
    return action


In [9]:
def update_q_table(state, action, reward, new_state, q_table, learning_rate, discount_factor):
    q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
                             learning_rate * (reward + discount_factor * np.max(q_table[new_state, :]))


In [10]:
def train_agent(env, q_table, learning_rate, discount_factor, exploration_rates, num_episodes):
    exploration_rate, max_exploration_rate, min_exploration_rate, exploration_decay_rate = exploration_rates
    
    for episode in range(num_episodes):
        state_info = env.reset()
        state = state_info[0] if isinstance(state_info, tuple) else state_info  # Extract the integer state
        done = False

        while not done:
            action = choose_action(state, q_table, exploration_rate, env)
            state_info, reward, _, done, info = env.step(action)  # Adjusted unpacking here
            new_state = state_info[0] if isinstance(state_info, tuple) else state_info
    
            update_q_table(state, action, reward, new_state, q_table, learning_rate, discount_factor)
            state = new_state

        exploration_rate = min_exploration_rate + \
                           (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)


In [11]:
def evaluate_agent(env, q_table, num_test_episodes):
    total_rewards = 0

    for episode in range(num_test_episodes):
        state_info = env.reset()
        state = state_info[0] if isinstance(state_info, tuple) else state_info
        done = False
        episode_rewards = 0

        while not done:
            action = np.argmax(q_table[state, :])
            state_info, reward, _, done, info = env.step(action)  # Adjusted unpacking here
            state = state_info[0] if isinstance(state_info, tuple) else state_info
            episode_rewards += reward

        total_rewards += episode_rewards

    average_reward = total_rewards / num_test_episodes
    return average_reward


In [12]:
def main():
    # Initialize the environment
    env = create_environment()

    # Initialize the Q-table
    q_table = initialize_q_table(env)

    # Set the learning rate and discount factor
    learning_rate = 0.8
    discount_factor = 0.95

    # Initialize exploration rates
    exploration_rates = initialize_exploration_rates()

    # Set the number of episodes for training and testing
    num_training_episodes = 100
    num_test_episodes = 1

    # Train the agent
    train_agent(env, q_table, learning_rate, discount_factor, exploration_rates, num_training_episodes)

    # Evaluate the agent
    average_reward = evaluate_agent(env, q_table, num_test_episodes)
    print(f"Average Reward over {num_test_episodes} test episodes: {average_reward}")

    # Close the environment
    env.close()
main()

Average Reward over 1 test episodes: 0.0


In [37]:
def train_until_goal(env, q_table, learning_rate, discount_factor, exploration_rate, max_episodes):
    for episode in range(max_episodes):
        env.reset()  # Start a new episode
        done = False

        while not done:
            action = np.argmax(q_table[state, :]) if np.random.rand() > exploration_rate else env.action_space.sample()
            new_state, reward, terminated, truncated, _ = env.step(action)  # Take the action

            q_table[state, action] = q_table[state, action] + learning_rate * (reward + discount_factor * np.max(q_table[new_state, :]) - q_table[state, action])
            state = new_state

            if terminated or truncated:  # Check if the episode ended
                done = True
                if terminated and reward == 1.0:  # Check if the goal was reached
                    print(f"Goal reached at episode {episode} in {episode_steps} steps.")
                    return episode

        exploration_rate = max(exploration_rate * 0.99, 0.01)  # Decrement the exploration rate

    return -1  # Return -1 if the goal was not reached within the max episodes

# Usage
desc = ["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]
env = gym.make('FrozenLake-v1', desc=desc, is_slippery=False, render_mode='human')  
q_table = np.zeros((env.observation_space.n, env.action_space.n))
learning_rate = 0.8
discount_factor = 0.95
exploration_rate = 1.0
max_episodes = 1000

episode_reached = train_until_goal(env, q_table, learning_rate, discount_factor, exploration_rate, max_episodes)
env.close()
print(f"Goal reached at episode: {episode_reached}")


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [46]:
env.close()

In [39]:
desc = ["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]
env = gym.make('FrozenLake-v1', desc=desc, is_slippery=False, render_mode='human')  
env.reset()
print(env.step(0))

(0, 0.0, False, False, {'prob': 1.0})


In [45]:
env.step(2)
#obs, reward, terminated, truncated , info = env.step(action)

(18, 1.0, True, False, {'prob': 1.0})