In [10]:
import gymnasium as gym
import random
from IPython.display import clear_output
%config NotebookApp.iopub_msg_rate_limit=10000
import time

In [11]:
# Set the description of the environment
# S=StartingPoint_Safe, F=FrozenSurface_Safe, H=Hole_NotSafe, G=Goal
desc=["SFF",
      "FHF",
      "FGF"]

# Create the environment with the given description
env = gym.make('FrozenLake-v1', desc=desc, map_name="3x3", is_slippery=False, render_mode="human")

# Get initial observation and information from the environment
observation, info = env.reset()

# Get the size of the maze
maze_rows = len(desc)
maze_cols = len(desc[0])

# Get the number of states and actions in the environment
num_states = env.observation_space.n
num_actions = env.action_space.n

In [12]:
# Hyperparameters
epsilon_value = 0.05
learning_rate = 0.5

# Initialize the Q-table
q_table = {}
current_state = 0

In [13]:
# Reset the Q-table
def reset_table():
    global q_table, current_state
    q_table = {state: [0] * num_actions for state in range(num_states)}
    current_state = 0

In [14]:
# Calculate possible moves from a given state
def calculate_possible_moves(state):
    possible_moves = []
    if (state + 1) % maze_cols != 0:
        possible_moves.append(2)
    if (state + 1) % maze_cols != 1 and not state == 0:
        possible_moves.append(0)
    if state > (maze_cols - 1) and not state == 0:
        possible_moves.append(3)
    if state < ((maze_rows * maze_cols) - maze_cols):
        possible_moves.append(1)
    return possible_moves


In [15]:
# Choose the next step using epsilon-greedy policy
def next_step(state):
    max_q_value = max(q_table[state])
    if max_q_value >= epsilon_value:
        next_move = q_table[state].index(max_q_value)
    else:
        possible_moves = calculate_possible_moves(state)
        next_move = random.choice(possible_moves)
    return next_move

In [16]:
# Update the Q-table based on the observed reward
def update_table(action, next_state, reward):
    global current_state
    updated_q_value = q_table[current_state][action] + learning_rate * (reward + max(q_table[next_state]) - q_table[current_state][action])
    q_table[current_state][action] = updated_q_value
    current_state = next_state


In [17]:
# Convergence criteria
convergence_thresh = 0.125

# Main training loop
max_episodes = 25
current_episode = 1
converged = False

reset_table()
env.reset()
start_time = time.time()
while current_episode <= max_episodes:
    if converged:
        break
        
    action = next_step(current_state)
    observation, reward, terminated, _, _ = env.step(action)
    
    next_state = observation
    update_table(action, next_state, reward)

    if terminated or converged:
        observation, info = env.reset()
        if not converged:
            current_episode += 1

    if converged:
        end_time = time.time()

    clear_output(wait=True)
    print("Episode: " + str(current_episode) + "/" + str(max_episodes))
    print("Time: " + str(round(time.time()-start_time, 3)) + " sec")
    print("Q-Table:")
    for state, values in q_table.items():
        print(str(state) + ": " + str(values))
    
if converged:
    duration = end_time - start_time
    print(str(round(duration, 3)) + " seconds to converge")
else:
    print("No convergence")

Episode: 26/25
Time: 37.643 sec
Q-Table:
0: [0, 2.5950136482715607, 0.0, 0]
1: [0.1875, 0.0, 0.0, 0]
2: [0.0, 0.0, 0, 0]
3: [0, 2.880781501531601, 0.0, 0.0]
4: [0.0, 0.0, 0, 0]
5: [0.0, 0.0, 0, 0.0]
6: [0, 0, 3.166448548436165, 0.0]
7: [0.0, 0, 0.03125, 2.309262603521347]
8: [0.5, 0, 0, 0]
No convergence


: 