In [161]:
import time

import gymnasium as gym
import numpy as np
import pandas as pd
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from tqdm import tqdm

In [162]:
# Custom FrozenLake layout: 5 rows x 4 columns
# (S = start, F = frozen tile, H = hole, G = goal).
desc = ["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]

# Set to "human" to watch the agent in a window; None runs headless.
render_mode = None

# `map_name` is intentionally not passed: it only selects a built-in layout
# when no `desc` is supplied, and "5x5" is not one of the built-in names, so
# it was silently ignored (and misleading for this 5x4 grid).
env = gym.make('FrozenLake-v1', desc=desc, is_slippery=False, render_mode=render_mode)

In [163]:
# --- Reproducibility ------------------------------------------------------
# Seed both random sources used by the experiment so a fresh
# "Restart & Run All" reproduces the same numbers.
SEED = 42
np.random.seed(SEED)         # Q-table initialisation and the epsilon draw in choose_action
env.action_space.seed(SEED)  # env.action_space.sample() used for exploratory moves

# --- Problem size ---------------------------------------------------------
observation_space = env.observation_space.n  # number of discrete states
action_space = env.action_space.n            # number of discrete actions

# Q-table with one row per state and one column per action, initialised
# with small random values in [0, 0.1).
q_table = np.random.rand(observation_space, action_space) * 0.1

# --- Q-learning hyperparameters ---------------------------------------------
learning_rate = 0.8            # weight of the new estimate in update_q_table
discount_factor = 0.95         # weight of the best future Q-value in update_q_table
exploration_rate = 1.0         # probability that choose_action picks a random action
min_exploration_rate = 0.01    # lower bound applied when exploration_rate is decayed
exploration_decay_rate = 0.001 # decay parameter applied to exploration_rate during training

# --- Experiment size ------------------------------------------------------
max_train_episodes = 100   # upper bound on training episodes per iteration
max_test_episodes = 10000  # evaluation episodes per iteration
num_iterations = 1000      # number of independent train/evaluate repetitions
goal_reaches = 0           # goal counter; re-initialised by the experiment loop


In [164]:
def choose_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    A uniform draw in [0, 1) is compared with the module-level
    `exploration_rate`: below it, a random action is sampled from
    `env.action_space`; otherwise the action with the largest value in
    `q_table[state]` is returned.
    """
    explore = np.random.uniform(0, 1) < exploration_rate
    return env.action_space.sample() if explore else np.argmax(q_table[state, :])


In [165]:
def update_q_table(state, action, reward, new_state, terminated=False):
    """Apply one tabular Q-learning update to the module-level `q_table`.

    The entry for (`state`, `action`) becomes

        (1 - learning_rate) * Q(state, action)
        + learning_rate * (reward + discount_factor * best_future_q)

    where `best_future_q` is the largest Q-value of `new_state`.

    Args:
        state: index of the state the action was taken in.
        action: index of the action that was taken.
        reward: reward received for the transition.
        new_state: index of the state reached.
        terminated: when True, `new_state` is treated as terminal and its
            Q-values are not bootstrapped from (`best_future_q` is 0). The
            rows of terminal states are never updated, so they would
            otherwise contribute their initial values to the target.
            Defaults to False, which keeps the previous behaviour.

    Returns:
        None. `q_table` is modified in place.
    """
    best_future_q = 0.0 if terminated else np.max(q_table[new_state, :])
    current_q = q_table[state, action]
    td_target = reward + discount_factor * best_future_q
    q_table[state, action] = (1 - learning_rate) * current_q + learning_rate * td_target


In [166]:
# Per-iteration metrics collected by the experiment loop; each list is a
# separate, initially empty list object.
(
    train_successrates,
    train_episodes,
    train_time,
    test_successrates,
    test_steps,
) = ([] for _ in range(5))

In [167]:
# Each outer iteration is one independent experiment: start from a fresh
# random Q-table, train with epsilon-greedy Q-learning, then evaluate the
# greedy policy and record one row of metrics.

INITIAL_EXPLORATION_RATE = 1.0   # epsilon at the start of every training run
CONVERGENCE_THRESHOLD = 0.00025  # training stops once a Q-update changes less than this

for test in tqdm(range(num_iterations), desc='Training Progress'):
    converged = False
    episode = 0
    train_successes = 0  # training episodes that reached the goal
    test_successes = 0   # evaluation episodes that reached the goal
    steps = 0            # total environment steps taken during evaluation
    q_table = np.random.rand(observation_space, action_space) * 0.1
    # The evaluation phase below sets exploration_rate to 0; without this
    # reset every iteration after the first would train almost greedily.
    exploration_rate = INITIAL_EXPLORATION_RATE

    # ---------------------------- training ----------------------------
    start_time = time.perf_counter()  # monotonic clock for interval timing
    while not converged and episode < max_train_episodes:
        episode += 1
        state = env.reset()[0]
        done = False
        state_visits = np.zeros(observation_space)  # per-episode visit counts

        while not done:
            action = choose_action(state)
            new_state, reward, terminated, truncated, _ = env.step(action)

            # Reward shaping: a penalty that doubles with every visit to the
            # same state within the episode.
            # NOTE(review): this grows as 2**visits, so an agent that keeps
            # re-entering one state for a whole episode produces enormous
            # negative rewards - consider capping the exponent.
            state_visits[new_state] += 1
            visit_penalty = -0.01 * (2 ** state_visits[new_state])

            if new_state == state:
                # The move did not change the state: only the visit penalty applies.
                reward = visit_penalty
            elif terminated:
                if reward == 1:
                    train_successes += 1  # reached the goal, keep the reward of 1
                elif reward == 0:
                    reward = -0.75        # fell into a hole
                reward += visit_penalty
            else:
                reward = -0.1 + visit_penalty  # ordinary safe move

            if terminated:
                # The Q-row of a terminal state is never updated, so it still
                # holds its random initial values; zero it so they do not
                # leak into the bootstrap target of the update below.
                q_table[new_state, :] = 0.0

            # Only q_table[state, action] is modified by the update, so the
            # total absolute change of the table equals the change of that
            # one entry - no need to copy the whole table on every step.
            previous_q = q_table[state, action]
            update_q_table(state, action, reward, new_state)

            # NOTE(review): this declares convergence as soon as a single
            # update is small, which can happen when one (state, action) pair
            # is revisited a few times; a per-episode check may be intended.
            if abs(q_table[state, action] - previous_q) < CONVERGENCE_THRESHOLD:
                converged = True
                break

            state = new_state
            done = terminated or truncated

            # Shrink epsilon by a factor of (1 - decay rate) per step.
            # Multiplying by the decay rate itself (0.001) dropped epsilon
            # to its floor after a single step.
            exploration_rate = max(
                min_exploration_rate,
                exploration_rate * (1 - exploration_decay_rate),
            )

    training_time = time.perf_counter() - start_time  # seconds spent training

    # --------------------------- evaluation ---------------------------
    # NOTE(review): with is_slippery=False and a purely greedy policy every
    # evaluation episode follows the same trajectory, so a much smaller
    # max_test_episodes would give the same rates far faster.
    exploration_rate = 0  # act greedily with respect to the learned Q-table
    for iteration in range(max_test_episodes):
        state = env.reset()[0]
        done = False

        while not done:
            steps += 1
            action = choose_action(state)
            new_state, reward, terminated, truncated, _ = env.step(action)
            if terminated and reward == 1:
                test_successes += 1

            state = new_state
            done = terminated or truncated

    # ----------------------------- metrics ----------------------------
    # Success rate over the training episodes that were actually run
    # (training may stop before max_train_episodes).
    trainSuccessRate = train_successes / max(episode, 1)
    testSuccessRate = test_successes / max(max_test_episodes, 1)
    averageSteps = steps / max(max_test_episodes, 1)
    goal_reaches = train_successes + test_successes  # combined total, as before

    # Append everything together so the five lists always stay aligned,
    # even if the run is interrupted part-way through an iteration.
    train_time.append(training_time)
    train_successrates.append(trainSuccessRate)
    train_episodes.append(episode)
    test_successrates.append(testSuccessRate)
    test_steps.append(averageSteps)



Training Progress:  78%|███████▊  | 777/1000 [49:07<23:22,  6.29s/it]  

: 

In [None]:
# np.savetxt("q_table.csv", q_table)

In [None]:
# q_table = np.loadtxt("q_table.csv")

In [None]:
# Close the environment once all experiments have finished.
env.close()

In [None]:
# Collect the per-iteration metrics into one table (one row per iteration).
results = {
    'Train Episodes': train_episodes,
    'Train Success Rates': train_successrates,
    'Train Time': train_time,
    'Test Steps': test_steps,
    'Test Success Rates': test_successrates,
}

# pd.DataFrame raises ValueError when its columns differ in length, which
# can happen if the lists got out of step (for example after an interrupted
# run). Keep only the rows for which every metric is present.
n_complete = min(len(values) for values in results.values())
results_df = pd.DataFrame({name: values[:n_complete] for name, values in results.items()})
results_df