In [8]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

In [9]:
# Experiment sizing: how many independent runs, and how long each one may take.
num_training_sessions = 1_000           # independent training sessions
max_episodes = 100                      # episode budget within one training session
num_test_episodes_per_session = 10_000  # evaluation episodes after each session

# Define the testing function
def test_agent(Q, num_test_episodes):
    total_successes = 0
    total_steps = 0
    for i in range(num_test_episodes):
        state = env.reset()[0]
        done = False
        steps = 0
        while not done:
            action = np.argmax(Q[state, :])
            state, reward, terminated, truncated, _ = env.step(action)
            steps += 1
            done = terminated or truncated
            if done and reward == 1:
                total_successes += 1
        total_steps += steps
    average_steps = total_steps / num_test_episodes
    return total_successes, average_steps

In [10]:
def has_converged(old_q_table, new_q_table, threshold=0.00025):
    """Return whether every Q-value moved by strictly less than ``threshold``.

    Args:
        old_q_table: Q-table snapshot taken before the update(s).
        new_q_table: Q-table after the update(s).
        threshold: Exclusive upper bound on the absolute element-wise change.

    Returns:
        True when all element-wise absolute differences are below ``threshold``.
    """
    within_tolerance = np.abs(old_q_table - new_q_table) < threshold
    return np.all(within_tolerance)

In [11]:
# Initialize environment and parameters
# Custom layout of 5 rows x 4 columns (S = start, F = frozen, H = hole, G = goal).
desc = ["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]
# No map_name is passed: an explicit `desc` fully defines the grid, and the
# built-in map names only cover the predefined square layouts.
env = gym.make('FrozenLake-v1', desc=desc, is_slippery=False)

# Sizes of the discrete state and action spaces; they fix the Q-table shape.
observationSpace = env.observation_space.n
actionSpace = env.action_space.n
# Small random initial Q-values; each training session draws its own fresh table.
q_table = np.random.rand(observationSpace, actionSpace) * 0.1

# Q-learning hyper-parameters
learning_rate = 0.5
discount_factor = 0.95
exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
# max_episodes is defined once, alongside the other experiment-size settings,
# so there is a single value to edit.

In [12]:
# Define action choice function
def choose_action(state, q_table, exploration_rate):
    """Pick an action epsilon-greedily.

    Args:
        state: Index of the current state (row of ``q_table``).
        q_table: Q-table indexed as ``q_table[state, action]``.
        exploration_rate: Probability of sampling a random action from the
            module-level ``env`` instead of acting greedily.

    Returns:
        The sampled action when exploring, otherwise the index of the largest
        Q-value in the row for ``state``.
    """
    explore = np.random.uniform(0, 1) < exploration_rate
    if explore:
        return env.action_space.sample()
    return np.argmax(q_table[state, :])

# Define Q-table update function
def update_q_table(state, action, reward, new_state, q_table, learning_rate, discount_factor):
    """Apply one Q-learning update to ``q_table`` in place.

    The entry ``q_table[state, action]`` becomes a blend of its old value and
    the target ``reward + discount_factor * max(q_table[new_state, :])``,
    weighted by ``1 - learning_rate`` and ``learning_rate`` respectively.

    Args:
        state: State the action was taken from.
        action: Action that was taken.
        reward: Reward credited for the transition.
        new_state: State reached after the action.
        q_table: Q-table indexed as ``q_table[state, action]``; modified in place.
        learning_rate: Weight given to the new target.
        discount_factor: Discount applied to the best value of ``new_state``.
    """
    target = reward + discount_factor * np.max(q_table[new_state, :])
    q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * target

In [13]:
# Training and Testing Sessions
# Each session trains a fresh Q-table for up to `max_episodes` episodes (stopping
# early once an episode changes no Q-value by the convergence threshold), then
# evaluates the greedy policy. One row per session is written to a CSV file.
training_times = []
convergence_episodes = []  # 0-based index of the converging episode, None if the budget ran out
training_successes = []  # NOTE(review): never filled in this cell; kept in case other cells read it
testing_successes = []
testing_average_steps = []

# Every session must start from the configured exploration rate. Working on a
# session-local copy leaves the global untouched, so later sessions (and
# re-runs of this cell) do not inherit an already-decayed value.
initial_exploration_rate = exploration_rate

for training_session in tqdm(range(num_training_sessions), desc='Training Sessions'):
    q_table = np.random.rand(observationSpace, actionSpace) * 0.1
    previous_q_table = np.copy(q_table)
    session_exploration_rate = initial_exploration_rate
    # perf_counter is monotonic, so the measured duration is immune to
    # system clock adjustments.
    session_start_time = time.perf_counter()

    for episode in range(max_episodes):
        state = env.reset()[0]
        done = False
        state_visits = {s: 0 for s in range(observationSpace)}  # Track state visits

        while not done:
            action = choose_action(state, q_table, session_exploration_rate)
            new_state, reward, terminated, truncated, _ = env.step(action)

            # Update state visits count
            state_visits[new_state] += 1

            # Penalty doubles with every repeat visit to the same state within
            # this episode.
            visit_penalty = -0.01 * (2 ** state_visits[new_state])

            # Assign rewards and penalties
            if new_state == state:
                # The move did not change the state: only the penalty applies.
                reward = visit_penalty
            else:
                if terminated and reward == 0:  # Falling into the ice
                    reward = -0.75
                elif not terminated:
                    reward = 0.0  # Reward for safe move
                # A terminal step with non-zero reward keeps the environment's reward.
                reward += visit_penalty

            update_q_table(state, action, reward, new_state, q_table, learning_rate, discount_factor)

            state = new_state
            done = terminated or truncated

        # `exploration_decay_rate` is the fraction removed per episode, so the
        # rate shrinks by the factor (1 - rate). Multiplying by the rate itself
        # would drop exploration to the floor after a single episode.
        session_exploration_rate = max(
            min_exploration_rate,
            session_exploration_rate * (1 - exploration_decay_rate),
        )

        if has_converged(previous_q_table, q_table):
            convergence_episodes.append(episode)
            break
        previous_q_table = np.copy(q_table)
    else:
        # Budget exhausted without convergence: still record an entry so every
        # per-session list has one value per session and the DataFrame below
        # can be built.
        convergence_episodes.append(None)

    training_times.append(time.perf_counter() - session_start_time)

    test_success, average_steps = test_agent(q_table, num_test_episodes_per_session)
    testing_successes.append(test_success)
    testing_average_steps.append(average_steps)

# Save results to a DataFrame and CSV
stats_df = pd.DataFrame({
    'Training Session': range(1, num_training_sessions + 1),
    'Time to Convergence (seconds)': training_times,
    # Nullable integer dtype: converged sessions stay integers, sessions that
    # never converged are written as empty cells.
    'Episodes to Convergence': pd.array(convergence_episodes, dtype="Int64"),
    'Successes (Testing)': testing_successes,
    'Average Steps (Testing)': testing_average_steps
})
csv_filename = 'training_testing_statistics.csv'
stats_df.to_csv(csv_filename, index=False)
print(f"Statistics saved to {csv_filename}")


Training Sessions: 100%|██████████| 1000/1000 [12:34<00:00,  1.33it/s]

Statistics saved to training_testing_statistics.csv





In [14]:
# Close the environment now that the training and testing cells have run.
env.close()