In [1]:
import numpy as np

# Define Frozen Lake map
desc = [
    "SFFF",
    "FHHH",
    "FFFF",
    "HFHF",
    "FFGF"
]

# Define action mappings
action_mapping = {
    0: 'LEFT',
    1: 'DOWN',
    2: 'RIGHT',
    3: 'UP'
}

# Convert the map to a numpy array
map_array = np.asarray(desc, dtype='c')

# Define Q-learning parameters
gamma = 0.99  # Discount factor
alpha = 0.1   # Learning rate
epsilon = 0.1 # Exploration rate

# Initialize Q-table
num_states = map_array.size
num_actions = len(action_mapping)
Q_table = np.zeros((num_states, num_actions))

# Helper function to get the state index from the map
def get_state_index(map_array):
    return np.ravel_multi_index(np.where(map_array == b'S'), map_array.shape)

# Helper function to choose action using epsilon-greedy policy
def choose_action(state_index):
    if np.random.rand() < epsilon:
        return np.random.choice(num_actions)
    else:
        return np.argmax(Q_table[state_index])

# Helper function to update Q-values
def update_Q_value(state_index, action_index, reward, next_state_index):
    max_next_Q = np.max(Q_table[next_state_index])
    Q_table[state_index, action_index] += alpha * (reward + gamma * max_next_Q - Q_table[state_index, action_index])

# Main Q-learning loop
num_episodes = 1000
for episode in range(num_episodes):
    state_index = get_state_index(map_array)
    done = False
    total_reward = 0

    while not done:
        action_index = choose_action(state_index)
        action = action_mapping[action_index]

        # Simulate action
        if action == 'LEFT':
            next_state_index = state_index - 1
        elif action == 'DOWN':
            next_state_index = state_index + map_array.shape[1]
        elif action == 'RIGHT':
            next_state_index = state_index + 1
        elif action == 'UP':
            next_state_index = state_index - map_array.shape[1]

        # Retrieve reward
        reward = 0
        if map_array.ravel()[next_state_index] == b'F':
            reward = 0
        elif map_array.ravel()[next_state_index] == b'H':
            reward = -1
        elif map_array.ravel()[next_state_index] == b'G':
            reward = 1

        update_Q_value(state_index, action_index, reward, next_state_index)

        total_reward += reward
        state_index = next_state_index

        if map_array.ravel()[state_index] in [b'H', b'G']:
            done = True

    if episode % 100 == 0:
        print("Episode:", episode, "Total Reward:", total_reward)
        print("Q-table:")
        print(Q_table)

print("Q-learning training complete.")
print("Final Q-table:")
print(Q_table)


Episode: 0 Total Reward: 1
Q-table:
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.1 0.  0.  0. ]]
Episode: 100 Total Reward: 1
Q-table:
[[ 9.89664547e-01  0.00000000e+00  4.26250838e-02  1.37325947e-02]
 [ 4.26387749e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 7.22412761e-02 -1.00000000e-01  0.00000000e+00  0.00000000e+00]
 [ 8.36894582e-03  0.00000000e+00  3.99110279e-05  0.00000000e+00]
 [ 7.65969222e-04  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.0000