In [122]:
#Imports
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import pandas as pd
import numpy as np
import random
import sklearn
import matplotlib.pyplot as plt

In [123]:
# Initialize the environment with a custom map
desc = ["SFFF", "FHHH", "FFFF", "HFHF", "FFGF"]
#env = gym.make('FrozenLake-v1', desc=desc, map_name="5x5", is_slippery=False, render_mode='human') #render mode human set map
#env = gym.make('FrozenLake-v1', desc=desc, map_name="5x5", is_slippery=False) #render mode none human for fast training
env = gym.make('FrozenLake-v1', desc = generate_random_map(size=5), is_slippery = False, render_mode = 'human') #render mode human non set map 
#env = gym.make('FrozenLake-v1', desc = generate_random_map(size=5), is_slippery = False) #render mode non human random map for training

# Initialize the Q-table
observationSpace = env.observation_space.n
actionSpace = env.action_space.n
q_table = np.random.rand(observationSpace, actionSpace) * 0.1


# Learning parameters
learning_rate = 0.8       # Alpha - how much we update our Q-value with the new information we gain
discount_factor = 0.95    # Gamma - how much importance we give to future rewards
exploration_rate = 1.0    # Epsilon - probability of choosing a random action
max_exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
max_episodes = 100

# Initialize a variable to count successful goal reaches
goal_reaches = 0




In [124]:
def choose_action(state, q_table, exploration_rate):
    if np.random.uniform(0, 1) < exploration_rate:
        action = env.action_space.sample()  # Explore: select a random action
    else:
        action = np.argmax(q_table[state, :])  # Exploit: select the action with the highest Q-value
    return action


In [125]:
def update_q_table(state, action, reward, new_state, q_table, learning_rate, discount_factor):
    # Q-Learning formula
    best_future_q = np.max(q_table[new_state, :])
    current_q = q_table[state, action]
    new_q = (1 - learning_rate) * current_q + learning_rate * (reward + discount_factor * best_future_q)
    q_table[state, action] = new_q


In [126]:
for episode in range(max_episodes):
    state = env.reset()[0]  # Extract the state identifier
    done = False

    # Reset state visits count for the new episode
    state_visits = {s: 0 for s in range(observationSpace)}

    while not done:
        action = choose_action(state, q_table, exploration_rate)
        new_state, reward, terminated, truncated, _ = env.step(action)

        # Update state visits count
        state_visits[new_state] += 1

        # Calculate penalty for visiting the same state
        visit_penalty = -0.01 * (2 ** state_visits[new_state])

        # Check if the agent stayed in the same state
        if new_state == state:
            reward = visit_penalty
        else:
            # Check for falling into the ice
            if terminated and reward == 0:
                reward = -0.5  # Penalty for falling into the ice
            elif not terminated:
                reward = 0.1  # Reward for a safe move
            reward += visit_penalty  # Add penalty for repeated visits

        # Update the Q-table
        update_q_table(state, action, reward, new_state, q_table, learning_rate, discount_factor)
        
        if terminated and reward == 1:
            goal_reaches += 1

        state = new_state
        done = terminated or truncated

        # Decay the exploration rate
        exploration_rate = max(min_exploration_rate, exploration_rate * exploration_decay_rate)

        # Print statements for debugging
        print(f"Episode: {episode}, State: {state}, Action: {action}, Reward: {reward}, New State: {new_state}, Visits: {state_visits[new_state]}")

    # Optional: Add code here to track and print progress, e.g., every 100 episodes


Episode: 0, State: 5, Action: 1, Reward: 0.08, New State: 5, Visits: 1
Episode: 0, State: 0, Action: 3, Reward: 0.08, New State: 0, Visits: 1
Episode: 0, State: 5, Action: 1, Reward: 0.060000000000000005, New State: 5, Visits: 2
Episode: 0, State: 0, Action: 3, Reward: 0.060000000000000005, New State: 0, Visits: 2
Episode: 0, State: 5, Action: 1, Reward: 0.020000000000000004, New State: 5, Visits: 3
Episode: 0, State: 0, Action: 3, Reward: 0.020000000000000004, New State: 0, Visits: 3
Episode: 0, State: 5, Action: 1, Reward: -0.06, New State: 5, Visits: 4
Episode: 0, State: 0, Action: 3, Reward: -0.06, New State: 0, Visits: 4
Episode: 0, State: 5, Action: 1, Reward: -0.22, New State: 5, Visits: 5
Episode: 0, State: 0, Action: 3, Reward: -0.22, New State: 0, Visits: 5
Episode: 0, State: 1, Action: 2, Reward: 0.08, New State: 1, Visits: 1
Episode: 0, State: 2, Action: 2, Reward: 0.08, New State: 2, Visits: 1
Episode: 0, State: 3, Action: 2, Reward: 0.08, New State: 3, Visits: 1
Episode: 

KeyboardInterrupt: 

In [127]:
env.close()