In [30]:
#import required libraries
import gymnasium as gym
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt

# Seed value for the notebook; it is passed to env.reset(seed=SEED) when the
# preview environment is created in the next cell.
SEED = 106

In [31]:
# Build a text-rendered ("ansi") slippery 4x4 FrozenLake, seed its first reset,
# and print the starting grid.
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True, render_mode='ansi')
env.reset(seed=SEED)
print(env.render())

In [32]:
class QLearning:
    """
    Tabular Q-learning agent for environments with discrete observation and
    action spaces (both must expose ``.n``).

    Q-values are kept in a dict keyed by ``(state, action)`` tuples.
    """

    def __init__(self, env, alpha = 0.01, gamma = 0.9, epsilon=0.9):
        """
        Initialize the Q-learning agent.
        
        :param env: The environment to learn from.
        :type env: gym.Env
        :param alpha: The learning rate.
        :type alpha: float
        :param gamma: The discount factor.
        :type gamma: float
        :param epsilon: The exploration rate for the epsilon-greedy policy.
        :type epsilon: float
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_values = self.init_q_values()
        
    def init_q_values(self):
        """
        Initialize the Q-values to zero for all state-action pairs.
        
        :return: A dictionary with (state, action) tuples as keys and Q-values as values.
        :rtype: dict
        """
        return {
            (state, action): 0.0
            for state in range(self.env.observation_space.n)
            for action in range(self.env.action_space.n)
        }
    
    def epsilon_greedy(self, state):
        """
        Select an action using the epsilon-greedy policy.

        With probability ``epsilon`` a random action is sampled from the
        action space; otherwise the action with the highest Q-value in
        ``state`` is returned (ties resolve to the lowest action index).
        
        :param state: The current state.
        :type state: int
        :return: The selected action.
        :rtype: int
        """
        if random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        return max(
            range(self.env.action_space.n), key=lambda x: self.q_values[(state, x)]
        )
    
    def compute_policy(self, num_of_timesteps, num_of_episodes):
        """
        Learn the Q-values by running Q-learning for a number of episodes.

        Each step applies the off-policy update
        ``Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))``.
        An episode ends when the environment reports ``terminated`` or
        ``truncated``, or after ``num_of_timesteps`` steps.
        
        :param num_of_timesteps: The maximum number of timesteps in each episode.
        :type num_of_timesteps: int
        :param num_of_episodes: The number of episodes to run.
        :type num_of_episodes: int
        :return: DataFrame with one row per episode (index = episode number)
            holding a snapshot of every Q-value at the end of that episode;
            columns are a MultiIndex of (state, action). Empty if no episode
            is run.
        :rtype: pd.DataFrame
        """
        if num_of_episodes <= 0:
            return pd.DataFrame()

        # Use the agent's own environment rather than a notebook-level global.
        n_actions = self.env.action_space.n

        # Keys are never added after init, so the dict order is a stable column order.
        columns = list(self.q_values.keys())
        # Pre-allocate the history instead of concatenating a DataFrame per
        # episode, which would copy all previous rows every time (quadratic).
        history = np.empty((num_of_episodes, len(columns)), dtype=float)

        for episode in range(num_of_episodes):
            state, _info = self.env.reset()
            
            for _step in range(num_of_timesteps):
                # Behaviour policy: epsilon-greedy on the current Q-values
                action = self.epsilon_greedy(state)
                
                # Take the action and observe the next state and reward
                next_state, reward, terminated, truncated, _info = self.env.step(action)
                
                # Target policy: greedy value of the next state. A terminal
                # state has no future return, so it contributes nothing.
                if terminated:
                    best_next = 0.0
                else:
                    best_next = max(
                        self.q_values[(next_state, a)] for a in range(n_actions)
                    )
                
                # Update Q-value towards the TD target
                td_target = reward + self.gamma * best_next
                self.q_values[(state, action)] += self.alpha * (
                    td_target - self.q_values[(state, action)]
                )
                
                # Move to the next state
                state = next_state
                
                # Stop on a terminal state or when the environment's own
                # time limit was hit; stepping further would be invalid.
                if terminated or truncated:
                    break
            
            # Snapshot the Q-table at the end of this episode
            history[episode, :] = list(self.q_values.values())
        
        return pd.DataFrame(
            history,
            index=np.arange(num_of_episodes),
            columns=pd.MultiIndex.from_tuples(columns),
        )

In [33]:
# Hyperparameters for training
alpha = 0.03             # learning rate
gamma = 0.9              # discount factor
epsilon = 0.5            # exploration rate of the epsilon-greedy policy
num_of_episodes = 100000
num_of_timesteps = 100   # maximum steps per episode

# Seed Python's RNG: QLearning.epsilon_greedy draws from random.uniform
random.seed(SEED)

# Initialize the Frozen Lake Environment
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True)
env.action_space.seed(SEED)  # makes action_space.sample() reproducible
env.reset(seed=SEED)         # seeds the environment's own RNG

# Create the Q-learning model
model = QLearning(env, alpha=alpha, gamma=gamma, epsilon=epsilon)

# Compute policy and store the per-episode Q-value snapshots
q_values_df = model.compute_policy(num_of_timesteps, num_of_episodes)

In [None]:
# Mean Q-value across all state-action columns, one value per episode row
avg_q_values = q_values_df.mean(axis=1)

# Plot the average Q-values using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(avg_q_values.index, avg_q_values.values, color='r', label='Average Q-value')
ax.set_xlabel('Episode')
ax.set_ylabel('Q-value')
ax.set_title('Average Q-value Evolution Over Episodes through Q-Learning')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Greedy policy: for every state, the action with the highest learned Q-value
policy = np.array(
    [
        np.argmax([model.q_values[(state, a)] for a in range(env.action_space.n)])
        for state in range(env.observation_space.n)
    ],
    dtype=int,
)

print("Learned Policy:")
# One row per grid row of FrozenLake's 4x4 map
print(policy.reshape((4, 4)))

In [None]:
# Dump every learned Q-value, state by state and action by action
print("Learned Q-values:")
state_action_pairs = (
    (state, action)
    for state in range(env.observation_space.n)
    for action in range(env.action_space.n)
)
for state, action in state_action_pairs:
    print(f"Q[{state}, {action}] = {model.q_values[(state, action)]:.2f}")

In [None]:
# Keep the learned policy flat (one action per state index). Reshaping it to
# 4x4 would make policy[state] return a whole row instead of a single action
# when indexed by an integer state.
policy = np.asarray(policy).reshape(-1)

# Evaluation environment that renders RGB frames; map and slipperiness are
# spelled out so they visibly match the training environment.
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True, render_mode='rgb_array')

# Function to run an episode
def run_episode(env, policy):
    """
    Roll out one episode following ``policy`` and collect rendered frames.

    :param env: Environment exposing ``reset()``, ``step(action)`` and ``render()``.
    :param policy: Action per state. It may be flat or grid-shaped (e.g. 4x4);
        it is flattened so that an integer state indexes a single action.
    :return: Rendered frames, starting with the initial state and including
        the final state reached when the episode ends.
    :rtype: list
    """
    # Flatten so a 2-D policy indexed by a scalar state yields one action,
    # not an entire row.
    actions = np.asarray(policy).reshape(-1)

    state, _info = env.reset()
    frames = [env.render()]
    finished = False
    
    while not finished:
        action = int(actions[state])
        state, _reward, terminated, truncated, _info = env.step(action)
        frames.append(env.render())
        # Stop on a terminal state or when the environment's time limit is hit.
        finished = terminated or truncated
    
    return frames

from matplotlib.animation import FuncAnimation

# Roll out one episode with the learned policy and keep its rendered frames
frames = run_episode(env, policy)

# Show the first frame; the animation callback swaps in the later ones
fig, ax = plt.subplots()
img = ax.imshow(frames[0])


def animate(frame_idx):
    """Put frame ``frame_idx`` into the image artist and return it for blitting."""
    img.set_data(frames[frame_idx])
    return (img,)


# One animation step per collected frame, 500 ms apart
anim = FuncAnimation(fig, animate, frames=len(frames), interval=500, blit=True)
plt.show()

env.close()