In [17]:
import gymnasium as gym
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt

# set seed
SEED = 106

random.seed(SEED)

In [12]:
env = gym.make(
    'FrozenLake-v1', map_name="4x4", is_slippery=True, render_mode = 'ansi'
)
env.reset(seed=SEED)
print(env.render())


[41mS[0mFFF
FHFH
FFFH
HFFG



In [43]:
class DoubleQLearning:
    def __init__(self, env, alpha = 0.1, gamma=0.9, epsilon=0.9):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q1_values = self.init_q_values()
        self.q2_values = self.init_q_values()
        
    def init_q_values(self):
        return {
            (state, action): 0.0 for state in range(self.env.observation_space.n) for action in range(self.env.action_space.n)
        }
        
    def epsilon_greedy(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        else:
            return max(
                range(self.env.action_space.n), key= lambda x: self.q1_values[(state, x)] + self.q2_values[(state, x)]
            )
    
    def compute_policy(self, num_of_timesteps, num_of_episodes):
        # Initialize placeholder for rewards
        rewards_per_episode = []
        
        for _ in range(num_of_episodes):
            state, _ = self.env.reset()
            episode_reward = 0
            
            for t in range(num_of_episodes):
                action = self.epsilon_greedy(state)
                
                next_state, reward, done, _, _ = self.env.step(action)
                episode_reward += reward
                
                if random.uniform(0, 1) < 0.5:
                    next_action = max(range(self.env.action_space.n), key=lambda x: self.q1_values[(next_state, x)])
                    self.q1_values[(state, action)] += self.alpha * (reward + self.gamma * self.q2_values[next_state, next_action] - self.q1_values[(state, action)])
                else:
                    next_action = max(range(self.env.action_space.n), key=lambda x: self.q2_values[(next_state, x)])
                    self.q2_values[(state, action)] += self.alpha * (reward + self.gamma * self.q1_values[next_state, next_action] - self.q2_values[(state, action)])
                
                # Move to the next state
                state = next_state
                
                if done:
                    break
            
        rewards_per_episode.append(episode_reward)
    
    def execute_policy(self, max_steps=100):
        state, _ = self.env.reset()
        self.env.render()
        total_reward = 0
        
        for _ in range(max_steps):
            action = max(range(self.env.action_space.n), key=lambda x: self.q1_values[(state, x)] + self.q2_values[(state, x)])
            next_state, reward, done, _, _ = self.env.step(action)
            total_reward += reward
            self.env.render()
            state = next_state
            
            if done:
                break
        
        print(f"Total reward: {total_reward}")
        self.env.close()

In [44]:
# Parameters
alpha = 0.03
gamma = 0.9
epsilon = 0.5
num_of_episodes = 100000
num_of_timesteps = 100

# Initialize the Frozen Lake Environment
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True)
env.reset()

# Create SARSA model
model = DoubleQLearning(env, alpha=alpha, gamma=gamma, epsilon=epsilon)

# Compute policy and store Q-values
rewards_per_episode = model.compute_policy(num_of_timesteps, num_of_episodes)

model.execute_policy()

Total reward: 0.0


In [48]:
# Display the learned policy
policy = np.zeros(env.observation_space.n, dtype=int)
for state in range(env.observation_space.n):
    policy[state] = np.argmax([model.q1_values[(state, a)] for a in range(env.action_space.n)])

print("Learned Policy:")
print(policy.reshape((4, 4)))  # Reshape according to FrozenLake's 4x4 grid

Learned Policy:
[[0 3 1 3]
 [0 0 1 0]
 [1 1 0 0]
 [0 2 1 0]]
