In [11]:
import gymnasium as gym
import numpy as np 
import matplotlib.pyplot as plt
import random


In [None]:
# Monte Carlo control where Q is the running sample average of observed returns.
env = gym.make("FrozenLake-v1", is_slippery=True, render_mode=None)

# --- Hyperparameters ---
num_episodes = 100_000   # number of training episodes
gamma = 0.99             # discount applied when accumulating the return G
epsilon_start = 1.0      # exploration probability before any decay
epsilon_end = 0.01       # floor for the exploration probability
epsilon_decay = 1e-5     # amount subtracted from epsilon per episode
stepsize_alpha = 0.1     # constant step size (not used by the sample-average update)

# --- Sizes of the discrete observation and action spaces ---
num_states = env.observation_space.n
num_actions = env.action_space.n

# --- Tabular state: action values, placeholder policy, visit counters ---
Q = np.zeros(shape=(num_states, num_actions))
Policy = np.zeros(num_states)
return_count = np.zeros(shape=(num_states, num_actions))

def epsilon_greedy_policy(state, Q, epsilon):
    """Pick an action for `state` with an epsilon-greedy rule.

    A uniform draw from [0, 1] is compared with `epsilon`: when the draw is
    smaller, an action is sampled from the module-level `env.action_space`;
    otherwise the index of the largest entry of `Q[state]` is returned
    (np.argmax returns the first index when several entries tie).
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Exploit: greedy action under the current estimates.
        return np.argmax(Q[state])
    # Explore: random action from the environment's action space.
    return env.action_space.sample()
    
# Every-visit Monte Carlo control with sample-average (1/N) updates.
# Each iteration rolls out one epsilon-greedy episode, then sweeps it backwards
# to accumulate the discounted return G and update Q for every visited pair.
for i_episode in range(1, num_episodes + 1):
    episode = []  # (state, action, reward) history of the current episode
    state, info = env.reset()
    terminated = False
    truncated = False

    # Linear epsilon decay, floored at epsilon_end.
    epsilon = max(epsilon_end, epsilon_start - epsilon_decay * i_episode)

    while not (terminated or truncated):
        action = epsilon_greedy_policy(state, Q, epsilon)
        next_state, reward, terminated, truncated, info = env.step(action)

        # Store the transition for the backward sweep below.
        episode.append((state, action, reward))

        state = next_state

    # Monte Carlo update
    G = 0  # discounted return accumulated from the end of the episode

    for t in reversed(range(len(episode))):
        state_t, action_t, reward_t = episode[t]

        # Return from step t: immediate reward plus discounted future return.
        G = reward_t + gamma * G

        # Every-visit: count each occurrence of the (state, action) pair.
        return_count[state_t][action_t] += 1.0

        # Incremental sample mean: Q <- Q + (1/N) * (G - Q).
        # (A constant step size, stepsize_alpha, could replace 1/N here.)
        current_Q = Q[state_t][action_t]
        Q[state_t][action_t] = current_Q + (1 / return_count[state_t][action_t]) * (G - current_Q)

    # Progress report every 1000 episodes.
    if i_episode % 1000 == 0:
        print(f"\nEpisode {i_episode}/{num_episodes}, Epsilon: {epsilon:.4f}")

        Policy = np.argmax(Q, axis=1)

        # Evaluate the greedy policy derived from the current Q.
        print("Evaluating learned policy...")
        total_rewards = 0
        num_eval_episodes = 100

        for _ in range(num_eval_episodes):
            state, info = env.reset()
            terminated = False
            truncated = False
            episode_reward = 0
            while not (terminated or truncated):
                # For evaluation, use a greedy policy (no exploration).
                action = Policy[state]
                # The termination flag must be bound to `terminated` (the name
                # the loop condition tests) so the rollout stops as soon as the
                # episode ends instead of stepping a finished environment
                # until it is truncated.
                next_state, reward, terminated, truncated, info = env.step(action)
                episode_reward += reward
                state = next_state

            total_rewards += episode_reward

        print(f"Average reward over {num_eval_episodes} evaluation episodes: {total_rewards / num_eval_episodes:.4f}")
        print("the best policy \n", Policy.reshape(4,4))

env.close()
# Final greedy policy extracted from the learned action values.
Policy = np.argmax(Q, axis=1)
print("the best policy: ", Policy)

    


Episode 1000/100000, Epsilon: 0.9900
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.2400
the best policy 
 [[0 1 2 0]
 [0 0 1 0]
 [3 2 1 0]
 [0 1 2 0]]

Episode 2000/100000, Epsilon: 0.9800
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.1600
the best policy 
 [[0 1 2 0]
 [0 0 1 0]
 [3 2 1 0]
 [0 1 2 0]]

Episode 3000/100000, Epsilon: 0.9700
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.0500
the best policy 
 [[2 1 2 0]
 [0 0 1 0]
 [3 2 1 0]
 [0 2 2 0]]

Episode 4000/100000, Epsilon: 0.9600
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.1600
the best policy 
 [[1 1 2 0]
 [0 0 1 0]
 [3 1 1 0]
 [0 2 2 0]]

Episode 5000/100000, Epsilon: 0.9500
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.2600
the best policy 
 [[2 1 2 0]
 [0 0 1 0]
 [3 1 0 0]
 [0 2 1 0]]

Episode 6000/100000, Epsilon: 0.9400
Evaluating learned policy...
Average reward over 100

In [23]:
# Monte Carlo control where Q is updated with a fixed step size (constant alpha).
env = gym.make("FrozenLake-v1", is_slippery=True, render_mode=None)

# --- Hyperparameters ---
num_episodes = 100_000   # number of training episodes
gamma = 0.99             # discount applied when accumulating the return G
epsilon_start = 1.0      # exploration probability before any decay
epsilon_end = 0.01       # floor for the exploration probability
epsilon_decay = 1e-5     # amount subtracted from epsilon per episode
stepsize_alpha = 0.001   # constant step size used in the Q update

# --- Sizes of the discrete observation and action spaces ---
num_states = env.observation_space.n
num_actions = env.action_space.n

# --- Tabular state: action values, placeholder policy, visit counters ---
Q = np.zeros(shape=(num_states, num_actions))
Policy = np.zeros(num_states)
return_count = np.zeros(shape=(num_states, num_actions))

def epsilon_greedy_policy(state, Q, epsilon):
    """Return an epsilon-greedy action for `state`.

    Draws a uniform number from [0, 1]; if it is below `epsilon` the action is
    sampled from the module-level `env.action_space`, otherwise the argmax of
    `Q[state]` is returned (first index on ties, per np.argmax).
    """
    should_explore = random.uniform(0, 1) < epsilon
    if should_explore:
        chosen = env.action_space.sample()  # explore
    else:
        chosen = np.argmax(Q[state])  # exploit: greedy choice
    return chosen
    
# Every-visit Monte Carlo control with a constant step size (stepsize_alpha).
# Each iteration rolls out one epsilon-greedy episode, then sweeps it backwards
# to accumulate the discounted return G and update Q for every visited pair.
for i_episode in range(1, num_episodes + 1):
    episode = []  # (state, action, reward) history of the current episode
    state, info = env.reset()
    terminated = False
    truncated = False

    # Linear epsilon decay, floored at epsilon_end.
    epsilon = max(epsilon_end, epsilon_start - epsilon_decay * i_episode)

    while not (terminated or truncated):
        action = epsilon_greedy_policy(state, Q, epsilon)
        next_state, reward, terminated, truncated, info = env.step(action)

        # Store the transition for the backward sweep below.
        episode.append((state, action, reward))

        state = next_state

    # Monte Carlo update
    G = 0  # discounted return accumulated from the end of the episode

    for t in reversed(range(len(episode))):
        state_t, action_t, reward_t = episode[t]

        # Return from step t: immediate reward plus discounted future return.
        G = reward_t + gamma * G

        # Visit counter; the constant-step update below does not read it.
        return_count[state_t][action_t] += 1.0

        # Constant-step update: Q <- Q + alpha * (G - Q).
        # (Using 1/return_count instead of alpha would give the sample average.)
        current_Q = Q[state_t][action_t]
        Q[state_t][action_t] = current_Q + (stepsize_alpha) * (G - current_Q)

    # Progress report every 1000 episodes.
    if i_episode % 1000 == 0:
        print(f"\nEpisode {i_episode}/{num_episodes}, Epsilon: {epsilon:.4f}")

        Policy = np.argmax(Q, axis=1)

        # Evaluate the greedy policy derived from the current Q.
        print("Evaluating learned policy...")
        total_rewards = 0
        num_eval_episodes = 100

        for _ in range(num_eval_episodes):
            state, info = env.reset()
            terminated = False
            truncated = False
            episode_reward = 0
            while not (terminated or truncated):
                # For evaluation, use a greedy policy (no exploration).
                action = Policy[state]
                # The termination flag must be bound to `terminated` (the name
                # the loop condition tests) so the rollout stops as soon as the
                # episode ends instead of stepping a finished environment
                # until it is truncated.
                next_state, reward, terminated, truncated, info = env.step(action)
                episode_reward += reward
                state = next_state

            total_rewards += episode_reward

        print(f"Average reward over {num_eval_episodes} evaluation episodes: {total_rewards / num_eval_episodes:.4f}")
        print("the best policy \n", Policy.reshape(4,4))

env.close()
# Final greedy policy extracted from the learned action values.
Policy = np.argmax(Q, axis=1)
print("the best policy: ", Policy)

    


Episode 1000/100000, Epsilon: 0.9900
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.1600
the best policy 
 [[0 3 0 1]
 [2 0 0 0]
 [1 1 0 0]
 [0 2 1 0]]

Episode 2000/100000, Epsilon: 0.9800
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.1600
the best policy 
 [[0 3 0 1]
 [0 0 0 0]
 [1 2 0 0]
 [0 2 3 0]]

Episode 3000/100000, Epsilon: 0.9700
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.1000
the best policy 
 [[2 3 1 1]
 [2 0 1 0]
 [3 2 0 0]
 [0 2 3 0]]

Episode 4000/100000, Epsilon: 0.9600
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.3500
the best policy 
 [[0 3 1 1]
 [0 0 1 0]
 [3 2 0 0]
 [0 2 1 0]]

Episode 5000/100000, Epsilon: 0.9500
Evaluating learned policy...
Average reward over 100 evaluation episodes: 0.2100
the best policy 
 [[2 3 1 1]
 [0 0 1 0]
 [3 2 0 0]
 [0 2 1 0]]

Episode 6000/100000, Epsilon: 0.9400
Evaluating learned policy...
Average reward over 100