In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# FrozenLake with slipping disabled (is_slippery=False).
env = gym.make('FrozenLake-v1', is_slippery=False)
# Same environment id with slipping enabled (is_slippery=True); kept as a
# separate instance so both variants can be trained and tested side by side.
slippery_env = gym.make('FrozenLake-v1', is_slippery=True)

## Training

`epsilon`: probability of taking a uniformly random action (exploration)  
`1 - epsilon`: probability of taking the action chosen by the current policy (exploitation)

In [3]:
def play_game(env, policy, epsilon):
    """Roll out one episode, following `policy` epsilon-greedily.

    At every step a uniform random number is drawn; if it is below
    `epsilon` a random action index in ``range(env.action_space.n)`` is
    taken, otherwise the action is ``policy[state]``.

    Parameters
    ----------
    env : object exposing ``reset()``, ``step(action)`` returning a
        4-tuple ``(state, reward, done, info)``, and ``action_space.n``.
    policy : indexable mapping a state to an action.
    epsilon : float, probability of picking a random action.

    Returns
    -------
    list of ``[state, action, reward]`` lists, one per step, in the order
    the steps were taken.
    """
    trajectory = []
    current = env.reset()
    finished = False

    while not finished:
        # The exploration draw happens on every step; the random action is
        # only sampled when that draw selects exploration.
        explore = np.random.random() < epsilon
        chosen = np.random.choice(env.action_space.n) if explore else policy[current]

        successor, reward, finished, _ = env.step(chosen)

        # Record the state the action was taken *from*, not the successor.
        trajectory.append([current, chosen, reward])
        current = successor

    return trajectory

In [4]:
def mc_control(env, epochs, epsilon=0.25):
    """Learn a tabular policy with every-visit, undiscounted Monte Carlo control.

    Each epoch plays one episode with `play_game` using the current policy
    (epsilon-greedy), then walks the episode backwards accumulating the
    return. For every visited (state, action) pair the return is folded into
    that pair's average, stored in ``Q``; the policy for the state is then
    set to ``argmax(Q[state])``, but only once some action in that state has
    a positive estimate.

    Progress is printed ten times over the run (every ``epochs // 10``
    epochs, or every epoch when ``epochs < 10``) as the mean episode return
    since the previous report.

    Parameters
    ----------
    env : environment exposing ``observation_space.n`` and
        ``action_space.n``, and usable with `play_game`.
    epochs : int, number of training episodes.
    epsilon : float, exploration probability forwarded to `play_game`.

    Returns
    -------
    numpy.ndarray of length ``observation_space.n`` holding one action index
    per state.
    """
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    Q = np.zeros((num_states, num_actions))
    # Running sum and count of observed returns per (state, action). Q is
    # their ratio, which equals the mean of all returns seen so far without
    # re-averaging an ever-growing list on every visit.
    return_sums = np.zeros((num_states, num_actions))
    visit_counts = np.zeros((num_states, num_actions), dtype=int)
    policy = np.random.choice(num_actions, size=num_states)
    # Clamp to 1 so that epochs < 10 does not produce a modulo-by-zero.
    report_every = max(1, epochs // 10)
    r_list = []

    for ep in range(epochs):  # training episodes
        # generate episode using policy
        sar_list = play_game(env, policy, epsilon)

        # adjust policy accordingly; iterate backwards so reward_sum is the
        # return from each step to the end of the episode
        reward_sum = 0
        for state, action, reward in reversed(sar_list):
            reward_sum += reward
            return_sums[state, action] += reward_sum
            visit_counts[state, action] += 1
            Q[state, action] = return_sums[state, action] / visit_counts[state, action]
            # Keep the initial random action until some action in this state
            # has shown a positive return.
            if Q[state].max() > 0:
                policy[state] = np.argmax(Q[state])

        # After the backward pass reward_sum is the whole episode's return.
        r_list.append(reward_sum)
        if (ep + 1) % report_every == 0:
            print(f"Epoch {ep+1}, Mean reward: {np.mean(r_list)}")
            r_list = []

    return policy

In [10]:
# Train on the non-slippery environment for 10000 episodes with 25% exploration.
policy = mc_control(env, 10000, epsilon=0.25)
# Display the learned action index for each state.
policy

Epoch 1000, Mean reward: 0.533
Epoch 2000, Mean reward: 0.726
Epoch 3000, Mean reward: 0.751
Epoch 4000, Mean reward: 0.708
Epoch 5000, Mean reward: 0.729
Epoch 6000, Mean reward: 0.726
Epoch 7000, Mean reward: 0.742
Epoch 8000, Mean reward: 0.722
Epoch 9000, Mean reward: 0.733
Epoch 10000, Mean reward: 0.72


array([1, 0, 1, 0, 1, 3, 1, 0, 2, 2, 1, 3, 0, 2, 2, 0])

In [6]:
# Train on the slippery environment for 10000 episodes, with a lower
# exploration rate (15%) than the non-slippery run.
slippery_policy = mc_control(slippery_env, 10000, epsilon=0.15)
# Display the learned action index for each state.
slippery_policy

Epoch 1000, Mean reward: 0.09
Epoch 2000, Mean reward: 0.114
Epoch 3000, Mean reward: 0.111
Epoch 4000, Mean reward: 0.12
Epoch 5000, Mean reward: 0.159
Epoch 6000, Mean reward: 0.26
Epoch 7000, Mean reward: 0.305
Epoch 8000, Mean reward: 0.315
Epoch 9000, Mean reward: 0.308
Epoch 10000, Mean reward: 0.316


array([0, 3, 0, 0, 0, 0, 2, 3, 3, 1, 0, 0, 2, 2, 1, 1])

## Testing

In [7]:
def test_policy(policy, env):
    done = False
    state = env.reset()
    while not done:
        state, reward, done, _ = env.step(policy[state])
        env.render()

    if reward:
        print("Reached GOAL!!!")
    else:
        print("Fell into hole ):")

In [11]:
# Single greedy rollout of the policy trained on the non-slippery environment.
test_policy(policy, env)

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Reached GOAL!!!


In [9]:
# Single greedy rollout of the policy trained on the slippery environment.
# NOTE(review): this is one episode only; presumably outcomes vary between
# runs when slipping is enabled — confirm by averaging over many rollouts.
test_policy(slippery_policy, slippery_env)

  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Reached GOAL!!!
