In [20]:
import gym
import torch
# Build the FrozenLake environment and record the sizes of its discrete
# state and action spaces (printed one per line).
env = gym.make("FrozenLake-v1", render_mode="human")
n_state, n_action = env.observation_space.n, env.action_space.n
print(n_state, n_action, sep="\n")

16
4


In [21]:
# Begin a new episode; the value displayed for this cell is (initial state, info dict).
env.reset()

(0, {'prob': 1})

In [22]:
env.render() # draw the environment's current state

In [24]:
# Apply action 1 once. step() hands back five values:
# (new state, reward, done flag, truncation flag, info dict).
new_state, reward, is_done, truncated, info = env.step(1)
env.render()

In [25]:
# Show the outcome of the step above, one value per line.
print(new_state, reward, is_done, info, sep="\n")

1
0.0
False
{'prob': 0.3333333333333333}


In [37]:
def run_episode(env, policy):
    """Roll out a single episode under a fixed tabular policy.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Object indexable by state such that ``policy[state].item()``
            yields the action to take (e.g. a 1-D integer tensor with one
            entry per state).

    Returns:
        The sum of the rewards collected until the episode either terminates
        or is truncated.
    """
    state = env.reset()[0]
    total_reward = 0
    is_done = False
    truncated = False
    # Stop on truncation as well as termination: an episode cut short (for
    # example by a step limit) never sets the terminated flag, so checking
    # only `is_done` would keep stepping a finished episode indefinitely.
    while not (is_done or truncated):
        action = policy[state].item()
        state, reward, is_done, truncated, _ = env.step(action)
        total_reward += reward
    return total_reward

In [39]:
# Baseline: for each episode draw a brand-new random policy (one discrete
# action per state) and run a single episode with it.
n_episode = 10
total_rewards = [
    run_episode(env, torch.randint(high=n_action, size=(n_state,)))
    for _ in range(n_episode)
]

average_reward = sum(total_rewards) / n_episode
print('Average total reward under random policy: {}'.format(average_reward))


Average total reward under random policy: 0.0


In [42]:
# Random search: keep sampling policies until one episode reaches the goal
# (total reward of 1). `i` ends up as the index of the successful trial.
i = -1
best_policy = None
while best_policy is None:
    i += 1
    print(f"{i} .. ",end = '')
    random_policy = torch.randint(high=n_action, size=(n_state,))
    total_reward = run_episode(env, random_policy)
    if total_reward == 1:
        best_policy = random_policy

0 .. 1 .. 2 .. 3 .. 4 .. 5 .. 6 .. 7 .. 8 .. 9 .. 10 .. 11 .. 12 .. 13 .. 14 .. 15 .. 16 .. 17 .. 18 .. 19 .. 20 .. 21 .. 22 .. 23 .. 24 .. 25 .. 26 .. 27 .. 28 .. 29 .. 30 .. 31 .. 32 .. 33 .. 34 .. 35 .. 36 .. 37 .. 38 .. 39 .. 40 .. 41 .. 42 .. 43 .. 44 .. 45 .. 46 .. 

Even though this policy reached the goal on trial 46 (counting from 0), that does not mean it will always succeed. \
For example, see the evaluation below. \
This happens because, although the policy picks its action deterministically, the move that is actually carried out may be a different one, because the ice is slippery. \
Direct quote - "in FrozenLake, the movement direction is only partially dependent on the chosen action."

In [43]:
# Re-run the policy found by random search for n_episode episodes and
# report its average return.
total_rewards = [run_episode(env, best_policy) for _ in range(n_episode)]

average_reward = sum(total_rewards) / n_episode
print('Average total reward under random search policy: {}'.format(average_reward))

Average total reward under random search policy: 0.0


In [44]:
# Transition table for state 6, laid out as
# {action: [(probability, next_state, reward, is_done), ...]}.
# `unwrapped` reaches the base environment no matter how many wrappers
# gym.make() put around it, instead of hard-coding one `.env` hop.
print(env.unwrapped.P[6])

{0: [(0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 10, 0.0, False)], 1: [(0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 10, 0.0, False), (0.3333333333333333, 7, 0.0, True)], 2: [(0.3333333333333333, 10, 0.0, False), (0.3333333333333333, 7, 0.0, True), (0.3333333333333333, 2, 0.0, False)], 3: [(0.3333333333333333, 7, 0.0, True), (0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 5, 0.0, True)]}


Key : the action, one of [0, 1, 2, 3] \
Value : a list of the possible outcomes of taking that action. \
For eg. 0 : [(0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 10, 0.0, False)] \
    Each tuple is (transition probability, new state, reward, is done). \
This means that if our algorithm chooses action 0 while the agent is in state '6', the environment does not move deterministically: the agent ends up in state 2, 5 or 10, each with probability 1/3. Landing in state 5 ends the episode (is done = True) with zero reward. Also, these transitions apply only when the agent is in state '6'.

In [46]:
# Contrast with state 5: it is a terminal state, so for every action the
# table has a single entry that keeps the agent in state 5 with
# probability 1.0, reward 0 and is_done=True.
# `unwrapped` reaches the base environment regardless of wrapper depth.
print(env.unwrapped.P[5])

{0: [(1.0, 5, 0, True)], 1: [(1.0, 5, 0, True)], 2: [(1.0, 5, 0, True)], 3: [(1.0, 5, 0, True)]}
