<a href="https://colab.research.google.com/github/aksevenli/Pytorch-Reinforcement-Learning/blob/master/frozenlake_simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FrozenLake Environment:


*   S: The starting location
*   G: The goal location, which terminates an episode
*   F: The frozen tile, which is a walkable location
*   H: The hole location, which terminates an episode

Four actions:


*   moving left    (0)
*   moving down    (1)
*   moving right   (2)
*   moving up      (3)

Reward: +1 for reaching the goal location, 0 otherwise

In [0]:
import gym
import torch
import pyvirtualdisplay

# Create the FrozenLake environment and cache the sizes of its discrete spaces
env = gym.make("FrozenLake-v0")
n_state, n_action = env.observation_space.n, env.action_space.n

In [31]:
# Start a new episode; the value displayed for this cell is whatever reset() returns
# (run_episode treats that return value as the initial state).
env.reset()

0

In [32]:
# Print the current grid; the highlighted tile presumably marks the agent's position
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [35]:
# Apply action 2 (moving right, per the action list above) once, then render the resulting grid
new_state, reward, is_done, info = env.step(2)
env.render()

  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [0]:
# Define a function that simulates a FrozenLake episode given a policy and returns the total reward
def run_episode(env, policy):
    """Play a single episode with a fixed policy and return the summed reward.

    Args:
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(state, reward, is_done, info)``.
        policy: Indexable by state; each entry must expose ``.item()`` yielding
            the action to take in that state (e.g. a 1-D integer torch tensor).

    Returns:
        The sum of the rewards collected until ``step`` reports the episode
        is done.
    """
    state = env.reset()
    total_reward = 0
    is_done = False
    # The loop condition is the only termination check that is needed.
    while not is_done:
        action = policy[state].item()
        # The fourth element returned by step() (info) is not used here.
        state, reward, is_done, _ = env.step(action)
        total_reward += reward
    return total_reward

In [28]:
# Baseline: draw a brand-new random policy for every episode and record its return
n_episode = 1000
total_rewards = []
for episode in range(n_episode):
    # One random action per state, resampled each episode
    random_policy = torch.randint(high=n_action, size=(n_state, ))
    total_rewards.append(run_episode(env, random_policy))

average_reward = sum(total_rewards) / n_episode
print('Average total reward under random policy: {}'.format(average_reward))

Average total reward under random policy: 0.023


In [29]:
# Random search: keep sampling random policies until one earns a total reward of 1
# in a single episode. Note there is no cap on the number of attempts.
best_policy = None
while best_policy is None:
    random_policy = torch.randint(high=n_action, size=(n_state, ))
    total_reward = run_episode(env, random_policy)
    if total_reward == 1:
        best_policy = random_policy

print(best_policy)

tensor([0, 3, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3, 2, 1, 2])


In [30]:
# Re-evaluate the policy found by random search over n_episode fresh episodes
total_rewards = [run_episode(env, best_policy) for _ in range(n_episode)]

average_reward = sum(total_rewards) / n_episode
print('Average total reward under random search policy: {}'.format(average_reward))

Average total reward under random search policy: 0.601
