In [1]:
import numpy as np
import gym

In [2]:
frozen_lake = gym.make('FrozenLake-v0')

In [3]:
from utils.layer import softmax
from utils.policy import epsilon_greedy

In [4]:
def every_visit_monte_carlo(env, episodes, epoch, alpha, gamma):
    """
    env : the environment, e.g. FronzenLake-v0 from OpenAI Gym
    policy: policy to be evaluated
    alpha : learning rate
    gamma : discount factor
    """

    for e in range(episodes):
        nA = env.action_space.n
        nS = env.observation_space.n
        q_table = np.zeros((nS, nA))
        n_table = np.zeros(1, nS)
        state = env.reset()
        a = []
        for i in range(epoch):
            action = epsilon_greedy(softmax(q_table[state]), epsilon=0.3)
            state_next, immediate_reward, done, info = env.step(action)
            if immediate_reward == 1:
                a.append("asu")
            if not done:
                q_table[state][action] += (alpha *(immediate_reward - q_table[state][action]))
                state = state_next
                
    return a, q_table

In [5]:
monte_carlo(frozen_lake, episodes=1000, epoch=100, alpha=1, gamma=3)

([], array([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]))

In [26]:
def first_visit_monte_carlo(env, episodes, epoch, alpha, gamma):
    """
    env : the environment, e.g. FronzenLake-v0 from OpenAI Gym
    policy: policy to be evaluated
    alpha : learning rate
    gamma : discount factor
    """

    for e in range(episodes):
        nA = env.action_space.n
        nS = env.observation_space.n
        q_table = np.zeros((nS, nA))
        n_table = np.zeros((nS, ))
        state = env.reset()
        a = []
        for i in range(epoch):
            action = epsilon_greedy(softmax(q_table[state]), epsilon=0.3)
            state_next, immediate_reward, done, info = env.step(action)
            if immediate_reward == 1:
                a.append("asu")
            if not done and n_table[state]<1 :
                q_table[state][action] += (alpha *(immediate_reward - q_table[state][action]))
                state = state_next
                
    return a, q_table

In [27]:
first_visit_monte_carlo(frozen_lake, episodes=1000, epoch=100, alpha=1, gamma=2)

([], array([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]))