In [23]:
import gym
import numpy as np
import time
import matplotlib.pyplot as plt

# Create FrozenLake environment
env = gym.make('FrozenLake-v0')

STATES = env.observation_space.n
ACTIONS = env.action_space.n
# Explore environment
print(STATES)
print(ACTIONS)

16
4


In [24]:
env.reset()
for _ in range(5):
    env.render()
    action = env.step(env.action_space.sample()) # take a random action
    # F: frozen, H: hole, S: start, G: goal
    print("New state: {}, Reward: {}, Done: {}, Info: {}".format(
        action[0], action[1], action[2], action[3]))
env.close()


[41mS[0mFFF
FHFH
FFFH
HFFG
New state: 0, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
New state: 4, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
New state: 0, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
New state: 4, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
New state: 0, Reward: 0.0, Done: False, Info: {'prob': 0.3333333333333333}


In [33]:
Q = np.zeros((STATES, ACTIONS))  # create a matrix with all 0 values 
print(Q)

# Define hyperparameters
episodes = 2
max_steps = 100
learning_rate = 0.8
gamma = 0.95

# Define exploration parameters
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.001

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [36]:
rewards = []

for episode in range(episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        if np.random.uniform(0, 1) > epsilon:
            action = np.argmax(Q[state,:])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)
        
        print(f"new_state: {new_state}, reward: {reward}, done: {done}, info: {info}")

        Q[state, action] = Q[state, action] + learning_rate * (reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
        total_rewards = total_rewards + reward
        state = new_state
        
        # If done finish episode
        if done == True:
            break
        
    episode += 1
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/episodes))
print(Q)
print(epsilon)

new_state: 4, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 4, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 5, reward: 0.0, done: True, info: {'prob': 0.3333333333333333}
new_state: 4, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 0, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 0, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 0, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 0, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 1, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 2, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 6, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 2, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 6, reward: 0.0, done: False, info: {'prob': 0.3333333333333333}
new_state: 10, reward: 0.0

In [37]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        env.render()
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(Q[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            break
        state = new_state
env.close()

****************************************************
EPISODE  0

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
****************************************************
EPISODE  1

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
****************************************************