In [1]:
import gym
import numpy as np

In [2]:
env = gym.make("FrozenLake-v0")

In [3]:
# the number of actions possible at every state
env.action_space.n

4

In [4]:
# 4 actions = Left, Right, Top, Bottom

In [5]:
# an observation is a way to describe the current state of the environment
env.observation_space

Discrete(16)

In [6]:
# A 4*4 grid with 16 tiles (S, F, H, G)
# S = Start
# F = Frozen
# H = Hole
# G = Goal

In [7]:
# the learning rate
alpha = 0.4

In [8]:
# discount factor for future rewards
gamma = 0.999

In [9]:
# the key is the current state
# current state is when we're in one of the 16 tiles
# initialize Q-values of the 4 possible next actions for each state here it is 1
q_table = dict([(x, [1, 1, 1, 1]) for x in range(16)])

In [10]:
q_table

{0: [1, 1, 1, 1],
 1: [1, 1, 1, 1],
 2: [1, 1, 1, 1],
 3: [1, 1, 1, 1],
 4: [1, 1, 1, 1],
 5: [1, 1, 1, 1],
 6: [1, 1, 1, 1],
 7: [1, 1, 1, 1],
 8: [1, 1, 1, 1],
 9: [1, 1, 1, 1],
 10: [1, 1, 1, 1],
 11: [1, 1, 1, 1],
 12: [1, 1, 1, 1],
 13: [1, 1, 1, 1],
 14: [1, 1, 1, 1],
 15: [1, 1, 1, 1]}

In [11]:
# choose the next action based on the current observation
def choose_action(observ):
    # the observation will be the one with the maximum Q-value
    return np.argmax(q_table[observ])

In [12]:
# populate the Q-table using 10000 episodes
for i in range(10000):
    
    observ = env.reset()
    
    # start exploring the environment with a random action from the current state
    action = choose_action(observ)
    
    prev_observ = None
    prev_action = None
    
    t = 0
    
    # run for 2500 time steps in each episode:
    for t in range(2500):
        # unlock below code to see all episodes
        # env.render()
        
        observ, reward, done, info = env.step(action)
        
        action = choose_action(observ)
        
        if not prev_observ is None:
            q_old = q_table[prev_observ][prev_action]
            q_new = q_old
            
            if done:
                q_new += alpha * (reward - q_old)
            else:
                q_new += alpha * (reward + gamma * q_table[observ][action] - q_old)
            
            # update the state table for the previous action with the new values calculated after the current action
            new_table = q_table[prev_observ]
            new_table[prev_action] = q_new
            
            q_table[prev_observ] = new_table
            
        prev_observ = observ
        prev_action = action
        
        if done:
            # print("Episode {} finished after {} timesteps with r={}.".format(i, t, reward))
            break

In [13]:
new_table

[0.3885698603675607,
 0.9940427822791436,
 0.44129983807316486,
 0.4350252352588013]

In [14]:
# Q-values of each action in every state using SARSA
q_table

{0: [0.7614958015917299,
  0.23435976860831992,
  0.2106409770396447,
  0.24294246931801805],
 1: [0.1296, 0.16093207705627355, 0.16258607905115027, 0.7708794918206666],
 2: [0.2168285711614884,
  0.19981899738890785,
  0.22855382559673737,
  0.7811100112389834],
 3: [0.18256698189425305,
  0.2216786319497455,
  0.21465607206803722,
  0.7780041213681337],
 4: [0.8073849231816422,
  0.21656240491977638,
  0.15385676321977987,
  0.15883991379284587],
 5: [1, 1, 1, 1],
 6: [0.007309034812358188,
  0.007671710462346887,
  0.46198365951212916,
  0.0058140609052250115],
 7: [1, 1, 1, 1],
 8: [0.12951705599999996,
  0.158605573750126,
  0.20625488437985176,
  0.8712593354800227],
 9: [0.16342121302317736,
  0.9036823542204303,
  0.1435495072402577,
  0.12067599507915475],
 10: [0.657592345520561,
  0.06960112303700865,
  0.07625077989367843,
  0.07462363645343087],
 11: [1, 1, 1, 1],
 12: [1, 1, 1, 1],
 13: [0.3590033347302257,
  0.35985599999999995,
  0.9372999834898323,
  0.2618212195253908