In [1]:
import numpy as np
import gym
import random
from collections import defaultdict 

In [2]:
class SarsaAgent:
    """Tabular SARSA (on-policy TD control) agent.

    The agent keeps one Q-value per (state, action) pair in ``self.Qs`` and
    therefore requires an environment whose ``observation_space.n`` and
    ``action_space.n`` are defined. The environment is used through
    ``reset()`` (expected to return the initial state) and ``step(action)``
    (unpacked as ``new_state, reward, done, info``).
    NOTE(review): that matches the classic Gym API; newer Gym/Gymnasium
    releases return different tuples - confirm the installed version.
    """

    def __init__(self, env):
        """Store the environment and create an all-zero Q-table.

        Args:
            env: environment exposing ``observation_space.n``,
                ``action_space.n``, ``action_space.sample()``, ``reset()``
                and ``step(action)``.
        """
        self.env = env
        self.Qs = np.zeros((env.observation_space.n, env.action_space.n))

    def getAction(self, state, exploration_rate):
        """Pick an action epsilon-greedily.

        Args:
            state: index of the current state (row of the Q-table).
            exploration_rate: probability of taking a random action.

        Returns:
            The greedy action (argmax of the state's Q-row) when the random
            draw exceeds ``exploration_rate``; otherwise a random action
            sampled from the agent's own environment.
        """
        if random.uniform(0, 1) > exploration_rate:
            return np.argmax(self.Qs[state, :])
        # Sample from the environment this agent was built with, not from
        # whatever global happens to be called ``env``.
        return self.env.action_space.sample()

    def train(self, max_episodes=15000, max_steps=100, discount_rate=0.98, learning_rate=0.1,
              exploration_rate_max=1., exploration_rate_min=0.01, exploration_decay_rate=0.001):
        """Run SARSA updates on ``self.env`` and print the learned Q-table.

        Args:
            max_episodes: number of training episodes.
            max_steps: maximum number of environment steps per episode.
            discount_rate: discount factor applied to the next Q-value.
            learning_rate: step size of each Q-value update.
            exploration_rate_max: exploration rate used for the first episode.
            exploration_rate_min: value the exploration rate decays towards.
            exploration_decay_rate: exponential decay constant per episode.
        """
        for e in range(max_episodes):
            # The exploration rate depends only on the episode index, so it
            # is computed once per episode; episode 0 starts at
            # exploration_rate_max.
            exploration_rate = exploration_rate_min + \
                (exploration_rate_max - exploration_rate_min) * np.exp(-exploration_decay_rate * e)

            state = self.env.reset()
            action = self.getAction(state, exploration_rate)

            for _ in range(max_steps):
                new_state, reward, done, _info = self.env.step(action)

                if done:
                    # Terminal transition: there is no successor action, so
                    # the target is the reward alone.
                    self.Qs[state, action] += learning_rate * (reward - self.Qs[state, action])
                    break

                # On-policy: the action used in the target is the one that
                # will actually be executed on the next step.
                new_action = self.getAction(new_state, exploration_rate)
                td_target = reward + discount_rate * self.Qs[new_state, new_action]
                self.Qs[state, action] += learning_rate * (td_target - self.Qs[state, action])

                state, action = new_state, new_action

        print(self.Qs)

    def act(self, state):
        """Return the greedy action for ``state`` (no exploration)."""
        return np.argmax(self.Qs[state, :])

In [3]:
# Build the FrozenLake environment and fit a SARSA agent on it.
# train() prints the learned Q-table when it finishes.
ENV_NAME = "FrozenLake-v0"
env = gym.make(ENV_NAME)

agent = SarsaAgent(env=env)
agent.train()

[[0.38202568 0.3042228  0.28493387 0.30210653]
 [0.18341057 0.16928523 0.15947324 0.32604241]
 [0.17625244 0.17959803 0.18313258 0.24410041]
 [0.1084419  0.10262964 0.08574089 0.22852448]
 [0.40598877 0.25074169 0.24046969 0.27965886]
 [0.         0.         0.         0.        ]
 [0.22574918 0.08483758 0.0916693  0.07821771]
 [0.         0.         0.         0.        ]
 [0.25362223 0.30115139 0.282066   0.46718855]
 [0.3190801  0.51572764 0.36332983 0.21459444]
 [0.55505282 0.26951307 0.20798134 0.2380352 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.34189075 0.39539594 0.68634864 0.45037999]
 [0.59582279 0.79885226 0.63181664 0.62331125]
 [0.         0.         0.         0.        ]]


In [4]:
# Evaluate the trained agent's greedy policy on a fresh environment instance.
N_TEST_EPISODES = 1000  # evaluation episodes; also the denominator of the ratio
MAX_TEST_STEPS = 100    # per-episode step cap

test_env = gym.make("FrozenLake-v0")
wins = 0

for episode in range(N_TEST_EPISODES):
    state = test_env.reset()

    for step in range(MAX_TEST_STEPS):
        action = agent.act(state)
        new_state, reward, done, info = test_env.step(action)

        if done:
            # An episode counts as a win only if it ends with reward 1.
            if reward == 1:
                wins += 1
            break
        state = new_state

print("wins ratio ", wins / N_TEST_EPISODES)

wins ratio  0.763
