In [12]:
import numpy as np
import gym
import random
from collections import defaultdict 

In [13]:
class QLearningAgent:
    """Tabular Q-learning agent for an environment with discrete spaces.

    The Q-table ``Qs`` holds one row per observation and one column per
    action; its shape is taken from ``env.observation_space.n`` and
    ``env.action_space.n``.
    """

    def __init__(self, env):
        """Remember ``env`` and start from an all-zero Q-table.

        Args:
            env: environment exposing ``observation_space.n``,
                ``action_space.n``, ``action_space.sample()``, ``reset()``
                and ``step(action)``.
        """
        self.env = env
        self.Qs = np.zeros((env.observation_space.n, env.action_space.n))

    def train(self, max_episodes = 15000, max_steps = 100, discount_rate = 0.98, learning_rate = 0.1,
              exploration_rate_max = 1., exploration_rate_min = 0.01, exploration_decay_rate = 0.001):
        """Run epsilon-greedy Q-learning on ``self.env`` and print the Q-table.

        Args:
            max_episodes: number of training episodes.
            max_steps: maximum number of steps taken in a single episode.
            discount_rate: weight applied to the best Q-value of the next state.
            learning_rate: blend factor between the old Q-value and the new target.
            exploration_rate_max: exploration probability used for episode 0.
            exploration_rate_min: value the exploration probability decays towards.
            exploration_decay_rate: exponential decay constant, applied per episode.
        """
        # Always train on the environment this agent was built with instead of
        # relying on a module-level variable that happens to be named `env`.
        env = self.env

        for e in range(max_episodes):
            # The schedule depends only on the episode index, so compute it
            # once per episode; at e == 0 it equals exploration_rate_max.
            exploration_rate = exploration_rate_min + \
                (exploration_rate_max - exploration_rate_min) * np.exp(-exploration_decay_rate * e)

            state = env.reset()

            for step in range(max_steps):
                # Epsilon-greedy: exploit when the random draw exceeds the
                # current exploration rate, otherwise take a random action.
                if random.uniform(0, 1) > exploration_rate:
                    action = np.argmax(self.Qs[state, :])
                else:
                    action = env.action_space.sample()

                new_state, reward, done, info = env.step(action)

                # Move Q(state, action) towards reward + discounted best next value.
                self.Qs[state, action] = (1 - learning_rate) * self.Qs[state, action] + \
                    learning_rate * (reward + discount_rate * np.max(self.Qs[new_state, :]))

                state = new_state

                if done:
                    break

        print(self.Qs)

    def act(self, state):
        """Return the index of the highest-valued action for ``state``."""
        return np.argmax(self.Qs[state, :])

In [14]:
# Build the FrozenLake environment, wrap it in a Q-learning agent, and train
# with train()'s default hyperparameters; train() prints the learned Q-table.
env = gym.make("FrozenLake-v0")

agent = QLearningAgent(env)
agent.train()

[[0.36295411 0.34458475 0.3500776  0.3439362 ]
 [0.21796158 0.27430186 0.17905013 0.31378263]
 [0.27688725 0.27124336 0.26707536 0.28537232]
 [0.09237218 0.14103638 0.16952111 0.27245003]
 [0.38880395 0.32143342 0.349775   0.19199071]
 [0.         0.         0.         0.        ]
 [0.25545054 0.10125162 0.10355923 0.0651357 ]
 [0.         0.         0.         0.        ]
 [0.31486883 0.32164147 0.3191476  0.45115963]
 [0.45537542 0.5510316  0.30487133 0.26751389]
 [0.54708018 0.27617351 0.25197851 0.21362441]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.50528904 0.59160208 0.67554321 0.32814465]
 [0.67944947 0.85210525 0.69588817 0.69378006]
 [0.         0.         0.         0.        ]]


In [15]:
# Play 1000 evaluation episodes on a fresh environment, always taking the
# agent's greedy action, and report the fraction that finished with reward 1.
wins = 0
done = False
test_env = gym.make("FrozenLake-v0")

for episode in range(1000):
    state, done = test_env.reset(), False

    for step in range(100):
        action = agent.act(state)
        new_state, reward, done, info = test_env.step(action)

        if not done:
            # Episode still running: advance to the next state.
            state = new_state
            continue

        # Episode finished: it counts as a win only when the final reward is 1.
        wins += 1 if reward == 1 else 0
        break

print("wins ratio ", wins/1000)

wins ratio  0.732
