In [1]:
import numpy as np
import gym
import random

In [2]:
env = gym.make("FrozenLake-v0")


In [3]:
action_size = env.action_space.n
state_size = env.observation_space.n

In [4]:
action_size


4

In [5]:
qtable = np.zeros((state_size,action_size))

In [6]:
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [7]:
total_episode = 15000
learning_rate = 0.8
max_steps = 99
gamma = 0.95

epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005


In [17]:
rewards = [] //we start with a blank array for the rewards for each state

for episode in range(total_episode):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    //For each step, a random number is generated between 0 and 1
    for step in range(max_steps):
        exp_exp_tradeoff = random.uniform(0,1)
        
        //If the random number is greater than epsilon, we take the value obtained from the Q-network for that state
        //The maximum Q value i.e. the most profitable action is taken for that set from the set of all available actions
        if exp_exp_tradeoff>epsilon:
            action = np.argmax(qtable[state,:])
        
        //If the random number is less than epsilon, then we use a random action with random torque parameters
        //This is the exploratory part where it takes new actions to come up with newer techniques to win the game
        else:
            action = env.action_space.sample()
        
        
        //The Q-network is updated based on the formula in the last section
        new_state,reward,done,info = env.step(action)
        qtable[state,action] = qtable[state,action] + learning_rate * (reward + gamma*np.max(qtable[new_state,:])
                                                                       - qtable[state,action]) 
        
        total_rewards += reward
        //Reward is added for each step based on its performance in that epoch
        state = new_state
        
        if done==True:
            break
        
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    //Epsilon slowly decays exponentially to encourage more Q value exploitation and less exploration 
    //as the network is much more smarter after some epochs
    rewards.append(total_rewards)
    
print("Score over time : " + str(sum(rewards)/total_episode))
print(qtable)
            


Score over time : 0.4694
[[2.48762356e-01 3.17011728e-02 2.21979390e-02 3.15880975e-02]
 [7.93497610e-03 3.10224526e-03 3.49179599e-03 2.82343002e-01]
 [2.14259799e-03 4.97787486e-03 4.83766633e-04 1.02871493e-02]
 [2.41956047e-03 2.92496774e-03 5.67008345e-05 9.64627592e-03]
 [2.39058738e-01 1.94961818e-02 1.38714922e-02 6.05481074e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.52797377e-06 9.32548396e-06 8.84636742e-02 4.48647759e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.48886153e-02 3.77919497e-02 2.02190926e-02 2.13302312e-01]
 [9.24862655e-05 5.31210475e-01 1.78473361e-02 1.20974103e-02]
 [4.76431841e-04 1.34613673e-03 1.28398393e-01 1.93780534e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [6.57494189e-02 1.34330501e-01 5.62093373e-01 6.26176897e-02]
 [1.77274803e-01 9.63993525e-01 2.80158883e-01 3.13504656e-01]
 [0.00000000e+00 0.00000000e+0

In [16]:
env.reset()
for episode in range(15):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)
    
    for step in range(max_steps):
        action = np.argmax(qtable[state,:])
        new_state , reward, done, info = env.step(action)
        
        if done:
            env.render()
            
            print("Steps : ", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
Steps :  72
****************************************************
EPISODE  1
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
Steps :  87
****************************************************
EPISODE  2
****************************************************
EPISODE  3
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Steps :  13
****************************************************
EPISODE  4
****************************************************
EPISODE  5
****************************************************
EPISODE  6
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
Steps :  69
****************************************************
EPISODE  7
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
Steps :  81
****************************************************
EPISODE  8
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Steps :  45
****************************************************
EPISODE  9
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
Steps :  7
****