In [3]:
import numpy as np 
import gym
import random

In [4]:
env = gym.make("FrozenLake-v0")
#created the environment

action_size = env.action_space.n
state_size = env.observation_space.n

In [5]:
print("A: ",action_size)
print("S: ", state_size)

A:  4
S:  16


In [7]:
#Step-1: Initializing the Q-table, load all the state values to 0. 
# Columns = Action Rows = states, i.e. 
# table = 4x16
Q_table = np.zeros((state_size, action_size))
print("Q-table: ")
print(Q_table)

Q-table: 
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [15]:
#Creating all the necessary hyperparameters:
#learning rate, episodes, epsilon,discounts

episodes = 10000
steps_per_episodes = 100
Learning_Rate = 0.8
epsilon = 1.0 # probability of exploration is 1 in beginning for doing random exploration
max_epsilon = 1.0
min_epsilon = 0.001 # close to 0, for exploitation
decay_Rate = 0.005  # epsilon will decay or change downwards at this rate
Gamma = 0.95 # discount rate, high because of long term reward desire 

In [17]:
Rewards = [] # list of rewards
# maximum episodes that are possible
for episode in range(1, episodes):
    s = env.reset()  # reset the environment after every episode
    done = False # flag
    Total_Reward = 0  #as per Q-table
    
    for step in range(1,steps_per_episodes):
        # choose an action "A" in current state
           # Chhose a random number for exploration
        trade_off = random.uniform(0,1)            
        
        #if trade_off > epsilon, then do exploitation, by choosing biggest value of Q from table
        if trade_off > epsilon:
            action = np.argmax(Q_table[s,:]) # for that state s, provided above s = env.reset(), i.e., initial position
        else:
            action = env.action_space.sample() # else do the exploration
            
        #observe the returns of the environmetn
        s_, r, d, i = env.step(action)
        
        #update Q-table
        Q_table[s,action] = Q_table[s,action] + Learning_Rate * (r + Gamma * np.max(Q_table[s_, :]) - Q_table[s,action])
        
        Total_Reward += r
        
        s = s_ # this is the newstate
        
        #terminal
        if d == True:
            break
        
    #reduce epsilon because to continue for exploitation and to lessen the exploration
    epsilon= min_epsilon + ( max_epsilon - min_epsilon) * np.exp( - decay_Rate* episode)
    Rewards.append(Total_Reward)
    
print("Score: " + str(sum(Rewards)/episodes))
print("\n")
print("Q-table: ")
print(Q_table)
    
            
        

Score: 0.6165


Q-table: 
[[1.32251700e-01 3.25230020e-02 3.20362997e-02 3.38002831e-02]
 [1.05970684e-02 1.03367661e-02 6.24544342e-03 2.43347480e-01]
 [5.07733282e-03 9.17826566e-03 8.99339918e-03 1.52461480e-01]
 [1.50513144e-04 9.65328874e-03 7.15934141e-03 8.19739694e-02]
 [3.00902258e-01 3.58847305e-02 7.18980540e-04 3.20935716e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.35309494e-02 1.89079533e-06 9.08043616e-07 2.92206966e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.28940039e-04 3.42433525e-02 4.58112717e-02 3.46640430e-01]
 [1.12013316e-03 5.79555860e-01 4.13933372e-03 3.16687571e-03]
 [2.60910902e-01 3.75249409e-03 9.38706212e-06 8.95012020e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.91185529e-02 2.84738145e-02 4.29531459e-01 2.95223022e-02]
 [1.33682092e-01 8.99115486e-01 1.84311722e-01 1.27451991e-01]
 [0.00000000e+00 0.00000000e+

In [20]:
#training is done 
#USe updated Q_table to play frozen lake again

env.reset()

for episode in range(5):
    s = env.reset()
    done = False
    print("Episode: ", episode)
    
    for step in range(steps_per_episodes):
        action = np.argmax(Q_table[s,:]) #take the action having maximum Q-value entry
        s_, r, d, i = env.step(action)
        
        #print only last state of success or failure
        if done == True:
            env.render(mode=True)
            print("Number of steps: ", step)
            break
        s = s_ 
env.close()        
        
            

Episode:  0
Episode:  1
Episode:  2
Episode:  3
Episode:  4
