In [1]:
import numpy as np 
import gym
import random

In [5]:
env = gym.make("Taxi-v2")
#created the environment

action_size = env.action_space.n
state_size = env.observation_space.n
print("A: ",action_size)
print("S: ", state_size)

A:  6
S:  500


In [6]:
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[43mY[0m| : |[35mB[0m: |
+---------+



In [7]:
#Step-1: Initializing the Q-table, load all the state values to 0. 
# Columns = Action Rows = states, i.e. 
# table = 4x16
Q_table = np.zeros((state_size, action_size))
print("Q-table: ")
print(Q_table)

Q-table: 
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [8]:
#Creating all the necessary hyperparameters:
#learning rate, episodes, epsilon,discounts

episodes = 30000
steps_per_episodes = 100
Learning_Rate = 0.8
epsilon = 1.0 # probability of exploration is 1 in beginning for doing random exploration
max_epsilon = 1.0
min_epsilon = 0.001 # close to 0, for exploitation
decay_Rate = 0.005  # epsilon will decay or change downwards at this rate
Gamma = 0.60 

In [9]:
Rewards = [] # list of rewards
# maximum episodes that are possible
for episode in range(1, episodes):
    s = env.reset()  # reset the environment after every episode
    done = False # flag
    Total_Reward = 0  #as per Q-table
    
    for step in range(1,steps_per_episodes):
        # choose an action "A" in current state
           # Chhose a random number for exploration
        trade_off = random.uniform(0,1)            
        
        #if trade_off > epsilon, then do exploitation, by choosing biggest value of Q from table
        if trade_off > epsilon:
            action = np.argmax(Q_table[s,:]) # for that state s, provided above s = env.reset(), i.e., initial position
        else:
            action = env.action_space.sample() # else do the exploration
            
        #observe the returns of the environmetn
        s_, r, d, i = env.step(action)
        
        #update Q-table
        Q_table[s,action] = Q_table[s,action] + Learning_Rate * (r + Gamma * np.max(Q_table[s_, :]) - Q_table[s,action])
        
        Total_Reward += r
        
        s = s_ # this is the newstate
        
        #terminal
        if d == True:
            break
        
    #reduce epsilon because to continue for exploitation and to lessen the exploration
    epsilon= min_epsilon + ( max_epsilon - min_epsilon) * np.exp( - decay_Rate* episode)
    Rewards.append(Total_Reward)
    
print("Score: " + str(sum(Rewards)/episodes))
print("\n")
print("Q-table: ")
print(Q_table)
    
            
        

Score: 6.147233333333333


Q-table: 
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.26683852  -2.12731514  -2.25556128  -2.17959391  -1.870144
  -10.773504  ]
 [ -2.22967154  -1.60617397  -1.89438024  -1.45656494  -0.7504
  -11.14469766]
 ...
 [ -1.71392      0.41490682  -1.71392     -1.79897337 -10.23743931
  -10.53696   ]
 [ -2.42750568  -2.27995821  -2.40605492  -2.39801168 -11.2635435
  -10.96549073]
 [ -1.47456     -1.344       -1.4528      10.99996979  -9.984
   -9.984     ]]


In [12]:
#training is done 
#USe updated Q_table to play frozen lake again

env.reset()
r = []
for episode in range(50):
    s = env.reset()
    done = False
    total_reward = 0
    print("Episode: ", episode)
    
    for step in range(steps_per_episodes):
        action = np.argmax(Q_table[s,:]) #take the action having maximum Q-value entry
        s_, r, d, i = env.step(action)
        total_reward += r
        #print only last state of success or failure
        if done == True:
            r.append(total_reward)
            env.render(mode=True)
            print("Number of steps: ", step)
            break
        s = s_ 
env.close()        
            

Episode:  0
Episode:  1
Episode:  2
Episode:  3
Episode:  4
Episode:  5
Episode:  6
Episode:  7
Episode:  8
Episode:  9
Episode:  10
Episode:  11
Episode:  12
Episode:  13
Episode:  14
Episode:  15
Episode:  16
Episode:  17
Episode:  18
Episode:  19
Episode:  20
Episode:  21
Episode:  22
Episode:  23
Episode:  24
Episode:  25
Episode:  26
Episode:  27
Episode:  28
Episode:  29
Episode:  30
Episode:  31
Episode:  32
Episode:  33
Episode:  34
Episode:  35
Episode:  36
Episode:  37
Episode:  38
Episode:  39
Episode:  40
Episode:  41
Episode:  42
Episode:  43
Episode:  44
Episode:  45
Episode:  46
Episode:  47
Episode:  48
Episode:  49
