In [4]:
import numpy as np
import gym
import random

In [5]:
# Making the game environment
# NOTE(review): 'Taxi-v2' must be an id registered by the installed gym version;
# later gym releases may only ship a newer Taxi revision - confirm before re-running.
env = gym.make('Taxi-v2')
# Show the current state of the environment (printed as a text grid in the cell output)
env.render()

+---------+
|[34;1mR[0m:[43m [0m| : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [6]:
# Initializing hyper-parameters and the Q-table shared by the training and test cells
totalActions = env.action_space.n        # number of actions -> Q-table columns
totalStates = env.observation_space.n    # number of states  -> Q-table rows
learningRate = 0.1    # step size applied to every Q-value update
gamma = 0.8           # discount factor multiplying the next state's best Q-value
epsilon = 1           # Exploration vs exploitation: probability of picking a random action
maxEpsilon = 1        # epsilon value the decay schedule starts from
decayRate = 0.01      # Exponential decay rate used to reduce epsilon (not gamma) with every episode
minEpsilon = 0.01     # lower bound the decayed epsilon approaches
totalTrainEpisodes = 50000   # number of episodes run by the training loop
totalTestEpisodes = 1        # number of episodes run by the evaluation loop
totalSteps = 99         # Upper bound on the steps taken in every episode
qTable = np.zeros((totalStates, totalActions))   # Q-values indexed as [state, action], all starting at zero

In [7]:
# For training: tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads env, qTable, learningRate, gamma, epsilon, minEpsilon, maxEpsilon,
# decayRate, totalTrainEpisodes and totalSteps from the cells above; updates
# qTable in place and leaves epsilon at its final decayed value.
for eachEpisode in range(totalTrainEpisodes):
    state = env.reset()
    done = False

    for eachStep in range(totalSteps):
        # Explore (random action) with probability epsilon, otherwise exploit
        # the action with the highest current Q-value for this state.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qTable[state, :])

        newState, reward, done, info = env.step(action)

        # Q-learning update:
        #   Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        # The max runs over the next state's action values only; the current
        # estimate is subtracted outside the max (and outside the gamma factor)
        # so that the bracket is the temporal-difference error.
        tdTarget = reward + gamma * np.max(qTable[newState, :])
        qTable[state, action] += learningRate * (tdTarget - qTable[state, action])

        state = newState
        if done:
            break

    # Decay exploration exponentially from maxEpsilon towards minEpsilon
    epsilon = minEpsilon + (maxEpsilon - minEpsilon) * np.exp(-decayRate * eachEpisode)

In [10]:
# Evaluation: follow the greedy policy from qTable (no random exploration),
# rendering the environment before every action and collecting each reward.
rewards = []
for eachEpisode in range(totalTestEpisodes):
    # Fresh episode; `step` is not read anywhere in this cell.
    state, done, step = env.reset(), False, 0

    for eachStep in range(totalSteps):
        env.render()
        # Highest-valued action for the current state
        action = qTable[state].argmax()
        newState, reward, done, info = env.step(action)
        rewards.append(reward)
        if done:
            break
        state = newState

env.close()
# Rewards from all test episodes are pooled, then averaged per episode
totalReward = sum(rewards)
print("Average score = " + str(totalReward / totalTestEpisodes))
        

+---------+
|[34;1mR[0m:[43m [0m| : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[42mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
+---------+
|R:[42m_[0m| : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| :[42m_[0m: : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : :[42m_[0m: : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : : :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | :[42m_[0m:[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---