# Q learning to play Taxi-v2
## Import dependencies
#### Credits: GitHub repository simoninithomas/Deep_reinforcement_learning_course

In [2]:
import numpy as np
import gym
import random

## Create the environment
### Taxi v2

In [3]:
# Build the Taxi-v2 environment and print its current board as a text grid.
env = gym.make("Taxi-v2")
env.render()

+---------+
|R: | : :[35m[43mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



## Create the Q-table and initialize it

In [4]:
# Read the sizes of the environment's two discrete spaces: the number of
# distinct observations (states) and the number of available actions.
state_size = env.observation_space.n
action_size = env.action_space.n

print('Action size : ', action_size)
print('State size : ', state_size)

Action size :  6
State size :  500


In [5]:
# Q-table: one row per state, one column per action, every entry starting at
# zero. qtable[s, a] holds the current value estimate of action a in state s.
qtable = np.zeros(shape=(state_size, action_size))
qtable

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## Define the hyperparameters

In [6]:
# --- Run lengths ---
total_episodes = 50000       # number of training episodes
total_test_episodes = 100    # number of evaluation episodes
max_steps = 99               # cap on the number of steps taken in one episode

# --- Q-value update ---
learning_rate = 0.7          # step size of each Q-table update
gamma = 0.618                # discounting rate applied to future reward

# --- Exploration parameters ---
# epsilon     = exploration rate (probability of taking a random action)
# max_epsilon = starting epsilon value
# min_epsilon = minimum exploration probability
# decay_rate  = exponential decay factor
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.01

# Training starts fully exploratory, so epsilon begins at its maximum.
epsilon = max_epsilon

## The Q-learning algorithm
![alt text](./images/q_learning_pseudo.png)

In [10]:
# Train the Q-table with an epsilon-greedy policy: one outer iteration per
# episode, at most max_steps environment steps per episode.
for episode in range(total_episodes):
    # Start the episode from a freshly reset environment.
    state = env.reset()

    for step in range(max_steps):
        # Draw a number in [0, 1] to decide between exploring and exploiting.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # Exploitation: pick the action with the largest Q-value for this state.
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: pick a random action.
            action = env.action_space.sample()

        # Take the action and observe the new state and reward.
        new_state, reward, done, info = env.step(action)

        # Bellman update:
        #   Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state

        # Stop stepping once the environment reports the episode as finished.
        if done:
            break

    # Exponentially decay epsilon from max_epsilon towards min_epsilon.
    # (episode + 1) is the number of episodes completed so far; writing it
    # explicitly avoids mutating the loop variable to get the same schedule.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode + 1))

## Use the Q-table to play Taxi

In [13]:
# Evaluate the learned Q-table by always taking the highest-valued action
# (no exploration) and averaging the total reward over the test episodes.
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # Draw the board at every step so the agent can be watched playing;
        # comment this call out to silence the output.
        env.render()

        # Take the action (index) with the maximum expected future reward
        # for the current state.
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break
        state = new_state

    # Record every episode's total, including episodes cut off by max_steps.
    # Recording only finished episodes while still dividing by
    # total_test_episodes would skew the reported average.
    rewards.append(total_rewards)

env.close()
print("Score over time: " + str(sum(rewards) / total_test_episodes))

+---------+
|R:[43m [0m| : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : : : : |
|