# Autonomous Taxi using Q-Learning

This gym environment is taken from : https://gym.openai.com/envs/Taxi-v3/

## Importing Packages

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import random

## Creating enviornment and initial test run

In [2]:
env = gym.make('Taxi-v3')

In [3]:
episodes = 10
for episode in range(1, episodes):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        state, reward, done, info = env.step(env.action_space.sample())
        score += reward
        clear_output(wait=True)
    print('Episode: {}\nScore: {}'.format(episode, score))
    
env.close()

Episode: 9
Score: -686


## Creating a Q-Table and training

In [4]:
action = env.action_space.n
state = env.observation_space.n

q_table = np.zeros((state, action))

In [5]:
# Parameters for Q-Learning
num_episodes = 10000
max_steps_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

rewards_all_episodes = []


In [6]:
# Q-Learning Algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0
    
    for step in range(max_steps_per_episode):
        
        #Exploration vs Exploitation trade-off
        exploration_threshold = random.uniform(0,1)
        if exploration_threshold > exploration_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()
        
        new_state, reward, done, info = env.step(action)
        
        # Update Q-Table
        q_table[state,action] = q_table[state, action] * (1-learning_rate ) + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        state = new_state
        rewards_current_episode += reward
        if done == True:
            break
        
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episodes.append(rewards_current_episode)

print('Training Finished')

Training Finished


In [18]:
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.51249885,  1.82229163, -2.81585608,  1.12909804,  9.6220697 ,
        -7.01877938],
       [ 5.02158897,  5.61864581,  6.19684409,  4.2373696 , 14.11880599,
         0.4547978 ],
       ...,
       [-1.49064784,  7.77125232, -1.53117712, -1.49905914, -5.64057387,
        -6.51585909],
       [-2.97455615, -1.63184398, -3.01330534, -2.91744934, -9.8713243 ,
        -9.61355829],
       [ 2.91408672,  1.35361724,  0.81453568, 18.21393511, -1.35870977,
        -2.22819197]])

In [7]:
# Colculate and print average reward per thousand episodes
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes/1000)
count = 1000

print('Average per thousand episodes')
for r in rewards_per_thousand_episodes:
    print(count , " : ", str(sum(r/1000)))
    count += 1000

Average per thousand episodes
1000  :  -247.9499999999999
2000  :  -37.467000000000056
3000  :  2.013999999999995
4000  :  6.003999999999974
5000  :  6.714999999999968
6000  :  6.973999999999977
7000  :  7.397999999999969
8000  :  7.630999999999966
9000  :  7.309999999999969
10000  :  7.41199999999996


In [8]:
# Visualize Agent
import time

for episode in range(4):
    state = env.reset()
    done = False
    print('Episode is: ', str(episode))
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.4)
        action = np.argmax(q_table[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 20:
                print('***Reached Goal***')
                time.sleep(2)
                clear_output(wait=True)
            else:
                print('***Failed***')
                time.sleep(2)
                clear_output(wait=True)
            break
        
        state = new_state
env.close()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
***Reached Goal***
