# Teach a Taxi to pick up and drop off passengers at the right locations with Reinforcement Learning

In [1]:
pip install gym

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
#pip install gym[pygame]
!pip install pygame

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Downloading pygame-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 6.1 MB/s 
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.2


In [4]:
import gym
import numpy as np
import pickle, os

In [5]:
# Build the Taxi-v3 environment that every cell below interacts with.
env = gym.make("Taxi-v3")

In [12]:
# Begin a new episode and keep the starting state that reset() returns.
state = env.reset()

In [13]:
state

327

In [14]:
env.observation_space

Discrete(500)

In [15]:
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[35mB[0m: |
+---------+



In [16]:
state

327

In [17]:
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[35mB[0m: |
+---------+



<h1>Possible Actions</h1>

The six discrete actions are: down/south (0), up/north (1), right/east (2), left/west (3), pick-up (4), and drop-off (5).

In [60]:
# Sizes of the discrete state and action spaces; they fix the Q-table's shape.
n_states, n_actions = env.observation_space.n, env.action_space.n

In [61]:
n_actions

6

In [62]:
# Force the environment into state 134 for illustration.
# gym.make may stack several wrappers around the environment; `unwrapped`
# always reaches the base environment, whereas `env.env` strips only one layer.
env.unwrapped.s = 134

In [63]:
env.render()

+---------+
|R: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)


In [64]:
# Action 1 moves the taxi up; the call returns (next state, reward, done, info).
env.step(1)

(34, -1, False, {'prob': 1.0})

In [65]:
env.render()

+---------+
|R:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)


<h1>How well does a completely random policy perform?</h1>

In [66]:
# Fresh episode plus the bookkeeping for the random-policy run:
# `counter` counts steps, `g` sums rewards, `reward` holds the latest reward.
state = env.reset()
counter, g = 0, 0
reward = None

In [69]:
# Draw a random action index from the action space (one of the 6 actions, 0-5).
env.action_space.sample()

2

In [76]:
# Act randomly until a step pays out the +20 reward (presumably the successful drop-off).
# NOTE(review): `done` is never checked, so stepping continues until that reward appears.
while not (reward == 20):
    state, reward, done, info = env.step(env.action_space.sample())
    counter, g = counter + 1, g + reward

In [77]:
# Report how many random steps were taken and the reward accumulated along the way.
summary = "Solved in {} Steps with a total reward of {}"
print(summary.format(counter, g))

Solved in 1678 Steps with a total reward of -6616


## Let's look at just one episode and see how the Q values change after each step using the formula below

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \max_{a'} Q(s', a') - Q(s, a) \right]$$

In [78]:
# Q-table with one row per state and one column per action, initialised to zero.
Q = np.zeros((n_states, n_actions))

In [79]:
# Settings for the single-episode demonstration below.
episodes = 1   # number of episodes to run
G = 0          # running total of rewards within an episode
alpha = 0.6    # step size applied to each Q-value update

In [81]:
# Run `episodes` episodes of greedy tabular Q-learning; each episode ends once a
# step yields a reward of 20.
for episode in range(1, episodes + 1):
    done = False
    G = 0
    reward = 0
    state = env.reset()
    firstState = state
    print("Initial State = {}".format(state))
    while not (reward == 20):
        # Always pick the action with the highest current Q-value (greedy).
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Target: observed reward plus the best Q-value of the next state.
        td_target = reward + np.max(Q[state2])
        # Move the current estimate a fraction `alpha` towards the target.
        Q[state, action] += alpha * (td_target - Q[state, action])
        G += reward
        state = state2

Initial State = 366


## Let's run over multiple episodes so that we can converge on an optimal policy

In [82]:
# Number of training episodes and a list for tracking rewards across them.
episodes, rewardTracker = 2000, []

In [83]:
# G accumulates the reward of the current episode; alpha is the step size of the Q update.
G = 0
alpha = 0.618

In [84]:
# Tabular Q-learning over many episodes, always acting greedily on the current Q-table.
# Each episode runs until the environment reports `done`.
for episode in range(1, episodes + 1):
    done = False
    G, reward = 0, 0
    state = env.reset()
    while not done:
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Move Q[state, action] a fraction `alpha` towards the target
        # reward + max_a Q[state2, a] (no discount factor is applied).
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2

    # Keep every episode's total reward so learning progress can be inspected afterwards.
    rewardTracker.append(G)

    if episode % 100 == 0:
        print('Episode {} Total Reward: {}'.format(episode, G))

Episode 100 Total Reward: -61
Episode 200 Total Reward: -9
Episode 300 Total Reward: 10
Episode 400 Total Reward: 6
Episode 500 Total Reward: 5
Episode 600 Total Reward: 5
Episode 700 Total Reward: 14
Episode 800 Total Reward: 6
Episode 900 Total Reward: 12
Episode 1000 Total Reward: 7
Episode 1100 Total Reward: 5
Episode 1200 Total Reward: 9
Episode 1300 Total Reward: 7
Episode 1400 Total Reward: 9
Episode 1500 Total Reward: 7
Episode 1600 Total Reward: 11
Episode 1700 Total Reward: 6
Episode 1800 Total Reward: 8
Episode 1900 Total Reward: 7
Episode 2000 Total Reward: 5


## Now that we have learned the optimal Q-values, we have an optimal policy and no longer need to train the agent

In [85]:
# Start a fresh episode for the evaluation run; `done` is unset until the first step.
state, done = env.reset(), None

In [86]:
# Roll out the learned policy until the episode finishes, drawing every step.
while not done:
    # No learning here: just follow the highest Q-value for the current state.
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : 

In [87]:
# Persist the learned Q-table to disk so it can be reused without retraining.
with open("smartTaxi_qTable.pkl", mode="wb") as f:
    pickle.dump(Q, f)

In [88]:
# Read the saved Q-table back into `Qtest`.
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open("smartTaxi_qTable.pkl", 'rb') as f:
    Qtest = pickle.load(f)