In [1]:


import gym
import random
import numpy as np

#Red — 0 , Green — 1, Yellow — 2, and Blue — 3 for pick up
streets = gym.make("Taxi-v3").env #New versions keep getting released; if -v3 doesn't work, try -v2 or -v4


##Red — 0 , Green — 1, Yellow — 2, and Blue — 3 for pick up
#Each state is defined by a 4 entries tuple: （taxi_row, taxi_col, passenger_location, destination)
initial_state = streets.encode(2, 3, 2, 0)
streets.s = initial_state
streets.render()

#State Space:  25 possible taxi positions, 5 possible locations of the passenger
# 25*5*4 = 500 


#Action space:6 --> N,S,E,W, DROP-OFF, PICKUP
#Rewards: CORRECT FINAL DEST. +20, STEP -1, INCORRECT PICK/DROP -10

q_table = np.zeros([streets.observation_space.n, streets.action_space.n]) # 500 , 6
#q_table.size
# a 2D array that represent every possible state and action in the virtual space and initialize all of them to 0
total_reward_G = 0
learning_rate = 0.1
discount_factor = 0.5
exploration = 0.1
epochs = 1000

for taxi_run in range(epochs): #Start training (the agent plays the number of epochs)
    state = streets.reset()
    done = False
    total_reward_G = 0
    steps=0
    while not done:#each epoch/play contains this number of actions, starting from pickup a passenger until drop-off
        steps +=1
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Explore a random action
        else:
            action = np.argmax(q_table[state]) # Return the action with the highest q-value
            
        next_state, reward, done, info = streets.step(action) # Do the above action
        
        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        # see RL-2 PPT file --- slide# 5
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        total_reward_G += reward
        q_table[state, action] = new_q
        #streets.render()        
        state = next_state

    if (taxi_run) % 100 ==0:   
         print('taxi_run {} Total Rewards: {} Steps: {}'.format(taxi_run,total_reward_G,steps))
         #streets.render() 


+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

taxi_run 0 Total Rewards: -1242 Steps: 561
taxi_run 100 Total Rewards: -343 Steps: 256
taxi_run 200 Total Rewards: -289 Steps: 229


In [4]:
from IPython.display import clear_output
from time import sleep
import gym
import random
import numpy as np

streets = gym.make("Taxi-v3").env
streets.render()

discount_factors = [0.3, 0.5, 0.9]
avg_rewards = []
avg_steps = []

for discount_factor in discount_factors:
    q_table = np.zeros([streets.observation_space.n, streets.action_space.n])  # 500, 6
    total_rewards = []
    total_steps = []

    epochs = 10000

    for taxi_run in range(epochs):
        state = streets.reset()
        done = False
        total_reward_G = 0
        steps = 0

        while not done:
            steps += 1
            random_value = random.uniform(0, 1)

            if random_value < exploration:
                action = streets.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            next_state, reward, done, info = streets.step(action)

            prev_q = q_table[state, action]
            next_max_q = np.max(q_table[next_state])
            new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
            q_table[state, action] = new_q
            state = next_state

            total_reward_G += reward

        if taxi_run % 1000 == 0:
            print('Taxi Run {} Total Rewards: {} Steps: {}'.format(taxi_run, total_reward_G, steps))

        total_rewards.append(total_reward_G)
        total_steps.append(steps)

    avg_reward = np.mean(total_rewards)
    avg_step = np.mean(total_steps)

    avg_rewards.append(avg_reward)
    avg_steps.append(avg_step)

    lengths = []
    for tripnum in range(1, 11):
        state = streets.reset()
        done = False
        trip_length = 0

        while not done and trip_length < 25:
            action = np.argmax(q_table[state])
            next_state, reward, done, info = streets.step(action)
            clear_output(wait=True)
            print("Trip number " + str(tripnum) + " Step " + str(trip_length))
            print(streets.render(mode='ansi'))
            sleep(.2)
            state = next_state
            trip_length += 1
        lengths.append(trip_length)

    avg_len = np.mean(lengths)
    print(f"step={taxi_run}")
    print(f"Discount_factor = {discount_factor}")
    print(f"Average Rewards: {avg_reward}")
    print(f"Average Steps: {avg_step}")
    print(f"Average Trip Length: {avg_len}")

print("Comparison of discount factors:")
print("Discount_factor = 0.3\tDiscount_factor = 0.5\tDiscount_factor = 0.9")
print(f"REWARD\t {avg_rewards[0]}\t\t\t{avg_rewards[1]}\t\t\t{avg_rewards[2]}")
print(f"STEPS\t{ avg_steps[0]}\t\t\t{avg_steps[1]}\t\t\t{avg_steps[2]}")
if taxi_run % 1000 == 0:
    print('Taxi Run {} Total Rewards: {} Steps: {}'.format(taxi_run, total_reward_G, steps))



Trip number 10 Step 11
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

step=9999
Discount_factor = 0.9
Average Rewards: -9.6149
Average Steps: 22.865
Average Trip Length: 13.7
Comparison of discount factors:
Discount_factor = 0.3	Discount_factor = 0.5	Discount_factor = 0.9
REWARD	 -25.0493			-17.2015			-9.6149
STEPS	35.2061			29.2222			22.865
