Import GYM and compute random baseline

In [14]:
import gym

env = gym.make('Taxi-v2')
state = env.reset()
env.render()

total_reward = 0
done = False
while not done:
    state, reward, done, info = env.step(env.action_space.sample())
    total_reward += reward
    env.render()

print('Total reward:', total_reward)

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |B: |
+---------+

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |B: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | :

# Goal 1

Determine number of possible actions and observation states

In [15]:
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)

Action Space: Discrete(6)
Observation Space: Discrete(500)


Create matrix for Q-table

In [16]:
import numpy as np

q_table = np.zeros((500, 6))

In [21]:
def update_q_table(q_table, reward, last_state, last_action, state, alpha, gamma):
    current_value = q_table[last_state][last_action]
    exp_future_reward = q_table[state].max()
    q_table[last_state][last_action] = (current_value + alpha * 
                                        (reward + gamma * exp_future_reward - current_value))
    
    return q_table

In [87]:
epsilon = 1
alpha = 0.05
gamma = 0.95

q_table = np.zeros((500, 6))
for episode in range(10000):
    state = env.reset()
    for step in range(1000):
        if np.random.random() < epsilon:
            action = np.random.choice(6)
        else:
            action = np.argmax(q_table[state])
        last_state = state
        state, reward, done, info = env.step(action)
        q_table = update_q_table(q_table, reward, last_state, action, state, alpha, gamma)

        if done:
            break

    epsilon -= .9/10000

In [88]:
episodes = 1000
rewards = []
max_steps = 99

for episode in range(episodes):
    state = env.reset()  # Assuming you already have env created as above
    total_rewards = 0
    
    for step in range(max_steps):
        action = np.argmax(q_table[state])  # TODO your policy here!
        state, reward, done, info = env.step(action)
        total_rewards += reward
        if done:
            break
    rewards.append(total_rewards)        

print('Average score over time:', sum(rewards) / episodes)

Average score over time: 8.513


# Goal 2

Determine efficiency of different numbers of training episodes

In [58]:
import time

epsilon = 1
alpha = 0.05
gamma = 0.95
n_episodes = [10000, 20000]

for n in n_episodes:
    start_time = time.time()
    q_table = np.zeros((500, 6))
    for episode in range(n):
        state = env.reset()
        for step in range(1000):
            if np.random.random() < epsilon:
                action = np.random.choice(6)
            else:
                actions = np.argmax(q_table[state])
            last_state = state
            state, reward, done, info = env.step(action)
            q_table = update_q_table(q_table, reward, last_state, action, state, alpha, gamma)

            if done:
                break

        epsilon -= .9/n
    episodes = 1000
    rewards = []
    max_steps = 99

    for episode in range(episodes):
        state = env.reset()  # Assuming you already have env created as above
        total_rewards = 0
    
        for step in range(max_steps):
            action = np.argmax(q_table[state])  # TODO your policy here!
            state, reward, done, info = env.step(action)
            total_rewards += reward
            if done:
                break
        rewards.append(total_rewards)        

    print('Average score over time:', sum(rewards) / episodes)
    print("--- %s seconds ---" % (time.time() - start_time))

Average score over time: 8.462
--- 99.19117069244385 seconds ---
Average score over time: -536.508
--- 183.213951587677 seconds ---


Something is wrong in the loop... don't understand where it is so I will define separate functions. I will use 10,000 training episodes because for time's sake.

In [82]:
def train_policy(alpha, gamma, epsilon_decay):
    epsilon = 1
    q_table = np.zeros((500, 6))
    for episode in range(10000):
        state = env.reset()
        for step in range(1000):
            if np.random.random() < epsilon:
                action = np.random.choice(6)
            else:
                actions = np.argmax(q_table[state])
            last_state = state
            state, reward, done, info = env.step(action)
            q_table = update_q_table(q_table, reward, last_state, action, state, alpha, gamma)

            if done:
                break

        epsilon -= epsilon_decay/10000
    print('--------------\nAlpha, Gamma, Epsilon Decay Rate:', [alpha, gamma, epsilon_decay])
    return q_table

In [83]:
def test_policy(q_table):
    episodes = 1000
    rewards = []
    max_steps = 99
    
    for episode in range(episodes):
        state = env.reset()  # Assuming you already have env created as above
        total_rewards = 0
    
        for step in range(max_steps):
            action = np.argmax(q_table[state])  # TODO your policy here!
            state, reward, done, info = env.step(action)
            total_rewards += reward
            if done:
                break
        rewards.append(total_rewards)        
    
    print('Average score over time:', sum(rewards) / episodes)

In [84]:
#alphas = [0.25, 0.05, 0.75, 0.1]
#gammas = [0.98, 0.95, 0.9, 0.85]
epsilon_decay_rates = [0.9, 0.95]
alpha = 0.95
gamma = 0.05
epsilon
#for gamma in gammas:
#    for alpha in alphas:
for epsilon_decay in epsilon_decay_rates:
    q_table = train_policy(alpha, gamma, epsilon_decay)
    test_policy(q_table)

--------------
Alpha, Gamma, Epsilon Decay Rate: [0.95, 0.05, 0.9]
Average score over time: -8.885
--------------
Alpha, Gamma, Epsilon Decay Rate: [0.95, 0.05, 0.95]
Average score over time: -8.879


In [85]:
q_table = train_policy(alpha, gamma, 0.9)

--------------
Alpha, Gamma, Epsilon Decay Rate: [0.95, 0.05, 0.9]


In [86]:
test_policy(q_table)

Average score over time: -8.549
