In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
np.random.seed(2)   # seed NumPy's global RNG so the random draws below repeat across runs
N_STATES = 6        # number of cells in the 1-D world; the last cell is the treasure
ACTIONS = ['left', 'right']     # available actions (also the Q-table column labels)
EPSILON = 0.9       # greedy-policy threshold: choose_action explores when uniform() > EPSILON
ALPHA = 0.1         # learning rate (step size of each Q-value update)
GAMMA = 0.9         # discount factor applied to the next state's best Q-value
MAX_EPISODES = 13   # number of training episodes
FRESH_TIME = 0.3    # seconds to pause after drawing each non-terminal move

In [3]:
def build_q_table(N_STATES=N_STATES, ACTIONS=ACTIONS):
    """Create the initial Q-table with every entry set to zero.

    Parameters
    ----------
    N_STATES : int
        Number of rows; one row per state, labelled by the default integer index.
    ACTIONS : list
        Column labels; one column per action.

    Returns
    -------
    pandas.DataFrame
        A ``N_STATES`` x ``len(ACTIONS)`` frame of zeros.
    """
    shape = (N_STATES, len(ACTIONS))
    zero_values = np.zeros(shape)
    return pd.DataFrame(zero_values, columns=ACTIONS)

def choose_action(state, q_table):
    """Pick the next action for ``state`` from the Q-table.

    Parameters
    ----------
    state : int
        Positional row of ``q_table`` describing the current state.
    q_table : pandas.DataFrame
        Q-values, one row per state and one column per action.

    Returns
    -------
    The chosen action label: the column with the largest Q-value in the row,
    or a uniformly random entry of ``ACTIONS`` when exploring.
    """
    q_row = q_table.iloc[state, :]
    # Always draw the uniform sample first (exactly once per call) so the
    # global RNG stream is consumed in a fixed order.
    explore = np.random.uniform() > EPSILON
    if not explore and not (q_row == 0).all():
        # Greedy move: a row with at least one non-zero value has been learned.
        return q_row.idxmax()
    # Either the draw exceeded EPSILON or the row is still all zeros (untrained).
    return np.random.choice(ACTIONS)

def get_env_feedback(state, action):
    """Apply ``action`` in ``state`` and report the environment's response.

    Parameters
    ----------
    state : int
        Current position in the 1-D world.
    action : str
        ``'right'`` moves right; any other value is treated as a move left.

    Returns
    -------
    tuple
        ``(next_state, reward)``. The reward is 1 only for the move from
        ``N_STATES - 2`` onto the last cell, otherwise 0.
    """
    if action != 'right':
        # Moving left: position 0 is a wall, so the agent stays put there.
        return (state if state == 0 else state - 1), 0
    if state == N_STATES - 2:
        # Stepping right from the second-to-last cell reaches the treasure.
        return N_STATES - 1, 1
    return state + 1, 0

def update_env(state, episode, step_count):
    """Redraw the 1-D world on the current console line.

    Parameters
    ----------
    state : int
        Agent position to draw.
    episode : int
        Zero-based episode index; printed as ``episode + 1`` in the summary.
    step_count : int
        Number of steps taken so far in this episode; printed in the summary.

    While the agent has not reached the last cell, the world is printed with
    an ``'o'`` at the agent's position (no newline) followed by a pause of
    ``FRESH_TIME`` seconds. On the last cell, the bare world and a summary
    line for the episode are printed instead, with no pause.
    """
    last_cell = N_STATES - 1
    if state != last_cell:
        cells = ['-'] * last_cell + ['T']
        cells[state] = 'o'
        # '\r' returns to the start of the line so each frame overwrites the previous one.
        print('\r' + ''.join(cells), end='')
        time.sleep(FRESH_TIME)
        return
    print('\r' + '-' * last_cell + 'T')
    print('Episode {}: total_steps = {}'.format(episode+1, step_count))

def train():
    """Run tabular Q-learning on the 1-D world for ``MAX_EPISODES`` episodes.

    Every episode starts at state 0 and ends when the agent reaches the last
    state (``N_STATES - 1``). After each move, the Q-value of the
    (state, action) pair just taken is moved by ``ALPHA`` towards the one-step
    target ``reward + GAMMA * max(Q[next_state])`` (just ``reward`` on the
    terminal move), and the world is redrawn through ``update_env``.

    Returns
    -------
    tuple
        ``(q_table, action_list)``: the learned Q-table and the actions taken
        during the final episode only (the list is reset at the start of each
        episode). ``action_list`` is empty when ``MAX_EPISODES`` is 0.
    """
    q_table = build_q_table()
    # Bind the name before the loop so the return below is valid even when no
    # episode runs (previously an UnboundLocalError for MAX_EPISODES == 0).
    action_list = []
    for episode in range(MAX_EPISODES):
        action_list = []              # actions of the current episode only
        state = 0                     # every episode starts at the leftmost cell
        is_terminated = False
        while not is_terminated:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            estimate_q = q_table.loc[state, action]             # current estimate
            if next_state != N_STATES-1:
                # Non-terminal: bootstrap from the best Q-value of the next state.
                real_q = reward + GAMMA * q_table.iloc[next_state, :].max()
            else:
                # Terminal: there is no future value, the target is the reward itself.
                real_q = reward
                is_terminated = True
            q_table.loc[state, action] += ALPHA * (real_q - estimate_q)  # move estimate towards target
            state = next_state
            action_list.append(action)
            update_env(state, episode, len(action_list))
    return q_table, action_list

In [4]:
# Run the full training loop; `action_list` holds only the actions of the final episode.
q_table, action_list = train()
# Show the learned Q-values (rows: states, columns: actions) and the last episode's moves.
print('\nQ-table:\n', q_table)
print('action_list: ', action_list)

-----T
Episode 1: total_steps = 38
-----T
Episode 2: total_steps = 22
-----T
Episode 3: total_steps = 9
-----T
Episode 4: total_steps = 5
-----T
Episode 5: total_steps = 7
-----T
Episode 6: total_steps = 5
-----T
Episode 7: total_steps = 5
-----T
Episode 8: total_steps = 5
-----T
Episode 9: total_steps = 5
-----T
Episode 10: total_steps = 5
-----T
Episode 11: total_steps = 5
-----T
Episode 12: total_steps = 7
-----T
Episode 13: total_steps = 5

Q-table:
        left     right
0  0.000000  0.004320
1  0.000000  0.025005
2  0.000030  0.111241
3  0.000000  0.368750
4  0.027621  0.745813
5  0.000000  0.000000
action_list:  ['right', 'right', 'right', 'right', 'right']
