In [10]:
import numpy as np
import pandas as pd
import time

In [11]:
np.random.seed(1)

In [12]:
N_STATES = 8
ACTIONS = ['left', 'right'] 
EPSILON = 0.9
ALPHA = 0.1 
GAMMA = 0.9 
MAX_EPISODES = 40
FRESH_TIME = 0.1

In [13]:
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states,len(actions))),
        columns=actions
    )
    return table

In [14]:
def choose_action(state, q_table):
    state_actions = q_table.iloc[state,:]
    if (np.random.uniform()>EPSILON) or (state_actions.all()==0):
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.idxmax()
    return action_name

In [15]:
def get_env_feedback(S,A):
    if A == 'right':
        if S == N_STATES - 2:
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:
        R = 0
        if S == 0:
            S_ = S
        else:
            S_ = S - 1
            
    return S_, R

In [16]:
def update_env(S, episode, step_counter):
    env_list = ['-']*(N_STATES-1) + ['T']
    if S == 'terminal':
        interaction = 'Episode %s: total_step = %s' %(episode, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r',end='')  
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')#
        time.sleep(FRESH_TIME)

In [17]:
def rl():
    q_table = build_q_table(N_STATES,ACTIONS)
    for episode in range(MAX_EPISODES):
        stepcounter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, stepcounter)
        while not is_terminated:
            
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            q_predict = q_table.loc[S,A]
            
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_,:].max()
            else:
                q_target = R
                is_terminated = True
                
            q_table.loc[S,A] += ALPHA * (q_target - q_predict)
            S = S_
            
            stepcounter+=1
            update_env(S, episode, stepcounter)
    print(q_table)    
    return q_table

In [18]:
if __name__ == "__main__":
    q_table = rl()

           left     right
0  5.597938e-03  0.063449
1  9.087641e-09  0.138166
2  3.407406e-03  0.265018
3  7.786201e-06  0.445637
4  5.127062e-02  0.641600
5  2.891858e-02  0.837911
6  1.471208e-01  0.985219
7  0.000000e+00  0.000000
