In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
# Seed NumPy's global RNG so the random draws made during training repeat across runs.
np.random.seed(2)

# --- Environment -----------------------------------------------------------
N_STATES = 6                    # row count of the Q-table and length of the printed track
ACTIONS = ["left", "right"]     # Q-table column labels / the moves the agent can pick

# --- Learning hyper-parameters ---------------------------------------------
EPSILON = 0.9                   # a uniform draw above this triggers a random action
ALPHA = 0.1                     # learning rate
LAMBDA = 0.9                    # discount factor applied to the next state's best value

# --- Run control -----------------------------------------------------------
MAX_EPISODES = 13               # number of episodes rl() iterates over
FRESH_TIME = 0.2                # seconds slept after each non-terminal redraw


In [3]:
def build_q_table(n_state, actions):
    """Return a zero-filled Q-table as a DataFrame.

    The frame has ``n_state`` rows (default integer index) and one column
    per entry of ``actions``, labelled with those entries.
    """
    initial_values = np.zeros((n_state, len(actions)))
    return pd.DataFrame(initial_values, columns=actions)
def choose_action(state, q_table):
    """Pick an action name for ``state`` from ``q_table``.

    A random action from ACTIONS is returned when a uniform draw exceeds
    EPSILON or when every value in the state's row is zero; otherwise the
    column label holding the row's largest value is returned.
    """
    # The row of action values for this state.
    row = q_table.iloc[state, :]

    # The uniform draw always happens first; the all-zero check is only
    # evaluated when the draw did not already request a random action.
    explore = np.random.uniform() > EPSILON or (row == 0).all()
    if not explore:
        # Greedy: label of the highest-valued action in the row.
        return row.idxmax()
    return np.random.choice(ACTIONS)
def get_mov_feedback(S, A):
    """Apply action ``A`` in state ``S`` and return ``(next_state, reward)``.

    * ``'right'`` from state ``N_STATES - 2`` yields ``('terminal', 1)``;
      from any other state it yields ``(S + 1, 0)``.
    * Any other action moves one step down with reward 0, except that
      state 0 stays where it is.
    """
    if A != 'right':
        # Non-right move: state 0 is a wall, everything else steps back by one.
        return (S if S == 0 else S - 1), 0

    if S == N_STATES - 2:
        # Stepping right from the last non-terminal cell ends the episode.
        return 'terminal', 1

    return S + 1, 0
def update_env(S, episode, step_counter):
    """Redraw the one-line text environment on stdout.

    For ``S == 'terminal'`` an episode summary (1-based episode number and
    ``step_counter``) is printed, held for 2 seconds, then overwritten with
    blanks. Otherwise the track is printed with an ``'o'`` at index ``S``
    and the function sleeps for FRESH_TIME seconds.
    """
    if S == 'terminal':
        summary = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(summary), end='')
        time.sleep(2)
        # Blank out the summary so the next drawing starts on a clean line.
        print('\r                                ', end='')
        return

    # N_STATES - 1 dashes followed by the 'T' goal marker.
    track = ['-'] * (N_STATES - 1) + ['T']
    track[S] = 'o'
    print('\r{}'.format(''.join(track)), end='')
    time.sleep(FRESH_TIME)
def rl():
    """Run the Q-learning loop for MAX_EPISODES episodes and return the Q-table.

    Each episode starts at state 0 and repeats choose-action / move / update
    until ``get_mov_feedback`` reports ``'terminal'``. When that happens the
    episode index, the step count and the Q-table (as it is before the final
    update of the episode) are printed.
    """
    q_table = build_q_table(N_STATES, ACTIONS)

    for episode in range(MAX_EPISODES):
        S = 0
        step_counter = 0
        update_env(S, episode, step_counter)

        while True:
            A = choose_action(S, q_table)
            S_, R = get_mov_feedback(S, A)

            # Current estimate for the (state, action) pair just taken.
            q_predict = q_table.loc[q_table.index[S], A]

            episode_over = S_ == 'terminal'
            if episode_over:
                # End of episode: there is no next state to bootstrap from.
                q_target = R
                print("Episode:", episode, " step:", step_counter + 1)
                print(q_table)
            else:
                # Reward plus the discounted best value of the next state.
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()

            # Q(S, A) <- Q(S, A) + ALPHA * (q_target - Q(S, A))
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)

            # Move to the next state and redraw with the updated step count.
            S = S_
            step_counter += 1
            update_env(S, episode, step_counter)

            if episode_over:
                break

    return q_table
# Train the agent, then print the learned table under a heading; the '\n'
# separator reproduces the blank line between the heading and the table.
q_table = rl()
print('\r\nQ-table:\n', q_table, sep='\n')

----oTEpisode: 0  step: 38
   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0
----oT                          Episode: 1  step: 22
   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.100
5   0.0  0.000
----oT                          Episode: 2  step: 9
   left    right
0   0.0  0.00000
1   0.0  0.00000
2   0.0  0.00081
3   0.0  0.02520
4   0.0  0.19000
5   0.0  0.00000
----oT                          Episode: 3  step: 5
   left     right
0   0.0  0.000000
1   0.0  0.000073
2   0.0  0.002997
3   0.0  0.047070
4   0.0  0.271000
5   0.0  0.000000
----oT                          Episode: 4  step: 7
      left     right
0  0.00000  0.000007
1  0.00000  0.000572
2  0.00003  0.006934
3  0.00000  0.073314
4  0.00000  0.343900
5  0.00000  0.000000
----oT                          Episode: 5  step: 5
      left     right
0  0.00000  0.000057
1  0.00000  0.001138
2  0.00003  0.012839
3  0.00000  0.102839
4  0.