In [76]:
import numpy as np
import time

In [1]:
class Env:
    def __init__(self, map_size=(5,5), target=(3,3), target_reward=10,
                 fail_list=[(3,2),(2,3)], fail_punishment_list=[-5, -5],
                 wall_punishment = -1):
        self._MAP_SIZE = map_size
        self._WALL_PUNISHMENT = wall_punishment
        
        self.TARGET = target
        self.FAIL_LIST = fail_list
        self.REWARD_MAP = self._assign_reward_to_map(target, target_reward,
                                               fail_list, fail_punishment_list)
        
    def _assign_reward_to_map(self, target, target_reward, fail_list, fail_punishment_list):
        tmp_map = np.zeros(self._MAP_SIZE, dtype='int')
        target = self.TARGET
        fail_list = self.FAIL_LIST
        
        # assign reward when reach the target 
        tmp_map[target] = target_reward
        
        # assign failure punishment
        for coordinate, punishment in zip(fail_list, fail_punishment_list):
            tmp_map[coordinate] = punishment
        
        return tmp_map
    
    def take_action(self, state, action):
        reward = 0
        next_state = state
        terminal = False
        
        if action=='up':
            if state[0]==0:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                next_state = (state[0]-1, state[1])
                reward = self.REWARD_MAP[state]
        elif action=='down':
            if state[0]==self._MAP_SIZE[0]-1:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                next_state = (state[0]+1, state[1])
                reward = self.REWARD_MAP[state]
        elif action=='left':
            if state[1]==0:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                next_state = (state[0], state[1]-1)
                reward = self.REWARD_MAP[state]
        elif action=='right':
            if state[1]==self._MAP_SIZE[1]-1:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                next_state = (state[0], state[1]+1)
                reward = self.REWARD_MAP[state]
        
        # check if terminal
        if self.REWARD_MAP[next_state]!=0:
            terminal=True
            
        return next_state, reward, terminal
    
    
    def showEnvInfo(self, next_state, reward, terminal):
        print("-->{}".format(next_state),end='')
        if terminal:
            if self.REWARD_MAP[next_state]>0:
                print("  >>>win<<<")
            elif self.REWARD_MAP[next_state]<0:
                print("  >>>fail<<<")
            else:
                print("(*&)(*({)(something wrong~")
    
    def render(self):
        time.sleep(0.1)
        self.update()

In [110]:

class Agent:
    
    # Agent will hold a map (which represent record table)
    def __init__(self, table_size=(5,5), epsilon=0.9, alpha=0.1, gamma=0.8, action_list=['up','down','left','right']):
        self._TABLE_SIZE = table_size
        self._EPSILON = epsilon
        self._ALPHA = alpha # learning rate
        self._GAMMA = gamma
        
        self.ACTION_SIZE = len(action_list)
        self.ACTION_LIST = action_list
        
        self.table = np.zeros(table_size + (len(action_list),), dtype='float16')  
    
    def new_episode(self):
        self.state = (0,0)
        return self.state
    
    def learn(self):
        pass
    
class QLearningAgent(Agent):
    def __init__(self, table_size=(5,5), epsilon=0.9, alpha=0.1, gamma=0.8, action_list=['up','down','left','right']):
        super().__init__(table_size=table_size, epsilon=epsilon, alpha=alpha, gamma=gamma, action_list=action_list)
    
    def learn(self, reward, action, next_state):
        action_idx = self.ACTION_LIST.index(action)        
        origin = self.table[self.state+(action_idx,)]
        #### choose max reward from next_state
        prediction = self._GAMMA * self.table[next_state].max()
        
        self.table[self.state+(action_idx,)] = origin + self._ALPHA*(reward+prediction-origin)
        
    def choose_action(self, state):
        # epsilon-greedy
        rate = np.random.rand()
        if rate > self._EPSILON or (self.table[state].max()==0 and self.table[state].min()==0):
            # choose randomly
            action = np.random.choice(self.ACTION_LIST)
        elif self.table[state].max()==0 and self.table[state].min!=0:
            # choose randomly from non-negative reward action
            p_list = np.array([0] * self.ACTION_SIZE)
            count= 0
            for idx, reward in enumerate(self.table[state]):
                if reward==0:
                    p_list[idx]=1
                    count+=1
            p_list = list(p_list/count)
            actionIdx = np.random.choice(self.ACTION_SIZE, 1, p_list)[0]
            action = self.ACTION_LIST[actionIdx]
        else:
            # choose the action which contain max reward
            actionIdx_with_highestReward = self.table[state].argmax()
            action = self.ACTION_LIST[actionIdx_with_highestReward]
        
        return action
    
class SarsaAgent(Agent):
    def __init__(self, table_size=(5,5), epsilon=0.9, alpha=0.1, gamma=0.8, action_list=['up','down','left','right']):
        super().__init__(table_size=table_size, epsilon=epsilon, alpha=alpha, gamma=gamma, action_list=action_list)
    
    def learn(self, reward, action, next_action, next_state):
        action_idx = self.ACTION_LIST.index(action)
        next_action_idx = self.ACTION_LIST.index(next_action)
        
        origin = self.table[self.state+(action_idx,)]
        ### choose exact reward according to next state and action
        prediction = self._GAMMA * self.table[next_state+(next_action_idx,)]
        
        self.table[self.state+(action_idx,)] = origin + self._ALPHA*(reward+prediction-origin)
    
    def choose_action(self, state):
        # epsilon-greedy
        rate = np.random.rand()
        if rate > self._EPSILON or (self.table[state].max()==0 and self.table[state].min()==0):
            # choose randomly
            action = np.random.choice(self.ACTION_LIST)
        
        else:
            actionIdx_with_highestReward = self.table[state].argmax()
            action = self.ACTION_LIST[actionIdx_with_highestReward]
        
        return action

In [115]:


            
######################################

EPISODE = 30
STEP_DELAY = 0.1
EPISODE_DELAY = 2

######################################
# main function
def process_Sarsa(agent):
    for epi in range(EPISODE):
        state = agent.new_episode() # init state
        action = agent.choose_action(agent.state)

        print("{}".format(state),end='')
        terminal = False
        while not terminal:
            next_state, reward, terminal = env.take_action(state, action)
            next_action = agent.choose_action(next_state)
            ### update sarsa table
            agent.learn(reward, action, next_action, next_state)
            ###
            env.showEnvInfo(next_state, reward, terminal)
            
            state = agent.state = next_state
            action = next_action
            time.sleep(STEP_DELAY)
            
        time.sleep(EPISODE_DELAY)

def process_QLearning(agent):
    for epi in range(EPISODE):
        state = agent.new_episode()
        
        print("{}".format(agent.state),end='')
        terminal = False
        while not terminal:
            state = agent.state
            action = agent.choose_action(state)
            next_state, reward, terminal = env.take_action(state, action)
            ### update sarsa table
            agent.learn(reward, action, next_state)
            ###
            env.showEnvInfo(next_state, reward, terminal)
            agent.state = next_state
            time.sleep(STEP_DELAY)
            
        time.sleep(EPISODE_DELAY)
    
################################################
env = Env(fail_list=[(2,3),(3,1)])
s_agent = SarsaAgent() # init agent and Q-table
q_agent = QLearningAgent()
    
process_QLearning(q_agent)
# process_Sarsa(s_agent)


(0, 0)-->(0, 0)-->(0, 1)-->(0, 1)-->(0, 2)-->(0, 1)-->(0, 2)-->(0, 3)-->(1, 3)-->(1, 2)-->(1, 3)-->(2, 3)  >>>fail<<<
(0, 0)-->(0, 0)-->(0, 0)-->(0, 0)-->(1, 0)-->(1, 1)-->(0, 1)-->(0, 0)-->(0, 0)-->(0, 0)-->(0, 1)-->(0, 2)-->(0, 3)-->(0, 4)-->(0, 4)-->(0, 4)-->(0, 4)-->(0, 3)-->(0, 4)-->(1, 4)-->(1, 4)-->(2, 4)-->(1, 4)-->(1, 3)-->(1, 4)-->(0, 4)-->(1, 4)-->(1, 3)-->(1, 2)-->(0, 2)-->(1, 2)-->(0, 2)-->(0, 3)-->(0, 3)-->(0, 4)-->(0, 3)-->(0, 2)-->(0, 3)-->(0, 2)-->(1, 2)-->(0, 2)-->(1, 2)-->(2, 2)-->(3, 2)-->(2, 2)-->(2, 1)-->(2, 0)-->(1, 0)-->(1, 0)-->(2, 0)-->(2, 0)-->(2, 1)-->(2, 0)-->(3, 0)-->(3, 0)-->(4, 0)-->(4, 0)-->(4, 0)-->(3, 0)-->(3, 0)-->(2, 0)-->(3, 0)-->(2, 0)-->(1, 0)-->(0, 0)-->(1, 0)-->(2, 0)-->(3, 0)-->(3, 1)  >>>fail<<<
(0, 0)-->(1, 0)-->(1, 0)-->(1, 1)-->(1, 2)-->(2, 2)-->(2, 3)  >>>fail<<<
(0, 0)-->(0, 0)-->(0, 1)-->(0, 1)-->(0, 2)-->(0, 1)-->(0, 1)-->(1, 1)-->(1, 2)-->(2, 2)-->(1, 2)-->(0, 2)-->(0, 2)-->(0, 2)-->(0, 2)-->(0, 1)-->(0, 2)-->(0, 3)-->(0, 3)-->(1, 3)-

In [116]:
q_agent.table

array([[[-0.93505859,  0.        , -0.890625  ,  0.        ],
        [-0.87841797,  0.        ,  0.        ,  0.        ],
        [-0.68603516,  0.        ,  0.        ,  0.        ],
        [-0.56982422,  0.        ,  0.        ,  0.        ],
        [-0.27099609,  0.        ,  0.        , -0.52197266]],

       [[ 0.        ,  0.        , -0.65136719,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , -0.46875   ]],

       [[ 0.        ,  0.        , -0.61279297,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , -0.18994141]],

       [[ 0.        ,  0.        , -0.68603516,  0.        ],
  

In [23]:
import numpy as np
# np.random.rand(1)[0]

np.random.random_sample()

0.9757747874034515

In [13]:
a = (2,3)
b= (2,)
a+b

(2, 3, 2)

In [24]:

vocb_arr = ['pooh', 'rabbit', 'piglet', 'Christopher']
np.random.choice(vocb_arr, replace=False, p=[0.5, 0.1, 0.1, 0.3])

'pooh'

In [32]:
a = np.array([[[1,2],[3,4]],[[5,6],[7,8]]])
a[(1,0)]

array([5, 6])

In [31]:
a = (2,1)
b= (3,0)
a +b

(2, 1, 3, 0)

In [34]:
class a:
    def __init__(self):
        pass
    def printA(self):
        print('a')
class b(a):
    def __init__(self):
        pass
    def printA(self):
        print('b')
        
B = b()
B.printA()

b


In [103]:
np.random.choice(5, 1, p=[0.1, 0, 0.3, 0.6, 0])


array([2])