In [1]:
import tkinter as tk
import numpy as np
import time

In [2]:

class Agent:
    
    # Agent will hold a map (which represent record table)
    def __init__(self, table_size=(5,5), epsilon=0.9, alpha=0.1, gamma=0.8, action_list=['up','down','left','right']):
        self._TABLE_SIZE = table_size
        self._EPSILON = epsilon
        self._ALPHA = alpha # learning rate
        self._GAMMA = gamma
        
        self.ACTION_SIZE = len(action_list)
        self.ACTION_LIST = action_list
        
        self.table = np.zeros(table_size + (len(action_list),), dtype='float16')  
    
    def new_episode(self):
        self.state = (0,0)
        return self.state
    
    def learn(self):
        pass
    
class QLearningAgent(Agent):
    def __init__(self, table_size=(5,5), epsilon=0.9, alpha=0.1, gamma=0.8, action_list=['up','down','left','right']):
        super().__init__(table_size=table_size, epsilon=epsilon, alpha=alpha, gamma=gamma, action_list=action_list)
    
    def learn(self, reward, action, next_state):
        action_idx = self.ACTION_LIST.index(action)        
        origin = self.table[self.state+(action_idx,)]
        #### choose max reward from next_state
        prediction = self._GAMMA * self.table[next_state].max()
        
        self.table[self.state+(action_idx,)] = origin + self._ALPHA*(reward+prediction-origin)
        
    def choose_action(self, state):
        # epsilon-greedy
        rate = np.random.rand()
        if rate > self._EPSILON or (self.table[state].max()==0 and self.table[state].min()==0):
            # choose randomly
            action = np.random.choice(self.ACTION_LIST)
        elif self.table[state].max()==0 and self.table[state].min!=0:
            # choose randomly from non-negative reward action
            p_list = np.array([0] * self.ACTION_SIZE)
            count= 0
            for idx, reward in enumerate(self.table[state]):
                if reward==0:
                    p_list[idx]=1
                    count+=1
            p_list = list(p_list/count)
            actionIdx = np.random.choice(self.ACTION_SIZE, 1, p_list)[0]
            action = self.ACTION_LIST[actionIdx]
        else:
            # choose the action which contain max reward
            actionIdx_with_highestReward = self.table[state].argmax()
            action = self.ACTION_LIST[actionIdx_with_highestReward]
        
        return action
    
class SarsaAgent(Agent):
    def __init__(self, table_size=(5,5), epsilon=0.9, alpha=0.1, gamma=0.8, action_list=['up','down','left','right']):
        super().__init__(table_size=table_size, epsilon=epsilon, alpha=alpha, gamma=gamma, action_list=action_list)
    
    def learn(self, reward, action, next_action, next_state):
        action_idx = self.ACTION_LIST.index(action)
        next_action_idx = self.ACTION_LIST.index(next_action)
        
        origin = self.table[self.state+(action_idx,)]
        ### choose exact reward according to next state and action
        prediction = self._GAMMA * self.table[next_state+(next_action_idx,)]
        
        self.table[self.state+(action_idx,)] = origin + self._ALPHA*(reward+prediction-origin)
    
    def choose_action(self, state):
        # epsilon-greedy
        rate = np.random.rand()
        if rate > self._EPSILON or (self.table[state].max()==0 and self.table[state].min()==0):
            # choose randomly
            action = np.random.choice(self.ACTION_LIST)
        
        else:
            actionIdx_with_highestReward = self.table[state].argmax()
            action = self.ACTION_LIST[actionIdx_with_highestReward]
        
        return action

In [14]:
class Env(tk.Tk):
    def __init__(self, size=(5,5), target=(3,3), target_reward=10,
                 fail_list=[(3,2),(2,3)], fail_punishment_list=[-5, -5],
                 wall_punishment = -1, pxl_unit=40):
        super().__init__()
        self.SIZE = size
        self.PXL_UNIT = pxl_unit
        self.title('maze')
        self.geometry('{0}x{1}'.format(size[0] * pxl_unit, size[1] * pxl_unit * 4))
        
#         self._MAP_SIZE = map_size
        self._WALL_PUNISHMENT = wall_punishment
        
        self.REWARD_MAP = self._assign_reward_to_map(target, target_reward,
                                               fail_list, fail_punishment_list)
    
        self._build_maze(target, fail_list)
    
    def _assign_reward_to_map(self, target, target_reward, fail_list, fail_punishment_list):
        
        tmp_map = np.zeros(self.SIZE, dtype='int')
        
        # assign reward when reach the target 
        tmp_map[target] = target_reward
        
        # assign failure punishment
        for coordinate, punishment in zip(fail_list, fail_punishment_list):
            tmp_map[coordinate] = punishment
        
        return tmp_map

    def _build_maze(self, target, fail_list):
        self.canvas = tk.Canvas(self, bg='gray',
                           height=self.SIZE[0] * self.PXL_UNIT,
                           width= self.SIZE[1] * self.PXL_UNIT)

        # create grids
        for c in range(0, self.SIZE[1] * self.PXL_UNIT, self.PXL_UNIT):
            x0, y0, x1, y1 = c, 0, c, self.SIZE[1] * self.PXL_UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, self.SIZE[0] * self.PXL_UNIT, self.PXL_UNIT):
            x0, y0, x1, y1 = 0, r, self.SIZE[0] * self.PXL_UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # create origin
        origin = np.array([20, 20])
        
        # create oval
        target_center = origin + np.array(target) * self.PXL_UNIT
        self.oval = self.canvas.create_oval(
            target_center[0] - 15, target_center[1] - 15,
            target_center[0] + 15, target_center[1] + 15,
            fill='yellow')
        
        for coordinate in fail_list:
            center = origin + np.array(coordinate) * self.PXL_UNIT
            self.canvas.create_rectangle(
                center[1] - 15, center[0] - 15,
                center[1] + 15, center[0] + 15,
                fill='black')
        
#         # hell
#         hell1_center = origin + np.array([self.PXL_UNIT * 2, self.PXL_UNIT])
#         self.hell1 = self.canvas.create_rectangle(
#             hell1_center[0] - 15, hell1_center[1] - 15,
#             hell1_center[0] + 15, hell1_center[1] + 15,
#             fill='black')
#         # hell
#         hell2_center = origin + np.array([self.PXL_UNIT, self.PXL_UNIT * 2])
#         self.hell2 = self.canvas.create_rectangle(
#             hell2_center[0] - 15, hell2_center[1] - 15,
#             hell2_center[0] + 15, hell2_center[1] + 15,
#             fill='black')

        # create red rect :current state
        self.agent_rect = self.canvas.create_rectangle(
            origin[0] - 10, origin[1] - 10,
            origin[0] + 10, origin[1] + 10,
            fill='red')

        # pack all
        self.canvas.pack()
    
    def take_action(self, state, action):
        reward = 0
        next_state = state
        terminal = False
        move = (0,0)
        if action=='up':
            if state[0]==0:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                move = (-1,0)
                next_state = (state[0]-1, state[1])
                reward = self.REWARD_MAP[state]
        elif action=='down':
            if state[0]==self.SIZE[0]-1:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                move=(1,0)
                next_state = (state[0]+1, state[1])
                reward = self.REWARD_MAP[state]
        elif action=='left':
            if state[1]==0:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                move=(0,-1)
                next_state = (state[0], state[1]-1)
                reward = self.REWARD_MAP[state]
        elif action=='right':
            if state[1]==self.SIZE[1]-1:
                next_state = state # stay in place
                reward = self._WALL_PUNISHMENT
            else:
                move=(0,1)
                next_state = (state[0], state[1]+1)
                reward = self.REWARD_MAP[state]
        
        self.canvas.move(self.agent_rect, move[1]*self.PXL_UNIT, move[0]*self.PXL_UNIT)  # move agent
#         self.update()
        # check if terminal
        if self.REWARD_MAP[next_state]!=0:
            terminal=True
            
        return next_state, reward, terminal
    
    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.agent_rect)
        origin = np.array([20, 20])
        self.agent_rect = self.canvas.create_rectangle(
            origin[0] - 10, origin[1] - 10,
            origin[0] + 10, origin[1] + 10,
            fill='red')
        # return observation
#         return self.canvas.coords(self.rect)
    def showEnvInfo(self, next_state, reward, terminal):
        print("-->{}".format(next_state),end='')
        if terminal:
            if self.REWARD_MAP[next_state]>0:
                print("  >>>win<<<")
            elif self.REWARD_MAP[next_state]<0:
                print("  >>>fail<<<")
            else:
                print("(*&)(*({)(something wrong~")
        self.update()
   
    def render(self):
#         time.sleep(0.1)
        self.update()

In [15]:
######################################
EPISODE = 30
STEP_DELAY = 1
EPISODE_DELAY = 2
######################################
# main function
def process_Sarsa(agent):
    for epi in range(EPISODE):
        state = agent.new_episode() # init state
        action = agent.choose_action(agent.state)

        print("episode : {}".format(epi),end='\n')
        terminal = False
        while not terminal:
            next_state, reward, terminal = env.take_action(state, action)
            next_action = agent.choose_action(next_state)
            ### update sarsa table
            agent.learn(reward, action, next_action, next_state)
            ###
            
            state = agent.state = next_state
            action = next_action
            time.sleep(STEP_DELAY)
            
        time.sleep(EPISODE_DELAY)

def process_QLearning():
    agent = q_agent
    for epi in range(EPISODE):
        env.reset()
        state = agent.new_episode()
        
        print("{}".format(agent.state),end='')
        terminal = False
        while not terminal:
            state = agent.state
            action = agent.choose_action(state)
            next_state, reward, terminal = env.take_action(state, action)
            ### update sarsa table
            agent.learn(reward, action, next_state)
            ###
            env.showEnvInfo(next_state, reward, terminal)
            agent.state = next_state
            time.sleep(STEP_DELAY)
        time.sleep(EPISODE_DELAY)
    
################################################
env = Env(fail_list=[(2,3),(3,1)])
s_agent = SarsaAgent() # init agent and Q-table
q_agent = QLearningAgent()


env.after(1000, process_QLearning)
env.mainloop()

# process_Sarsa(s_agent)


(0, 0)-->(0, 0)-->(0, 0)-->(0, 0)-->(0, 0)-->(0, 1)-->(0, 2)-->(0, 1)-->(0, 1)-->(0, 1)-->(0, 1)-->(0, 1)-->(0, 2)-->(0, 3)-->(0, 3)-->(0, 4)

Exception in Tkinter callback
Traceback (most recent call last):
  File "/Users/bl515/anaconda/lib/python3.6/tkinter/__init__.py", line 1699, in __call__
    return self.func(*args)
  File "/Users/bl515/anaconda/lib/python3.6/tkinter/__init__.py", line 745, in callit
    func(*args)
  File "<ipython-input-15-5c2aee173092>", line 38, in process_QLearning
    next_state, reward, terminal = env.take_action(state, action)
  File "<ipython-input-14-a6785d908062>", line 122, in take_action
    self.canvas.move(self.agent_rect, move[1]*self.PXL_UNIT, move[0]*self.PXL_UNIT)  # move agent
  File "/Users/bl515/anaconda/lib/python3.6/tkinter/__init__.py", line 2585, in move
    self.tk.call((self._w, 'move') + args)
_tkinter.TclError: invalid command name ".!canvas"


In [None]:
a = (1,2)
np.array(a)