In [1]:
%matplotlib tk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb

In [2]:
def change_range(values, vmin=0, vmax=1):
    """Linearly rescale ``values`` so that they span approximately ``[vmin, vmax]``.

    The smallest element maps to ``vmin``.  A small constant (1e-7) is added to
    the denominator so that an input whose elements are all equal does not
    cause a division by zero; in that case every element maps to ``vmin``.
    """
    shifted = values - np.min(values)
    unit_scaled = shifted / (np.max(shifted) + 1e-7)
    return unit_scaled * (vmax - vmin) + vmin

In [3]:
class GridWorld:
    """10x10 grid world whose row 7 is a puddle that can only be crossed by jumping.

    Positions are ``(row, col)`` tuples.  The player starts at ``(9, 0)`` and the
    episode ends when it reaches ``(0, 9)``.  Actions are integers:
    0 = up, 1 = down, 2 = right, 3 = left, 4 = jump.

    The matplotlib figure is created lazily on the first call to ``render`` so
    that constructing an environment for training does not open a window.
    """

    # HSV colours (each component scaled to [0, 1]) used to paint the cells.
    terrain_color = dict(normal=[127/360, 0, 96/100],
                         objective=[26/360, 100/100, 100/100],
                         puddle=[247/360, 92/100, 70/100],
                         player=[344/360, 93/100, 100/100])

    def __init__(self, jump_success = 0.2):
        """Create the grid; ``jump_success`` is the probability that a jump succeeds."""
        self.player = None
        self._create_grid()
        # Drawing artefacts; populated by _draw_grid() on the first render().
        self.fig = None
        self.ax = None
        self.im = None
        self.q_texts = None
        self.p_jump = jump_success
        # Set to 1 after the first successful jump from row 8 to row 6.
        self.flag = 0

    def _create_grid(self, initial_grid=None):
        """Build the HSV colour grid (``initial_grid`` is accepted but unused)."""
        self.grid = self.terrain_color['normal'] * np.ones((10, 10, 3))
        self._add_objectives(self.grid)

    def _add_objectives(self, grid):
        """Paint the puddle row (third from the bottom) and the goal cell onto ``grid`` in place."""
        grid[-3, :] = self.terrain_color['puddle']
        grid[0, -1] = self.terrain_color['objective']

    def _draw_grid(self):
        """Create the figure, the per-cell text artists and the image used by ``render``."""
        self.fig, self.ax = plt.subplots(figsize=(15, 15))
        self.ax.grid(which='minor')
        # One text artist per cell; text() takes (x, y), hence the reversed (row, col).
        self.q_texts = [self.ax.text(*self._id_to_position(i)[::-1], '0',
                                     fontsize=6, verticalalignment='center',
                                     horizontalalignment='center') for i in range(10 * 10)]

        self.im = self.ax.imshow(hsv_to_rgb(self.grid), cmap='terrain',
                                 interpolation='nearest', vmin=0, vmax=1)
        self.ax.set_xticks(np.arange(10))
        self.ax.set_xticks(np.arange(10) - 0.5, minor=True)
        self.ax.set_yticks(np.arange(10))
        self.ax.set_yticks(np.arange(10) - 0.5, minor=True)

    def reset(self):
        """Put the player back at ``(9, 0)``, clear ``flag`` and return the start state id."""
        self.player = (9, 0)
        self.flag = 0
        return self._position_to_id(self.player)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done)``.

        Moves that would leave the grid are ignored.  Walking up from row 8 or
        down from row 6 (i.e. into the puddle row) sends the player back to
        ``(9, 0)``.  A jump from row 8 lands on row 6 (and vice versa) with
        probability ``p_jump``; otherwise the player stays put.  Any action
        value other than 0-4 leaves the player where it is.

        Reaching ``(0, 9)`` yields reward 1000 and ``done=True``; every other
        step yields reward -0.05.  The returned observation is the state id,
        except that id 19 is reported as 8 (partial observability).
        """
        row, col = self.player

        if action == 0:  # up
            if row == 8:
                self.player = (9, 0)
            elif row > 0:
                self.player = (row - 1, col)
        elif action == 1:  # down
            if row == 6:
                self.player = (9, 0)
            elif row < 9:
                self.player = (row + 1, col)
        elif action == 2:  # right
            if col < 9:
                self.player = (row, col + 1)
        elif action == 3:  # left
            if col > 0:
                self.player = (row, col - 1)
        elif action == 4:  # jump
            if row == 8:
                if np.random.random() < self.p_jump:
                    self.player = (6, col)
                    if self.flag == 0:
                        self.flag = 1
            elif row == 6:
                if np.random.random() < self.p_jump:
                    self.player = (8, col)

        # Rules
        if self.player == (0, 9):
            reward, done = 1000, True
        else:
            reward, done = -0.05, False

        # Partially observable: state 19 is reported as state 8.
        observation = self._position_to_id(self.player)
        if observation == 19:
            observation = 8
        return observation, reward, done

    def _position_to_id(self, pos):
        ''' Maps a (row, col) position to a unique ID (row * 10 + col) '''
        return pos[0] * 10 + pos[1]

    def _id_to_position(self, idx):
        ''' Inverse of _position_to_id: maps an ID back to (row, col) '''
        return (idx // 10), (idx % 10)

    def render(self, q_values=None, q_counts=None, action=None, max_q=False, colorize_q=False, show_count=False):
        """Draw the current state, optionally annotated with per-cell values.

        Parameters
        ----------
        q_values : indexable by state id, optional
            When given, each cell's text shows either the maximum value
            (``max_q=True``), the entries of ``q_counts`` (``show_count=True``)
            or all five action values.
        q_counts : indexable by state id, optional
            Required when ``show_count`` is used.
        action : optional
            Shown as the figure title when not None.
        max_q, colorize_q, show_count : bool
            Display switches; ``max_q`` takes precedence over ``show_count``.
            ``colorize_q`` sets each cell's saturation from the rescaled
            per-state maximum of ``q_values``.

        Raises
        ------
        AssertionError
            If ``reset`` has not been called, or a display switch is used
            without the data it needs.
        """
        # Validate all arguments up front, before touching the figure.
        assert self.player is not None, 'You first need to call .reset()'
        if colorize_q:
            assert q_values is not None, 'q_values must not be None for using colorize_q'
        if q_values is not None and show_count and not max_q:
            assert q_counts is not None, 'q_counts must not be None for using show_count'

        # The figure is not created in __init__, so build it on first use.
        if getattr(self, 'im', None) is None:
            self._draw_grid()

        if colorize_q:
            grid = self.terrain_color['normal'] * np.ones((10, 10, 3))
            values = change_range(np.max(q_values, -1)).reshape(10, 10)
            grid[:, :, 1] = values
            self._add_objectives(grid)
        else:
            grid = self.grid.copy()

        grid[self.player] = self.terrain_color['player']
        self.im.set_data(hsv_to_rgb(grid))

        if q_values is not None:
            labels = ['U', 'D', 'R', 'L', 'J']
            for i, text in enumerate(self.q_texts):
                if max_q:
                    txt = '{:.2f}'.format(max(q_values[i]))
                elif show_count:
                    txt = '\n'.join(['{}: {:.2f}'.format(k, q) for k, q in zip(labels, q_counts[i])])
                else:
                    txt = '\n'.join(['{}: {:.2f}'.format(k, q) for k, q in zip(labels, q_values[i])])
                text.set_text(txt)

        if action is not None:
            self.ax.set_title(action, color='r', weight='bold', fontsize=32)

        plt.pause(0.1)

In [4]:
# Integer codes for the five actions, in the order GridWorld.step interprets them.
UP, DOWN, RIGHT, LEFT, Jump = range(5)
# Display names, indexed by action code.
actions = ['UP', 'DOWN', 'RIGHT', 'LEFT', 'JUMP']

In [5]:
# Environment instance; each jump attempt succeeds with probability 0.2.
env = GridWorld(jump_success=0.2)


In [6]:
# The number of states is simply the number of "squares" in our grid world, in this case 10 * 10
num_states = 10 * 10
# We have 5 possible actions: up, down, right, left and jump
num_actions = 5

# Tabular action values: one row per state, one column per action, initialised to zero
q_values = np.zeros((num_states, num_actions))

In [7]:
# View the Q-table as a DataFrame: one row per state, one labelled column per action.
df = pd.DataFrame(q_values, columns=[' up ', 'down', 'right', 'left', 'jump'])
df.index.name = 'States'
# Last expression of the cell, so the first five rows are displayed.
df.head()

Unnamed: 0_level_0,up,down,right,left,jump
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


In [8]:
def manh_d(s1, s2):
    """Return the Manhattan (L1) distance between the first two components of ``s1`` and ``s2``."""
    first_gap = abs(s1[0] - s2[0])
    second_gap = abs(s1[1] - s2[1])
    return first_gap + second_gap

In [9]:
def PBRS(state, u0=10, u1=100):
    """State potential used for potential-based reward shaping.

    The row is ``state // 10``.  Rows above 7 (8 and 9) get ``u0 / 5``, rows
    below 7 get ``u1 / 5`` and row 7 itself gets 0.
    """
    row = state // 10
    if row > 7:
        return u0 / 5
    if row < 7:
        return u1 / 5
    return 0

In [10]:
def PBA(state, action, u0=10, u1=100):
    """State-action potential used as advice for reward shaping.

    ``state`` is decoded as ``(row, col) = (state // 10, state % 10)``.

    * Rows above 7: on row 8 the jump action (4) gets ``u0*0.6`` and every
      other action ``u0*0.1``; on any other such row all actions get ``u0*0.2``.
    * Rows below 7: on row 0 moving right (2) gets ``u1*0.6``; in column 9 of
      the other rows moving up (0) gets ``u1*0.6``; elsewhere up and right
      each get ``u1*0.7/2``.  All remaining actions get ``u1*0.1``.
    * Row 7: 0.
    """
    row, col = state // 10, state % 10

    if row > 7:
        if row != 8:
            return u0*0.2
        return u0*0.6 if action == 4 else u0*0.1

    if row < 7:
        if row == 0:
            return u1*0.6 if action == 2 else u1*0.1
        if col == 9:
            return u1*0.6 if action == 0 else u1*0.1
        return u1*0.7/2 if action in [0, 2] else u1*0.1

    return 0

In [11]:
def egreedy_policy(q_values, state, epsilon=0.1):
    '''
    Choose an action based on an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action with the largest value in ``q_values[state]``.

    The number of actions is taken from ``len(q_values[state])`` rather than
    being hard-coded, so the function works for any action-space size.
    '''
    if np.random.random() < epsilon:
        # Explore: uniform over the actions available in this state.
        return np.random.choice(len(q_values[state]))
    # Exploit: greedy action (first index on ties, per np.argmax).
    return np.argmax(q_values[state])

In [12]:
def choose_action(q_values, state, epsilon=0.1):
    '''
    Sample an action from the softmax (Boltzmann) distribution over ``q_values[state]``.

    Action ``a`` is drawn with probability proportional to
    ``exp(q_values[state][a])``.  ``epsilon`` is not used; it is accepted only
    so the signature stays compatible with existing callers.

    Returns the sampled action index as a plain Python ``int``.
    '''
    prefs = np.asarray(q_values[state], dtype=float)
    # Subtracting the maximum leaves the softmax unchanged mathematically but
    # keeps np.exp from overflowing to inf (which would produce NaN
    # probabilities) when preferences grow large.
    exp_prefs = np.exp(prefs - np.max(prefs))
    prob = exp_prefs / np.sum(exp_prefs)
    # Draw a scalar (no size argument) so int() is not applied to an array.
    return int(np.random.choice(len(prob), p=prob))

In [13]:
def clamp(n, minn=-500, maxn=500):
    """Restrict ``n`` to the interval ``[minn, maxn]``.

    The upper bound is applied first and the lower bound second, so if
    ``minn > maxn`` the result is ``minn``.
    """
    capped = n if n < maxn else maxn
    return minn if minn > capped else capped

In [14]:
def Actor_critic(env, adv = 0, num_episodes=1000, learning_rate_p=0.001, learning_rate_q=0.05, gamma=1.0):
    """Tabular one-step actor-critic with optional advice-based reward shaping.

    The critic is a state-value table updated with the TD error; the actor is
    a table of softmax preferences sampled through ``choose_action``.  Table
    sizes come from the module-level ``num_states`` and ``num_actions``.

    Parameters
    ----------
    env : object with ``reset()`` and ``step(action)`` (e.g. ``GridWorld``)
    adv : int
        Shaping mode.  1 adds ``gamma*PBRS(next_state) - PBRS(state)`` to the
        reward; 2 adds the look-back term
        ``PBA(state, action) - PBA(prev_state, prev_action)/gamma``;
        any other value uses the raw reward.
    num_episodes : int
        Number of training episodes.
    learning_rate_p, learning_rate_q : float
        Step sizes for the policy preferences and the value table.
    gamma : float
        Discount factor.  Must be non-zero when ``adv == 2`` (it is a divisor).

    Returns
    -------
    ep_rewards_train : list
        Sum of (unshaped) rewards per episode.
    q_values : numpy.ndarray
        An all-zero ``(num_states, num_actions)`` array.  It is never updated
        here and is returned only to keep the return signature stable.
    ep_next_action_pred : list
        Per episode, the fraction of steps on which the action sampled in
        advance for the next state equalled the action actually taken there
        (NaN for an episode with fewer than two steps).
    """
    q_values = np.zeros((num_states, num_actions))
    policy_para = np.zeros((num_states, num_actions))
    # 1-D so that TD errors are scalars; a (num_states, 1) table would yield
    # 1-element arrays that then get assigned into scalar slots of policy_para.
    v_values = np.zeros(num_states)
    ep_rewards_train = []
    ep_next_action_pred = []

    for _ in range(num_episodes):
        state = env.reset()
        # "Previous" state/action used by the look-back shaping on the first step.
        state_p = state
        action_p = 0
        done = False
        reward_sum_train = 0
        current_steps_train = 0
        next_action_save = []
        action_save = []

        while not done:
            action = choose_action(policy_para, state)
            next_state, reward, done = env.step(action)
            # Sampled only to measure how often it matches the action really
            # taken on the following step; it does not drive the update below.
            next_action = choose_action(policy_para, next_state)

            next_action_save.append(next_action)
            action_save.append(action)

            # Only the shaped reward differs between the advice modes.
            if adv == 1:
                shaped_reward = reward + gamma*PBRS(next_state, 10, 100) \
                                    - PBRS(state, 10, 100)
            elif adv == 2:
                # Look-back advice.  A look-forward alternative would use
                # gamma*PBA(next_state, next_action) - PBA(state, action).
                shaped_reward = reward + PBA(state, action, 10, 100) \
                                    - PBA(state_p, action_p, 10, 100)/gamma
            else:
                shaped_reward = reward

            # Critic update.
            td_target = shaped_reward + gamma * v_values[next_state]
            td_error = td_target - v_values[state]
            v_values[state] += learning_rate_q * td_error

            # Actor update: only the preference of the action taken is changed,
            # scaled by (1 - softmax probability of that action).
            log_grad = 1 - np.exp(policy_para[state][action])/np.sum(np.exp(policy_para[state]))
            policy_para[state][action] += learning_rate_p*td_error*log_grad

            reward_sum_train += reward
            current_steps_train += 1

            state_p = state
            action_p = action
            state = next_state

            # Safety cap on episode length.
            if current_steps_train>1000:
                break

        ep_rewards_train.append(reward_sum_train)

        if len(action_save) > 1:
            matched = np.array(action_save[1:]) == np.array(next_action_save[:-1])
            ep_next_action_pred.append(np.mean(matched))
        else:
            # No (prediction, actual) pair exists for a one-step episode.
            ep_next_action_pred.append(np.nan)

    return ep_rewards_train, q_values, ep_next_action_pred

In [15]:
# Experiment: train several independent runs and average the per-episode return.
NUM_RUNS = 10        # independent training runs to average over
NUM_EPISODES = 200   # episodes per run; also the x-axis extent of the plot
# Advice mode passed to Actor_critic (parameter adv):
# 0: no advice
# 1: Uniform SAS
# 2: NonUniform SAS
ADVICE_MODE = 2

ep_reward_train_store = []
ep_next_action_pred_store = []
env = GridWorld(jump_success=0.2)

for ii in range(NUM_RUNS):
    ep_rewards_train, q_values, ep_next_action_pred = Actor_critic(
        env, adv=ADVICE_MODE, num_episodes=NUM_EPISODES,
        learning_rate_p=0.001, learning_rate_q=0.2)
    ep_reward_train_store.append(ep_rewards_train)
    ep_next_action_pred_store.append(ep_next_action_pred)
    print(ii)

# Average over runs (axis 0) to get one value per episode, then build a flat
# line at the overall mean for reference in the plot.
avg_reward_train = np.mean(np.array(ep_reward_train_store), axis=0)
mean_reward_train = [np.mean(avg_reward_train)] * len(avg_reward_train)


fig, ax = plt.subplots()
ax.set_xlabel('Episodes')
ax.set_ylabel('Rewards')
ax.plot(avg_reward_train)
ax.plot(mean_reward_train, 'g--')
ax.set_xlim([0, NUM_EPISODES])


print('Mean rewards: {}'.format(mean_reward_train[0]))

0
1
2
3
4
5
6
7
8
9
Mean rewards: 973.894125
