In [1]:
import sys
import gym
import numpy as np
from collections import defaultdict
import pdb
# from Environment import *

In [19]:
import numpy as np

class Environment:
    """Deterministic grid world addressed by ``[row, col]`` positions.

    Each cell of the selected map holds a one-character tile code:
    ``'s'`` and ``'f'`` give reward 0, ``'h'`` gives reward -5 and ``'g'``
    gives reward 10 when the agent ends a step on them. Moves that would
    leave the grid are ignored (the agent stays where it is).
    """

    # (row delta, column delta) for every recognised action letter.
    _MOVES = {'u': (-1, 0), 'd': (1, 0), 'r': (0, 1), 'l': (0, -1)}

    # Reward received for ending a step on each tile code.
    _REWARDS = {'g': 10, 'h': -5, 'f': 0, 's': 0}

    def __init__(self,Map,random_start=True):
        """Create the environment.

        Args:
            Map: key into ``self.MAPS`` (only ``'4x4'`` is defined here);
                an unknown key raises ``KeyError``.
            random_start: when true, ``reset()`` places the agent on a
                uniformly random cell; otherwise on ``start_position``.
        """
        self.MAPS ={
               '4x4':np.array([['s','f','f','h'],
                               ['h','h','f','h'],
                               ['h','f','f','h'],
                               ['h','f','f','g']])
        }

        self.NA = 4
        self.rand_start = random_start
        self.selected_map = self.MAPS[Map]
        self.start_position = (0,0)
        self.goal_position  = [3,3]
        self.current_position = [0,0]
        self.actions = ['r','l','d','u']

    def reset(self):
        """Place the agent for a new episode and return its position as a tuple."""
        if self.rand_start:
            # Draw from the actual map extent instead of a hard-coded 4 so
            # the start can never fall outside the selected map.
            n_rows, n_cols = self.selected_map.shape
            self.current_position = [np.random.randint(0, n_rows),
                                     np.random.randint(0, n_cols)]
        else:
            self.current_position = list(self.start_position)
        return tuple(self.current_position)

    def randomReset(self):
        """Move the agent to a random cell and return the position as a new list."""
        n_rows, n_cols = self.selected_map.shape
        # Mutate in place so external references to current_position stay valid.
        self.current_position[0] = np.random.randint(0, n_rows)
        self.current_position[1] = np.random.randint(0, n_cols)
        return self.current_position.copy()

    def render(self):
        """Print the tile layout of the selected map."""
        print(self.selected_map)

    def step(self,action):
        """Apply ``action`` and return ``(position, reward, moved)``.

        Args:
            action: one of ``'u'``, ``'d'``, ``'r'``, ``'l'``. Any other
                value leaves the agent in place.

        Returns:
            A tuple ``((row, col), reward, moved)`` where ``moved`` is True
            only if the position changed. Note that ``moved`` is not an
            episode-termination flag.

        Raises:
            ValueError: if the agent stands on a tile code with no reward
                defined (previously this silently returned ``None``).
        """
        moved = False
        delta = self._MOVES.get(action)
        if delta is not None:
            n_rows, n_cols = self.selected_map.shape
            row = self.current_position[0] + delta[0]
            col = self.current_position[1] + delta[1]
            # Columns are bounded by shape[1]; the old code compared against
            # shape[0], which is only correct for square maps.
            if 0 <= row < n_rows and 0 <= col < n_cols:
                self.current_position[0] = row
                self.current_position[1] = col
                moved = True

        tile = self.getState()
        if tile not in self._REWARDS:
            raise ValueError("unknown tile code %r at position %r"
                             % (str(tile), tuple(self.current_position)))
        return (self.current_position[0],self.current_position[1]),self._REWARDS[tile],moved

    def setPosition(self,x,y):
        """Set the agent position; ``x`` is the row index and ``y`` the column index."""
        self.current_position[0] = x
        self.current_position[1] = y

    def getState(self):
        """Return the tile code under the agent's current position."""
        return self.selected_map[self.current_position[0],self.current_position[1]]

In [20]:
# Build the grid world on the built-in '4x4' map. random_start is left at its
# default (True), so every env.reset() places the agent on a random cell.
env = Environment('4x4')

In [21]:
# Start from an arbitrary policy: one uniformly drawn action letter per cell
# of the 4x4 grid, stored in an object array indexed as policy[row, col].
possible_actions = ['l','r','d','u']
policy = np.empty([4,4],dtype=object)
for i, j in np.ndindex(4, 4):  # row-major, same visiting order as nested loops
    policy[i,j] = possible_actions[np.random.randint(0,4)]

print(policy)

[['r' 'd' 'u' 'l']
 ['l' 'r' 'd' 'u']
 ['r' 'l' 'l' 'l']
 ['u' 'r' 'l' 'r']]


In [22]:
# Action letter -> index of that action inside each per-state Q-value array.
action_to_num = dict(zip('udrl', range(4)))

# Inverse lookup. Keys are the *string* form of the index ('0'..'3'), so
# callers must wrap the integer in str() before looking it up.
num_to_action = {str(index): letter for letter, index in action_to_num.items()}

In [26]:
def generateEpisode(env,policy,max_step):
    """Roll out one episode, following ``policy`` with occasional random actions.

    At every step the policy's action for the current state is used with
    probability 0.85; otherwise one of the four actions is drawn uniformly
    via ``num_to_action``.

    Args:
        env: object exposing ``reset() -> (row, col)`` and
            ``step(action) -> (new_state, reward, moved)``.
        policy: 2-D array of action letters indexed as ``policy[row, col]``.
        max_step: maximum number of steps to take. The previous
            implementation took ``max_step + 1`` steps because the counter
            was compared before being incremented.

    Returns:
        A list of ``(state, action, reward)`` tuples, where ``state`` is the
        position *before* the action was applied. The list is empty when
        ``max_step`` is not positive.
    """
    state = env.reset()
    episode = []
    for _ in range(max_step):
        if np.random.rand() < 0.85:
            # Index with the tracked state rather than reaching into
            # env.current_position; the two always agree.
            action = policy[state[0],state[1]]
        else:
            action = num_to_action[str(np.random.randint(0,4))]
        new_state,reward,_moved = env.step(action)

        episode.append((state,action,reward))
        state = new_state
        # A reward of 10 is the only terminal signal; the third value
        # returned by step() merely says whether the agent moved.
        if reward == 10:
            break
    return episode

In [27]:
# generateEpisode(env,policy,100)

In [28]:
def qEstimation(env,policy,num_episodes, gamma=0.9):
    """Every-visit Monte Carlo estimate of Q with greedy policy updates.

    For each generated episode, the discounted return following every
    (state, action) occurrence is averaged into ``Q``. After each update the
    entry of ``policy`` for that state is overwritten with the action whose
    current Q estimate is largest.

    Side effect: ``policy`` is modified in place.

    Args:
        env: environment passed through to ``generateEpisode``.
        policy: 2-D array of action letters indexed as ``policy[row, col]``.
        num_episodes: number of episodes to generate (each capped at 200
            steps via ``generateEpisode``).
        gamma: discount factor applied per step.

    Returns:
        A ``defaultdict`` mapping state tuples to length-4 arrays of Q
        estimates, ordered according to ``action_to_num``.
    """
    G_value = defaultdict(lambda: np.zeros(4))          # summed returns
    number_of_visit = defaultdict(lambda: np.zeros(4))  # visit counts
    Q_value = defaultdict(lambda: np.zeros(4))          # mean return
    for _ in range(num_episodes):
        episode = generateEpisode(env,policy,200)
        if not episode:
            # Nothing to learn from; unpacking an empty zip would raise.
            continue
        states,actions, rewards = zip(*episode)

        # Discounted return from each step, computed in one backward pass:
        # G_t = r_t + gamma * G_{t+1}. This replaces re-summing the whole
        # reward tail at every step.
        returns = np.zeros(len(rewards))
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running

        # Updates are applied in forward order, as before.
        for t, state in enumerate(states):
            a = action_to_num[actions[t]]
            G_value[state][a] += returns[t]
            number_of_visit[state][a] += 1.0
            Q_value[state][a] = G_value[state][a] / number_of_visit[state][a]
            policy[state[0],state[1]] = num_to_action[str(np.argmax(Q_value[state]))]

    return Q_value

In [29]:
# Randomly initialised policy, shown before qEstimation has modified it.
policy

array([['r', 'd', 'u', 'l'],
       ['l', 'r', 'd', 'u'],
       ['r', 'l', 'l', 'l'],
       ['u', 'r', 'l', 'r']], dtype=object)

In [12]:
# Reference copy of the '4x4' map defined in Environment.MAPS (rows top to bottom):
# ['s','f','f','h'],
# ['h','h','f','h'],
# ['h','f','f','h'],
# ['h','f','f','g']

In [31]:
# Policy after learning. qEstimation rewrites `policy` in place, so this cell
# shows the improved (greedy) policy only when run after the qEstimation call.
policy

array([['r', 'r', 'd', 'l'],
       ['r', 'r', 'd', 'l'],
       ['r', 'd', 'd', 'd'],
       ['r', 'r', 'r', 'd']], dtype=object)

In [30]:
# Run 10000 episodes of Monte Carlo estimation. Besides returning the Q table,
# this call overwrites entries of `policy` with the current greedy action.
q = qEstimation(env,policy,10000)


In [17]:
# Estimated action values for every visited state; each array is ordered
# according to action_to_num, i.e. [u, d, r, l].
q

defaultdict(<function __main__.qEstimation.<locals>.<lambda>()>,
            {(1,
              1): array([ 4.10642925,  6.01689613,  5.83169033, -0.82044123]),
             (1,
              0): array([ 2.51632694, -0.71453171,  5.19659999, -0.33876131]),
             (0,
              0): array([ 2.25487279, -1.29797528,  3.33595925, -0.35599018]),
             (0,
              1): array([ 2.53965523,  5.10129709, -1.656336  ,  1.08246111]),
             (1,
              3): array([-11.62775646,   8.54218783,   2.49010122,   3.8274542 ]),
             (1,
              2): array([-0.18883771,  7.37425796, -0.1266861 ,  4.78441276]),
             (0,
              3): array([ -8.78711189,  -5.92484879, -39.85053756,  -0.98850581]),
             (0,
              2): array([-14.44978603,   6.09176988,  -7.27774317,   4.32610968]),
             (3,
              1): array([ 6.07357671, -0.06328608,  2.71228637, -0.23981143]),
             (3,
              2): array([ 3.69289997,  3.0