In [1]:
import sys
import gym
import numpy as np
from collections import defaultdict
import pdb
from Environment import *

In [2]:
# Create the environment used by every later cell. `Environment` comes from the
# star-import at the top of the notebook.
# NOTE(review): '4x4' presumably selects the 4x4 grid layout — confirm in Environment.
env = Environment('4x4')

In [3]:
# Start from a random policy: a 4x4 object array holding one action letter per
# grid cell, each drawn independently from `possible_actions`.
possible_actions = ['l','r','d','u']
policy = np.empty((4, 4), dtype=object)
for row, col in np.ndindex(4, 4):
    # One scalar draw per cell, visited in row-major order.
    policy[row, col] = possible_actions[np.random.randint(0, 4)]

print(policy)

[['d' 'l' 'r' 'u']
 ['l' 'r' 'r' 'l']
 ['r' 'u' 'r' 'd']
 ['r' 'u' 'l' 'u']]


In [4]:
# Action letter -> slot in the length-4 per-state arrays (u, d, r, l order).
action_to_num = dict(zip('udrl', range(4)))

# Inverse lookup. Keys are the slot numbers rendered as strings, so callers
# must wrap an integer index in str() before looking it up.
num_to_action = {str(slot): letter for letter, slot in action_to_num.items()}

In [5]:
def generateEpisode(env,policy,max_step):
    """Roll out one episode in `env`, mostly following `policy`.

    At every step the action stored in `policy` for the environment's current
    position is taken with probability 0.85; otherwise one of the four actions
    is drawn uniformly at random through `num_to_action`.

    Parameters
    ----------
    env : object
        Must provide `reset()`, `step(action)` returning a 3-tuple
        `(new_state, reward, flag)`, and an indexable `current_position`
        whose first two entries index `policy`.
    policy : 2-D array of action letters, indexed as `policy[row, col]`.
    max_step : int
        Upper bound on the number of transitions recorded.

    Returns
    -------
    list of (state, action, reward)
        One tuple per transition, where `state` is the state the action was
        taken from. The list holds at most `max_step` entries and always at
        least one, so callers can safely unzip it. The rollout stops early as
        soon as a reward of 10 is received.
    """
    state = env.reset()
    episode = []
    step = 0
    while True:
        if np.random.rand() < 0.85:
            # Follow the policy for the cell the environment reports.
            row, col = env.current_position[0], env.current_position[1]
            action = policy[row, col]
        else:
            # Exploration: pick one of the four actions at random.
            action = num_to_action[str(np.random.randint(0,4))]

        # The third value returned by env.step is not used here.
        # NOTE(review): presumably a "done" flag — confirm in Environment.
        new_state, reward, _flag = env.step(action)

        episode.append((state, action, reward))
        state = new_state

        # Count the transition *before* testing the cap so that exactly
        # `max_step` transitions are recorded (not max_step + 1), and so that
        # a non-positive cap still terminates after a single transition.
        step += 1
        if reward == 10 or step >= max_step:
            break
    return episode

In [6]:
# generateEpisode(env,policy,100)

In [7]:
def qEstimation(env,policy,num_episodes, gamma=0.9, max_step=200):
    """Every-visit Monte Carlo estimate of action values with greedy improvement.

    For each of `num_episodes` episodes produced by `generateEpisode`, the
    discounted return following every (state, action) occurrence is added to a
    running total, and the action value is the mean of those returns. After
    each update the entry of `policy` for that state is overwritten with the
    action of highest estimated value.

    Parameters
    ----------
    env : environment object, passed through to `generateEpisode`.
    policy : 2-D array of action letters indexed as `policy[row, col]`.
        **Modified in place.**
    num_episodes : int
        Number of episodes to sample.
    gamma : float, default 0.9
        Discount factor.
    max_step : int, default 200
        Step cap passed to `generateEpisode`.

    Returns
    -------
    defaultdict
        Maps each visited state to a NumPy array of action values, ordered
        according to `action_to_num`.
    """
    n_actions = len(action_to_num)
    G_value = defaultdict(lambda: np.zeros(n_actions))          # summed returns
    number_of_visit = defaultdict(lambda: np.zeros(n_actions))  # visit counts
    Q_value = defaultdict(lambda: np.zeros(n_actions))          # mean return

    for _ in range(num_episodes):
        episode = generateEpisode(env,policy,max_step)
        if not episode:
            # Nothing to learn from; also avoids unpacking an empty zip.
            continue
        states, actions, rewards = zip(*episode)

        # Discounted return from every time step, computed in one backward
        # pass: G_t = r_t + gamma * G_{t+1}. This replaces re-summing the whole
        # reward tail for each step, which was quadratic in episode length.
        returns = [0.0] * len(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running

        # Updates are applied in forward (chronological) order, so when a state
        # repeats within an episode its last visit decides the policy entry.
        for t, state in enumerate(states):
            a = action_to_num[actions[t]]
            G_value[state][a] += returns[t]
            number_of_visit[state][a] += 1.0
            Q_value[state][a] = G_value[state][a] / number_of_visit[state][a]
            # Greedy improvement; num_to_action is keyed by the index as a string.
            policy[state[0],state[1]] = num_to_action[str(np.argmax(Q_value[state]))]

    return Q_value

In [8]:
# Initial random policy, displayed before any learning has taken place.
policy

array([['d', 'l', 'r', 'u'],
       ['l', 'r', 'r', 'l'],
       ['r', 'u', 'r', 'd'],
       ['r', 'u', 'l', 'u']], dtype=object)

In [9]:
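# Grid layout, one letter per cell, listed row by row starting from row 0.
# NOTE(review): letters presumably mean s=start, h=hole, f=free cell, g=goal —
# confirm against Environment.
# ['s','h','h','g']
# ['f','h','h','f']
# ['f','h','h','f']
# ['f','f','f','f']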

In [12]:
# Learned greedy policy (labelled "optimal policy" in the original notebook).
# qEstimation rewrites `policy` in place, so this shows the result of the
# In [11] cell below, which was executed before this In [12] cell.
policy

array([['d', 'l', 'r', 'u'],
       ['d', 'l', 'r', 'u'],
       ['d', 'd', 'r', 'u'],
       ['r', 'r', 'r', 'u']], dtype=object)

In [11]:
# Sample 10000 episodes. Besides returning the action-value table, this call
# also overwrites `policy` in place with the greedy action for each visited state.
q = qEstimation(env,policy,10000)


In [13]:
# Inspect the estimated action values. Keys are states; each array is ordered
# u, d, r, l as defined by action_to_num.
q

defaultdict(<function __main__.qEstimation.<locals>.<lambda>()>,
            {(0,
              3): array([10.        ,  8.90357143, 10.        ,  3.4       ]),
             (2,
              1): array([-5.11873176, -4.98053199,  0.20034495,  3.2048366 ]),
             (3,
              1): array([-5.48283367,  3.5542684 ,  5.02188   ,  3.58739383]),
             (1,
              1): array([-6.73585798, -3.09664833,  1.64519636,  0.45443602]),
             (1,
              2): array([ 1.45214528,  0.18459389,  8.53904698, -3.41593286]),
             (2,
              2): array([ 1.36546244,  2.65274548,  6.71065313, -4.59037577]),
             (3,
              2): array([-0.63843871, -0.24039239,  6.08043511,  1.26824004]),
             (3, 3): array([7.33719087, 6.08613732, 6.11427245, 1.31857744]),
             (2, 3): array([8.55353941, 4.10877862, 4.83865058, 0.72970952]),
             (1,
              3): array([10.        ,  7.2913994 ,  8.41355732,  1.40771396]),
           