In [1]:
# c_maze.py

# Maze class that holds all maze information: obstacles, the start state, and the goal state(s)
import random
import time


# -------------------------------------------------------------
# |     |     |     |     |     |     |     |  x  |  x  |     |
# |(0,0)|(0,1)|(0,2)|(0,3)|(0,4)|(0,5)|(0,6)|(0,7)|(0,8)|(0,9)
# -------------------------------------------------------------
# |     |     |     |     |  x  |  x  |     |     |     |     |
# |(1,0)|(1,1)|(1,2)|(1,3)|(1,4)|(1,5)|(1,6)|(1,7)|(1,8)|(1,9)
# -------------------------------------------------------------
# |     |  S  |  x  |     |  x  |  x  |     |     |  x  |     |
# |(2,0)|(2,1)|(2,2)|(2,3)|(2,4)|(2,5)|(2,6)|(2,7)|(2,8)|(2,9)
# -------------------------------------------------------------
# |     |     |  x  |     |     |     |     |     |  x  |     |
# |(3,0)|(3,1)|(3,2)|(3,3)|(3,4)|(3,5)|(3,6)|(3,7)|(3,8)|(3,9)
# -------------------------------------------------------------
# |     |     |  x  |     |     |     |     |  x  |  x  |     |
# |(4,0)|(4,1)|(4,2)|(4,3)|(4,4)|(4,5)|(4,6)|(4,7)|(4,8)|(4,9)
# -------------------------------------------------------------
# |     |     |     |     |     |     |     |  x  |  x  |  G  |
# |(5,0)|(5,1)|(5,2)|(5,3)|(5,4)|(5,5)|(5,6)|(5,7)|(5,8)|(5,9)
# -------------------------------------------------------------

class Maze():
    """Deterministic grid-world maze with obstacles, a start cell and goal cells.

    States are ``(row, col)`` tuples with ``0 <= row < MAZE_HEIGHT`` and
    ``0 <= col < MAZE_WIDTH``. Moving off the grid or into an obstacle leaves
    the agent where it is. Reaching a goal state yields reward 1.0 and ends
    the episode; every other transition yields reward 0.0.
    """

    def __init__(self):
        # Horizontal size of the maze (number of columns)
        self.MAZE_WIDTH = 10

        # Vertical size of the maze (number of rows)
        self.MAZE_HEIGHT = 6

        # All possible actions
        self.ACTION_UP = 0
        self.ACTION_DOWN = 1
        self.ACTION_LEFT = 2
        self.ACTION_RIGHT = 3
        self.ACTIONS = [
            self.ACTION_UP,
            self.ACTION_DOWN,
            self.ACTION_LEFT,
            self.ACTION_RIGHT
        ]

        # Display symbols, indexed by action: UP, DOWN, LEFT, RIGHT
        self.ACTION_SYMBOLS = ["↑", "↓", "←", "→"]
        self.NUM_ACTIONS = len(self.ACTIONS)

        # Start state position
        self.START_STATE = (2, 1)

        # Terminal (goal) state positions
        self.GOAL_STATES = [(5, 9)]

        # Obstacle positions
        self.OBSTACLES = [
            (0, 7), (0, 8),
            (1, 4), (1, 5),
            (2, 2), (2, 4), (2, 5), (2, 8),
            (3, 2), (3, 8),
            (4, 2), (4, 7), (4, 8),
            (5, 7), (5, 8)
        ]

        # Agent position; stays None until reset() is called
        self.current_state = None

    def reset(self):
        """Move the agent to the start state and return that state."""
        self.current_state = self.START_STATE
        return self.current_state

    def step(self, action):
        """Apply ``action`` from the current state.

        ``reset()`` must have been called first, because the current state is
        unpacked as a ``(row, col)`` pair. An action that is not one of
        ``ACTIONS`` leaves the position unchanged.

        Returns:
            A 4-tuple ``(next_state, reward, done, info)`` where ``next_state``
            is a ``(row, col)`` tuple, ``reward`` is 1.0 when ``next_state`` is
            a goal state and 0.0 otherwise, ``done`` tells whether a goal state
            was reached, and ``info`` is always ``None``.
        """
        row, col = self.current_state

        # Clamp every move to the grid so the agent can never leave it.
        if action == self.ACTION_UP:
            row = max(row - 1, 0)
        elif action == self.ACTION_DOWN:
            row = min(row + 1, self.MAZE_HEIGHT - 1)
        elif action == self.ACTION_LEFT:
            col = max(col - 1, 0)
        elif action == self.ACTION_RIGHT:
            col = min(col + 1, self.MAZE_WIDTH - 1)

        # Bumping into an obstacle cancels the move.
        if (row, col) in self.OBSTACLES:
            row, col = self.current_state

        self.current_state = (row, col)

        done = self.current_state in self.GOAL_STATES
        reward = 1.0 if done else 0.0

        return self.current_state, reward, done, None

    def render(self):
        """Print the textual picture of the maze."""
        print(self.__str__())

    def get_random_action(self):
        """Return one action drawn uniformly at random from ``ACTIONS``."""
        return random.choice(self.ACTIONS)

    def __str__(self):
        """Return a multi-line picture of the maze.

        Cell markers: ``S`` start, ``G`` goal, ``*`` agent, ``x`` obstacle.
        ``S`` and ``G`` take precedence over ``*``. Before ``reset()`` the
        agent has no position, so no ``*`` is drawn.
        """
        border = "-------------------------------------------------------------\n"

        # current_state is None before reset(); compare whole positions
        # instead of indexing into it so that case renders cleanly.
        agent = None if self.current_state is None else tuple(self.current_state)

        pieces = []
        for i in range(self.MAZE_HEIGHT):
            pieces.append(border)

            row = ['| ']
            for j in range(self.MAZE_WIDTH):
                pos = (i, j)
                if pos == self.START_STATE:
                    t = "S"
                elif pos in self.GOAL_STATES:
                    t = "G"
                elif pos == agent:
                    t = "*"
                elif pos in self.OBSTACLES:
                    t = "x"
                else:
                    t = " "
                row.append(" {0} ".format(t) + ' | ')
            pieces.append("".join(row) + "\n")

            # Coordinate labels printed underneath each row of cells.
            pieces.append(
                "".join("|({0},{1})".format(i, j) for j in range(self.MAZE_WIDTH))
            )
            pieces.append("|\n")

        pieces.append(border)
        return "".join(pieces)

In [2]:
# b_random_walk.py

import random
import time

# -------------------------------
# T1 0 1 2 3 4 T2
# -------------------------------


class RandomWalk():
    """One-dimensional random-walk environment laid out as ``T1 0 1 ... n-1 T2``.

    Internal states are the integers ``0 .. num_internal_states - 1``; the two
    terminal states are the strings ``'T1'`` (left end) and ``'T2'`` (right
    end). Transitions are deterministic given the action, and terminal states
    are absorbing.
    """

    def __init__(
            self,
            num_internal_states=5,        # number of internal (non-terminal) states
            transition_reward=0.0,        # reward for an ordinary transition
            left_terminal_reward=0.0,     # reward for the action that moves
                                          # into the left terminal state
            right_terminal_reward=1.0     # reward for the action that moves
                                          # into the right terminal state
    ):
        self.__version__ = "0.0.1"

        self.num_internal_states = num_internal_states

        # Internal states plus the two terminal states
        self.num_states = num_internal_states + 2
        self.STATES = list(range(num_internal_states))
        self.TERMINAL_STATES = ['T1', 'T2']

        # All possible actions
        self.ACTION_LEFT = 0
        self.ACTION_RIGHT = 1
        self.ACTION_SYMBOLS = ["\u2190", "\u2192"]

        # In any non-terminal state the agent moves either left or right
        self.ACTIONS = [
            self.ACTION_LEFT,
            self.ACTION_RIGHT
        ]
        self.NUM_ACTIONS = len(self.ACTIONS)

        # Start state: the middle internal state
        self.START_STATE = self.STATES[num_internal_states // 2]

        self.transition_reward = transition_reward

        self.left_terminal_reward = left_terminal_reward

        self.right_terminal_reward = right_terminal_reward

        # Agent position; stays None until reset() or moveto() is called
        self.current_state = None

    def reset(self):
        """Move the agent to the start state and return that state."""
        self.current_state = self.START_STATE
        return self.current_state

    def moveto(self, state):
        """Place the agent directly on ``state`` (no validation is performed)."""
        self.current_state = state

    def get_next_state(self, state, action):
        """Return the state reached by taking ``action`` in ``state``.

        A terminal state is absorbing and is returned unchanged, whatever the
        action.

        Raises:
            ValueError: if ``state`` is non-terminal and ``action`` is neither
                ``ACTION_LEFT`` nor ``ACTION_RIGHT``.
        """
        if state in self.TERMINAL_STATES:
            return state

        if action == self.ACTION_LEFT:
            # Stepping left from the leftmost internal state ends the walk.
            return 'T1' if state == 0 else state - 1

        if action == self.ACTION_RIGHT:
            # Stepping right from the rightmost internal state ends the walk.
            if state == self.num_internal_states - 1:
                return 'T2'
            return state + 1

        raise ValueError("unknown action: {0!r}".format(action))

    def get_reward(self, state, next_state):
        """Return the reward for the transition ``state -> next_state``.

        The terminal rewards are paid only for the move that *enters* a
        terminal state. A transition that starts in a terminal state is the
        absorbing self-loop and pays 0.0, so the terminal reward is not
        collected again on later calls.
        """
        if state in self.TERMINAL_STATES:
            return 0.0

        if next_state == 'T1':
            return self.left_terminal_reward
        if next_state == 'T2':
            return self.right_terminal_reward
        return self.transition_reward

    def get_state_action_probability(self, state, action):
        """Return ``(next_state, reward, prob)`` for taking ``action`` in ``state``.

        The environment is deterministic, so ``prob`` is always 1.0.
        """
        next_state = self.get_next_state(state, action)

        reward = self.get_reward(state, next_state)
        prob = 1.0

        return next_state, reward, prob

    def step(self, action):
        """Take ``action`` in the current state and advance the environment.

        Returns:
            A 4-tuple ``(next_state, reward, done, info)``; ``done`` is True
            when ``next_state`` is a terminal state and ``info`` is always
            ``None``.
        """
        next_state = self.get_next_state(
            state=self.current_state, action=action
        )

        reward = self.get_reward(self.current_state, next_state)

        self.current_state = next_state

        done = self.current_state in self.TERMINAL_STATES

        return next_state, reward, done, None

    def render(self, mode='human'):
        """Print the textual picture of the walk; ``mode`` is not used."""
        print(self.__str__(), end="\n\n")

    def get_random_action(self):
        """Return one action drawn uniformly at random from ``ACTIONS``."""
        return random.choice(self.ACTIONS)

    def __str__(self):
        """Return a two-line picture: the state labels, then ``*`` under the agent.

        Raises:
            ValueError: if the current state is not a known state, which
                includes the ``None`` held before ``reset()``.
        """
        header = " T1 " + " ".join(
            "{0}".format(i) for i in range(self.num_internal_states)
        ) + " T2\n"

        # Width of the padding that puts '*' under the current state's label.
        if self.current_state in self.STATES:
            blank = "    " + "  " * self.current_state
        elif self.current_state == 'T1':
            blank = " "
        elif self.current_state == 'T2':
            blank = "  " + "  " * (self.num_internal_states + 1)
        else:
            raise ValueError(
                "cannot render unknown state {0!r}; call reset() first".format(
                    self.current_state
                )
            )

        return header + blank + "*"

In [None]:
# b_random_walk.py

import random
import time

# -------------------------------
# T1 0 1 2 3 4 T2
# -------------------------------


class RandomWalk():
    """One-dimensional random-walk environment laid out as ``T1 0 1 ... n-1 T2``.

    Internal states are the integers ``0 .. num_internal_states - 1``; the two
    terminal states are the strings ``'T1'`` (left end) and ``'T2'`` (right
    end). Transitions are deterministic given the action, and terminal states
    are absorbing.
    """

    def __init__(
            self,
            num_internal_states=5,        # number of internal (non-terminal) states
            transition_reward=0.0,        # reward for an ordinary transition
            left_terminal_reward=0.0,     # reward for the action that moves
                                          # into the left terminal state
            right_terminal_reward=1.0     # reward for the action that moves
                                          # into the right terminal state
    ):
        self.__version__ = "0.0.1"

        self.num_internal_states = num_internal_states

        # Internal states plus the two terminal states
        self.num_states = num_internal_states + 2
        self.STATES = list(range(num_internal_states))
        self.TERMINAL_STATES = ['T1', 'T2']

        # All possible actions
        self.ACTION_LEFT = 0
        self.ACTION_RIGHT = 1
        self.ACTION_SYMBOLS = ["\u2190", "\u2192"]

        # In any non-terminal state the agent moves either left or right
        self.ACTIONS = [
            self.ACTION_LEFT,
            self.ACTION_RIGHT
        ]
        self.NUM_ACTIONS = len(self.ACTIONS)

        # Start state: the middle internal state
        self.START_STATE = self.STATES[num_internal_states // 2]

        self.transition_reward = transition_reward

        self.left_terminal_reward = left_terminal_reward

        self.right_terminal_reward = right_terminal_reward

        # Agent position; stays None until reset() or moveto() is called
        self.current_state = None

    def reset(self):
        """Move the agent to the start state and return that state."""
        self.current_state = self.START_STATE
        return self.current_state

    def moveto(self, state):
        """Place the agent directly on ``state`` (no validation is performed)."""
        self.current_state = state

    def get_next_state(self, state, action):
        """Return the state reached by taking ``action`` in ``state``.

        A terminal state is absorbing and is returned unchanged, whatever the
        action.

        Raises:
            ValueError: if ``state`` is non-terminal and ``action`` is neither
                ``ACTION_LEFT`` nor ``ACTION_RIGHT``.
        """
        if state in self.TERMINAL_STATES:
            return state

        if action == self.ACTION_LEFT:
            # Stepping left from the leftmost internal state ends the walk.
            return 'T1' if state == 0 else state - 1

        if action == self.ACTION_RIGHT:
            # Stepping right from the rightmost internal state ends the walk.
            if state == self.num_internal_states - 1:
                return 'T2'
            return state + 1

        raise ValueError("unknown action: {0!r}".format(action))

    def get_reward(self, state, next_state):
        """Return the reward for the transition ``state -> next_state``.

        The terminal rewards are paid only for the move that *enters* a
        terminal state. A transition that starts in a terminal state is the
        absorbing self-loop and pays 0.0, so the terminal reward is not
        collected again on later calls.
        """
        if state in self.TERMINAL_STATES:
            return 0.0

        if next_state == 'T1':
            return self.left_terminal_reward
        if next_state == 'T2':
            return self.right_terminal_reward
        return self.transition_reward

    def get_state_action_probability(self, state, action):
        """Return ``(next_state, reward, prob)`` for taking ``action`` in ``state``.

        The environment is deterministic, so ``prob`` is always 1.0.
        """
        next_state = self.get_next_state(state, action)

        reward = self.get_reward(state, next_state)
        prob = 1.0

        return next_state, reward, prob

    def step(self, action):
        """Take ``action`` in the current state and advance the environment.

        Returns:
            A 4-tuple ``(next_state, reward, done, info)``; ``done`` is True
            when ``next_state`` is a terminal state and ``info`` is always
            ``None``.
        """
        next_state = self.get_next_state(
            state=self.current_state, action=action
        )

        reward = self.get_reward(self.current_state, next_state)

        self.current_state = next_state

        done = self.current_state in self.TERMINAL_STATES

        return next_state, reward, done, None

    def render(self, mode='human'):
        """Print the textual picture of the walk; ``mode`` is not used."""
        print(self.__str__(), end="\n\n")

    def get_random_action(self):
        """Return one action drawn uniformly at random from ``ACTIONS``."""
        return random.choice(self.ACTIONS)

    def __str__(self):
        """Return a two-line picture: the state labels, then ``*`` under the agent.

        Raises:
            ValueError: if the current state is not a known state, which
                includes the ``None`` held before ``reset()``.
        """
        header = " T1 " + " ".join(
            "{0}".format(i) for i in range(self.num_internal_states)
        ) + " T2\n"

        # Width of the padding that puts '*' under the current state's label.
        if self.current_state in self.STATES:
            blank = "    " + "  " * self.current_state
        elif self.current_state == 'T1':
            blank = " "
        elif self.current_state == 'T2':
            blank = "  " + "  " * (self.num_internal_states + 1)
        else:
            raise ValueError(
                "cannot render unknown state {0!r}; call reset() first".format(
                    self.current_state
                )
            )

        return header + blank + "*"