<a href="https://colab.research.google.com/github/alpacaYiChun/ML/blob/master/RL1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from abc import ABC, abstractmethod
import numpy as np
import random

class A2C(ABC):
    """Abstract interface for an agent that acts in an environment.

    Concrete agents must implement ``get_value_of_state``,
    ``choose_action``, ``go`` and ``restart``. ``end`` is an optional hook
    with a default implementation that returns ``None``.
    """

    def __init__(self):
        """Initialise the base agent; there is no state to set up here."""

    @abstractmethod
    def get_value_of_state(self, state):
        """Return the agent's value estimate for ``state``."""

    @abstractmethod
    def choose_action(self, state):
        """Return the action the agent picks in ``state``."""

    @abstractmethod
    def go(self):
        """Advance the agent by one step."""

    @abstractmethod
    def restart(self, init_state):
        """Reset the agent so that it begins again from ``init_state``."""

    def end(self):
        """Optional hook; the base implementation does nothing and returns ``None``."""

class ShootEnv(ABC):
    """Abstract interface for an environment an agent can step through.

    Concrete environments must implement ``start`` and ``go``.
    """

    def __init__(self):
        """Initialise the base environment; there is no state to set up here."""

    @abstractmethod
    def start(self, init_state):
        """Place the environment in ``init_state``."""

    @abstractmethod
    def go(self, action):
        """Apply ``action`` to the environment."""

class FiniteQ(A2C):
    """Tabular Q-learning agent over a finite set of states and actions.

    The agent keeps a ``(num_state, num_action)`` table of action values
    and updates one entry per call to ``go``. Exploration decays with the
    number of restarts: a random action is taken with probability
    ``epsilon ** age``.

    Note that the environment is only positioned by ``restart`` (which
    calls ``env.start``); the constructor does not touch the environment.

    Attributes:
        env: environment exposing ``start(init_state)`` and ``go(action)``.
        gamma: discount factor applied to the next state's value.
        num_state: number of rows in the Q table.
        num_action: number of columns in the Q table.
        epsilon: base of the exploration schedule (``eplison`` is kept as
            an alias for the original, misspelled attribute name).
        alpha: learning rate of the Q update.
        current_state: state the agent currently believes it is in.
        q: the Q table, initialised to zeros.
        total_reward: sum of rewards since the last restart.
        step: number of ``go`` calls since the last restart.
        age: starts at -1 and is incremented by every ``restart``.
        end: ``True`` once the environment reported the episode finished.
    """

    def __init__(self, env, gamma, num_state, num_action, epsilon, alpha, init_state):
        super().__init__()
        self.env = env
        self.gamma = gamma
        self.num_state = num_state
        self.num_action = num_action
        self.epsilon = epsilon
        self.alpha = alpha
        self.current_state = init_state
        self.q = np.zeros((num_state, num_action))
        self.total_reward = 0
        self.step = 0
        self.age = -1
        # Stored under a private name: assigning an instance attribute
        # called ``end`` would shadow the ``end`` accessor below.
        self._end = False

    @property
    def eplison(self):
        """Alias for ``epsilon`` under its original misspelled name."""
        return self.epsilon

    @eplison.setter
    def eplison(self, value):
        self.epsilon = value

    @property
    def end(self):
        """Whether the current episode has finished."""
        return self._end

    @end.setter
    def end(self, value):
        self._end = bool(value)

    def get_value_of_state(self, state):
        """Return the largest Q value stored for ``state``."""
        return np.max(self.q[state])

    def choose_action(self, state):
        """Pick an action for ``state`` with decaying random exploration.

        With probability ``epsilon ** age`` a uniformly random action is
        returned; otherwise one of the actions with the highest Q value
        is returned, ties broken uniformly at random.
        """
        draw = np.random.random()
        # Before the first restart ``age`` is -1; clamp the exponent so
        # that epsilon == 0 cannot raise ZeroDivisionError. The clamped
        # threshold is 1, which still means "always explore".
        explore_prob = pow(self.epsilon, max(self.age, 0))
        if draw < explore_prob:
            return random.randint(0, self.num_action - 1)

        best_value = np.max(self.q[state])
        best_actions = [
            action
            for action in range(self.num_action)
            if self.q[state][action] == best_value
        ]
        return random.choice(best_actions)

    def go(self):
        """Take one step in the environment and update the Q table.

        The update target is ``reward`` for a finishing step and
        ``reward + gamma * max_a Q(next_state, a)`` otherwise; the visited
        entry moves towards the target by a fraction ``alpha``.
        """
        state = self.current_state
        action = self.choose_action(state)
        reward, next_state, finished = self.env.go(action)
        print("action={}, result={}, reward={}".format(action, next_state, reward))

        if finished:
            target = reward
            self.end = True
        else:
            target = reward + self.gamma * self.get_value_of_state(next_state)

        self.q[state][action] += self.alpha * (target - self.q[state][action])

        self.total_reward += reward
        self.step += 1
        self.current_state = next_state

    def restart(self, init_state):
        """Begin a new episode from ``init_state``.

        Resets the per-episode counters, repositions the environment and
        increments ``age``; the Q table is kept.
        """
        self.current_state = init_state
        self.step = 0
        self.total_reward = 0
        self.end = False
        self.env.start(init_state)
        self.age += 1

class MazeEnv(ShootEnv):
    """Grid maze in which an agent moves one cell at a time.

    Cells are numbered row by row: the cell at row ``x`` and column ``y``
    is state ``x * n + y``, where ``n`` is the number of columns. A grid
    value of 0 marks a cell that cannot be entered.

    The position (``x``, ``y``) is only assigned by ``start``, so ``start``
    must be called before ``go``.

    Attributes:
        grid: the maze as a sequence of rows.
        m: number of rows.
        n: number of columns (taken from the first row).
        goal: state number that ends the episode when entered.
    """

    # Row/column offsets for each action code.
    _MOVES = {
        0: (-1, 0),  # up
        1: (0, -1),  # left
        2: (1, 0),   # down
        3: (0, 1),   # right
    }

    def __init__(self, grid, goal):
        super().__init__()
        self.grid = grid
        self.m = len(grid)
        self.n = len(grid[0])
        self.goal = goal
        print("{},{}".format(self.m,self.n))

    def start(self, init_state):
        """Place the agent on the cell numbered ``init_state``.

        Raises:
            ValueError: if ``init_state`` is not a valid cell number.
        """
        state = int(init_state)
        if not 0 <= state < self.m * self.n:
            raise ValueError(
                "init_state {} is outside the {}x{} grid".format(
                    init_state, self.m, self.n
                )
            )
        # Integer divmod avoids the float round trip of int(state / n).
        self.x, self.y = divmod(state, self.n)

    def go(self, action):
        """Try to move one cell in the direction given by ``action``.

        Args:
            action: 0 = up, 1 = left, 2 = down, 3 = right.

        Returns:
            A ``(reward, new_state, finished)`` tuple. A move off the grid
            or into a 0 cell leaves the position unchanged and gives -2.
            Entering the goal gives 100 and ``finished=True``. Any other
            successful move gives -1.

        Raises:
            ValueError: if ``action`` is not one of the four known codes.
        """
        try:
            dx, dy = self._MOVES[action]
        except (KeyError, TypeError):
            # Previously any unrecognised action silently moved right.
            raise ValueError("unknown action: {!r}".format(action)) from None

        nx = self.x + dx
        ny = self.y + dy
        inside = 0 <= nx < self.m and 0 <= ny < self.n

        if not inside or self.grid[nx][ny] == 0:
            # Blocked: stay put and pay the penalty; never ends the episode.
            return -2, self.x * self.n + self.y, False

        self.x, self.y = nx, ny
        new_state = self.x * self.n + self.y

        if new_state == self.goal:
            return 100, new_state, True
        return -1, new_state, False

# Maze layout, one list per row. MazeEnv refuses to enter cells holding 0.
grid = [
    [1, 0, 1, 1, 0],
    [1, 1, 1, 0, 1],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 1, 1],
]

# 4 rows x 5 columns gives 20 states; state 19 is row 3, column 4.
env = MazeEnv(grid=grid, goal=19)
agent = FiniteQ(
    env=env,
    gamma=0.9,
    num_state=20,
    num_action=4,
    epsilon=0.5,
    alpha=0.1,
    init_state=0,
)
# restart() is what positions the environment, so call it before stepping.
agent.restart(0)

num_episode = 20
for i in range(num_episode):
    # Keep stepping until the agent reports the episode as finished.
    while not agent.end:
        agent.go()
    print(
        "After {} steps, total reward got as {}".format(
            agent.step, agent.total_reward
        )
    )
    agent.restart(0)

4,5
action=0, result=0, reward=-2
action=3, result=0, reward=-2
action=3, result=0, reward=-2
action=0, result=0, reward=-2
action=1, result=0, reward=-2
action=1, result=0, reward=-2
action=3, result=0, reward=-2
action=1, result=0, reward=-2
action=1, result=0, reward=-2
action=2, result=5, reward=-1
action=2, result=5, reward=-2
action=1, result=5, reward=-2
action=2, result=5, reward=-2
action=1, result=5, reward=-2
action=1, result=5, reward=-2
action=3, result=6, reward=-1
action=2, result=11, reward=-1
action=2, result=16, reward=-1
action=0, result=11, reward=-1
action=2, result=16, reward=-1
action=0, result=11, reward=-1
action=3, result=11, reward=-2
action=2, result=16, reward=-1
action=3, result=17, reward=-1
action=3, result=18, reward=-1
action=0, result=13, reward=-1
action=1, result=13, reward=-2
action=0, result=13, reward=-2
action=2, result=18, reward=-1
action=2, result=18, reward=-2
action=1, result=17, reward=-1
action=2, result=17, reward=-2
action=0, result=17,