In [52]:
# https://towardsdatascience.com/introduction-to-various-reinforcement-learning-algorithms-i-q-learning-sarsa-dqn-ddpg-72a5e0cb6287

from copy import deepcopy

BOARD_WIDTH = 3
BOARD_HEIGHT = 3

ZOMBIE = 'z'
ICECREAM = 'i'
EMPTY = '_'
CAR = 'c'

UP = 0
DOWN = 1
LEFT = 2
RIGHT = 3
ACTIONS = [UP, DOWN, LEFT, RIGHT]

class State:
    """One configuration of the grid world: board contents plus car position.

    Attributes:
        state: list of BOARD_HEIGHT rows, each a list of BOARD_WIDTH cell
            markers (EMPTY, ZOMBIE, ICECREAM or CAR).
        car_pos: ``[row, col]`` list giving the cell occupied by the car.

    Instances define ``__eq__`` / ``__hash__`` so they can be used as
    dictionary keys; they must not be mutated after being used as a key.
    """

    def __init__(self, state = None, car_pos = None):
        """Build a state and stamp the car marker onto the board.

        Args:
            state: optional board (sequence of rows). When omitted the board
                returned by ``initial_state()`` is used. The argument is
                copied, so the caller's board is never modified.
            car_pos: optional ``(row, col)`` sequence; defaults to ``[2, 2]``.
                The argument is copied into a fresh list.

        Raises:
            AssertionError: if the board does not have BOARD_HEIGHT rows of
                BOARD_WIDTH cells each.
        """
        if state is None:
            self.state = self.initial_state()
        else:
            # Copy row by row: the CAR marker is written into the board below
            # and that write must not leak into the caller's data.
            self.state = [list(row) for row in state]

        assert len(self.state) == BOARD_HEIGHT
        # Check every row, not only the first, so ragged boards are rejected.
        assert all(len(row) == BOARD_WIDTH for row in self.state)

        if car_pos is None:
            self.car_pos = [2, 2]
        else:
            # Fresh list so later edits to the caller's sequence cannot
            # silently change this state's identity (equality / hash).
            self.car_pos = list(car_pos)

        # Whatever was in the target cell is overwritten by the car marker.
        self.state[self.car_pos[0]][self.car_pos[1]] = CAR

    def __repr__(self):
        """Return the board as text: cells separated by spaces, one row per line."""
        return '\n'.join([' '.join(s) for s in self.state])

    def __eq__(self, other):
        """Two states are equal when both board and car position match."""
        if not isinstance(other, State):
            # Let Python fall back to its default handling for foreign types.
            return NotImplemented
        return self.state == other.state and self.car_pos == other.car_pos

    def __hash__(self):
        """Hash over the same data ``__eq__`` compares (board and car position)."""
        return hash((tuple(tuple(row) for row in self.state), tuple(self.car_pos)))

    def initial_state(self):
        """Return a fresh copy of the starting board (without the car marker)."""
        return [
            [EMPTY,  ICECREAM, EMPTY ],
            [EMPTY,  ZOMBIE,   ZOMBIE],
            [EMPTY,  EMPTY,    EMPTY ],
        ]

def act(state, action):
    """Apply ``action`` to ``state`` and return ``(next_state, reward, finished)``.

    The car moves one cell in the requested direction and stays put when the
    move would leave the board. The reward and the ``finished`` flag depend on
    what occupies the destination cell of the *current* board:

    * ZOMBIE   -> reward -100, episode finished
    * ICECREAM -> reward 1000, episode finished
    * CAR      -> reward -1 (the car did not move), not finished
    * EMPTY    -> reward 0, not finished

    ``state`` itself is left untouched; the returned State is built from
    copies of its board and position.

    Raises:
        Exception: for an unrecognised action or an unrecognised cell marker.
    """
    target = deepcopy(state.car_pos)

    # (action, axis in the position list, clamped one-step update)
    moves = (
        (UP,    0, lambda r: max(r - 1, 0)),
        (DOWN,  0, lambda r: min(r + 1, BOARD_HEIGHT - 1)),
        (LEFT,  1, lambda c: max(c - 1, 0)),
        (RIGHT, 1, lambda c: min(c + 1, BOARD_WIDTH - 1)),
    )
    for candidate, axis, step in moves:
        if action == candidate:
            target[axis] = step(target[axis])
            break
    else:
        raise Exception(f"Unknown action {action}")

    item = state.state[target[0]][target[1]]

    # (cell marker, reward, finished)
    outcomes = (
        (ZOMBIE,   -100, True),
        (ICECREAM, 1000, True),
        (CAR,      -1,   False),
        (EMPTY,    0,    False),
    )
    outcome = next(((r, f) for marker, r, f in outcomes if item == marker), None)
    if outcome is None:
        raise Exception(f"Unknown item {item}")
    reward, finished = outcome

    # Clear the car's old cell on a copy; State() stamps the car at `target`.
    board = deepcopy(state.state)
    old_row, old_col = state.car_pos[0], state.car_pos[1]
    board[old_row][old_col] = EMPTY

    return (State(board, target), reward, finished)


# Smoke test: show the starting board, move the car one cell to the left,
# then show the resulting board together with the reward and terminal flag.
s = State()
print(s, end='\n\n')

s, reward, finished = act(s, LEFT)
print(s, end='\n\n')

for value in (reward, finished):
    print(value)

_ i _
_ z z
_ _ c

_ i _
_ z z
_ c _

0
False


In [63]:
import numpy as np
import random

# Seed the stdlib RNG used by the epsilon-greedy policy, for reproducibility.
random.seed(42)

# --- Training schedule -------------------------------------------------------
N_EPISODES = 100          # number of training episodes
MAX_EPISODE_STEPS = 20    # hard cap on steps within a single episode
N_STATES = BOARD_HEIGHT * BOARD_WIDTH  # cell count; not referenced in the visible cells

# --- Q-learning hyper-parameters --------------------------------------------
MIN_ALPHA = 0.02
# One learning rate per episode, decaying linearly from 1.0 down to MIN_ALPHA.
alphas = np.linspace(start=1.0, stop=MIN_ALPHA, num=N_EPISODES)
gamma = 1.0  # discount factor applied to the best next-state value
eps = 0.2    # probability of picking a random action instead of the greedy one

# Maps State -> array holding one Q-value per entry of ACTIONS (filled lazily by q()).
q_table = {}

def q(state, action=None):
    """Look up Q-values for ``state``, creating a zeroed row on first access.

    Args:
        state: hashable key into ``q_table``.
        action: optional index into the row.

    Returns:
        The whole per-action array (the live object stored in ``q_table``, so
        writes through it update the table) when ``action`` is None, otherwise
        the single value stored for ``action``.
    """
    row = q_table.get(state)
    if row is None:
        # First visit: register the state with one zero per available action.
        row = q_table[state] = np.zeros(len(ACTIONS))

    return row if action is None else row[action]
    
def select_action(state):
    """Epsilon-greedy policy over the Q-values of ``state``.

    With probability ``eps`` a uniformly random member of ACTIONS is returned;
    otherwise the index of the largest Q-value for ``state`` (first index on
    ties, as given by ``np.argmax``).
    """
    explore = random.uniform(0, 1) < eps
    if not explore:
        return np.argmax(q(state))
    return random.choice(ACTIONS)

# Tabular Q-learning: run N_EPISODES episodes from the initial state, acting
# epsilon-greedily and updating q_table after every step.
for episode in range(N_EPISODES):
    alpha = alphas[episode]  # learning rate scheduled for this episode

    state = State()
    total_reward = 0  # sum of rewards collected during this episode

    for step in range(MAX_EPISODE_STEPS):
        action = select_action(state)
        (next_state, reward, finished) = act(state, action)
        total_reward += reward

        # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)).
        # Read Q(s, a) first so `state` is registered in q_table before
        # `next_state`, keeping the table's insertion order stable.
        current = q(state, action)
        td_target = reward + gamma * np.max(q(next_state))
        q(state)[action] = current + alpha * (td_target - current)

        # Advance exactly once per step; on a terminal transition `state` is
        # left holding the terminal state.
        state = next_state

        if finished:
            break
            
    

In [70]:
# Show the table built by training: each key is a State (rendered as its
# board) and each value is the array of Q-values, one per entry of ACTIONS.
q_table

{_ i _
 _ z z
 _ _ c: array([-100.        ,  985.06682263, 1000.        ,  997.24436367]),
 _ i _
 _ z c
 _ _ _: array([0., 0., 0., 0.]),
 _ i _
 _ z z
 _ c _: array([ -99.99319012,  930.28312257, 1000.        ,  804.0024769 ]),
 _ i _
 _ c z
 _ _ _: array([0., 0., 0., 0.]),
 _ i _
 _ z z
 c _ _: array([1000.        ,  995.49240845,  734.69157025,  564.19748783]),
 _ i _
 c z z
 _ _ _: array([1000.        ,  451.25599429,  995.84493831,  -99.99490517]),
 c i _
 _ z z
 _ _ _: array([ 989.26600403,  952.06598488,  987.87413408, 1000.        ]),
 _ c _
 _ z z
 _ _ _: array([0., 0., 0., 0.])}

In [69]:
# Greedy rollout: start from the initial state and always take the action with
# the highest learned Q-value, printing every board along the way.
state = State()
finished = False
steps = 0

# Stop when the episode ends, but also when the step budget is exhausted (a
# greedy policy can cycle forever) or when the state has no learned Q-values
# (indexing q_table directly would raise KeyError).
while not finished and steps < MAX_EPISODE_STEPS and state in q_table:
    print(state)
    print('')
    action = np.argmax(q_table[state])
    (state, reward, finished) = act(state, action)
    steps += 1

print(state)

if not finished:
    print('')
    print(f"Rollout stopped after {steps} step(s) without reaching a terminal state.")



_ i _
_ z z
_ _ c

_ i _
_ z z
_ c _

_ i _
_ z z
c _ _

_ i _
c z z
_ _ _

c i _
_ z z
_ _ _

_ c _
_ z z
_ _ _
