In [1]:
from copy import deepcopy
import numpy as np
import random

In [4]:
# Markers stored in the grid cells.
CAT = "c"        # the agent
DOG = "d"        # entering this cell ends the episode with a penalty
MOUSE = "m"      # entering this cell ends the episode with a reward
EMPTY = "emp"    # free cell

# Initial 2x2 layout, indexed as field[row][column]; the cat starts bottom-right.
field = [
    [MOUSE, EMPTY],
    [DOG, CAT],
]

def showField():
    """Print the module-level ``field`` grid, one row per line, cells separated by spaces."""
    for line in map(" ".join, field):
        print(line)

In [5]:
# Print the initial grid layout (one line per row of `field`).
showField()

m emp
d c


**Why is the `__name__` variable used?**

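The `__name__` variable (two underscores before and after) is a special Python variable. Its value depends on how the containing script is executed: it is `"__main__"` when the script is run directly, and the module's own name when the script is imported.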

Sometimes you write a script with functions that might be useful in other scripts as well. In Python, you can import that script as a module in another script.

Thanks to this special variable, you can decide whether code should run only when the script is executed directly, or be skipped when the script is merely imported for the functions it defines.

In [21]:
class State:
    """Snapshot of the grid world: the board layout plus the cat's position.

    Instances serve as keys of the Q-table dictionary, so ``__eq__`` and
    ``__hash__`` are both derived from the same two attributes.
    """

    def __init__(self, field, catP):
        self.field = field  # 2-D list of cell markers, indexed field[row][column]
        self.catP = catP    # cat position as [row, column]

    def __eq__(self, other):
        """States are equal when both the board and the cat position match."""
        return isinstance(other, State) and self.field == other.field and self.catP == other.catP

    def __hash__(self):
        # The attributes are lists (unhashable), so hash their textual form instead.
        return hash(str(self.field) + str(self.catP))

    def __str__(self):
        # Closing parenthesis and the "field" label were broken in the original output.
        return f"State(field={self.field}, cat_pos={self.catP})"

# Action codes; each one is also the index of that action in a state's Q-value array.
UP, DOWN, LEFT, RIGHT = range(4)

ACTIONS = [UP, DOWN, LEFT, RIGHT]

# Starting state: the initial grid with the cat in the bottom-right cell.
iniState = State(field=field, catP=[1, 1])

# Quick sanity checks of State: hashing, comparison with a foreign type, copying.
print(hash(iniState))
print(iniState == 5)
newField = deepcopy(iniState.field)
print(newField)
print(iniState.catP)

-4512076517437852602
False
[['m', 'emp'], ['d', 'c']]
[1, 1]


In [18]:
# Environment step: apply one action to a state
def act(state, action):
    """Move the cat one step and report what happened.

    Args:
        state: current State; it is not modified (position and field are copied).
        action: one of UP, DOWN, LEFT, RIGHT.

    Returns:
        Tuple ``(next_state, reward, isDone)``:
        dog cell   -> reward -100, episode over;
        mouse cell -> reward 1000, episode over;
        empty cell -> reward -1, the cat moves there;
        cat's own cell (move blocked by the border) -> reward -1, nothing moves.

    Raises:
        ValueError: for an unknown action or an unknown cell marker.
    """

    def newCatPosition(state, action):
        """Return the cat's [row, column] after ``action``, clamped to the grid."""
        p = deepcopy(state.catP)
        lastRow = len(state.field) - 1
        # The column bound comes from the row's own width, not from the number
        # of rows, so non-square grids are clamped correctly.
        lastCol = len(state.field[p[0]]) - 1
        if action == UP:
            p[0] = max(0, p[0] - 1)
        elif action == DOWN:
            p[0] = min(lastRow, p[0] + 1)
        elif action == LEFT:
            p[1] = max(0, p[1] - 1)
        elif action == RIGHT:
            p[1] = min(lastCol, p[1] + 1)
        else:
            raise ValueError(f"Unknown action {action}")

        return p


    p = newCatPosition(state, action)
    fieldItem = state.field[p[0]][p[1]]  # what occupies the cell the cat wants to enter
    newField = deepcopy(state.field)

    if fieldItem == DOG:
        reward = -100
        isDone = True  # The dog has killed the cat
        # Terminal cell is marked with both occupants; the cat's old cell is left as it was.
        newField[p[0]][p[1]] += CAT
    elif fieldItem == MOUSE:
        reward = 1000
        isDone = True  # The cat has eaten the mouse
        newField[p[0]][p[1]] += CAT
    elif fieldItem == EMPTY:
        reward = -1
        isDone = False  # Ordinary move: vacate the old cell, occupy the new one
        old = state.catP
        newField[old[0]][old[1]] = EMPTY
        newField[p[0]][p[1]] = CAT
    elif fieldItem == CAT:
        # The clamped position equals the current one: the cat bumped into the border.
        reward = -1
        isDone = False
    else:
        raise ValueError(f"Unknown field item {fieldItem}")

    return State(field=newField, catP=p), reward, isDone

In [22]:
# Just an example of how np.linspace works: 20 evenly spaced values from 1.0 down to 0.02,
# both endpoints included. Literal arguments keep this cell runnable on a fresh kernel,
# because MIN_ALPHA and N_EPISODES are only defined in a later cell.
alphas = np.linspace(1.0, 0.02, 20)
alphas

array([1.        , 0.94842105, 0.89684211, 0.84526316, 0.79368421,
       0.74210526, 0.69052632, 0.63894737, 0.58736842, 0.53578947,
       0.48421053, 0.43263158, 0.38105263, 0.32947368, 0.27789474,
       0.22631579, 0.17473684, 0.12315789, 0.07157895, 0.02      ])

In [23]:
random.seed(42)  # for reproducibility

# Training schedule
N_STATES = 4             # not referenced by the code visible in this notebook
N_EPISODES = 20          # number of training episodes
MAX_EPISODE_STEPS = 100  # hard cap on steps within one episode

# Learning rate: one value per episode, decaying linearly from 1.0 to MIN_ALPHA
MIN_ALPHA = 0.02
alphas = np.linspace(start=1.0, stop=MIN_ALPHA, num=N_EPISODES)

gamma = 1.0  # discount applied to the best next-state Q-value
eps = 0.2    # probability of picking a random action

# Q-table: maps a State to an array holding one Q-value per action
q_table = {}

In [24]:
def bellmann(state, action = None):
    """Look up Q-values in ``q_table``, creating an all-zero row for unseen states.

    The state itself is the dictionary key; the value is an array with one
    entry per element of ACTIONS.

    Returns the whole (mutable) array when ``action`` is None, otherwise the
    single Q-value stored for ``action``.
    """
    q_values = q_table.get(state)
    if q_values is None:
        # First visit: register the state with a zero estimate for every action.
        q_values = q_table[state] = np.zeros(len(ACTIONS))
    return q_values if action is None else q_values[action]

def selectAction(state):
    """Pick the next action for ``state`` with an epsilon-greedy rule.

    With probability ``eps`` a random action is returned, so the agent keeps
    exploring; otherwise the action with the highest current Q-value is used.
    """
    explore = random.uniform(0, 1) < eps
    if explore:
        return random.choice(ACTIONS)
    return np.argmax(bellmann(state))

In [19]:

# Q-values of the start state before any learning (this also registers it in the Q-table).
r = bellmann(iniState)
print(f"up={r[UP]}, down={r[DOWN]}, left={r[LEFT]}, right={r[RIGHT]}")


# Q-learning: every episode restarts from iniState and uses its own learning rate.
for e in range(N_EPISODES):
    state = iniState
    total_reward = 0
    alpha = alphas[e]
    counter = 0
    for _ in range(MAX_EPISODE_STEPS):
        action = selectAction(state)
        next_state, reward, done = act(state, action)
        total_reward += reward

        # Move Q(state, action) towards reward + gamma * max Q(next_state, .)
        q_row = bellmann(state)
        best_next = np.max(bellmann(next_state))
        td_error = reward + gamma * best_next - q_row[action]
        q_row[action] = q_row[action] + alpha * td_error

        state = next_state
        counter += 1
        if done:
            break
    print(f"Episode {e + 1}: total reward -> {total_reward}, steps: {counter}")

up=0.0, down=0.0, left=0.0, right=0.0
Episode 1: total reward -> 999, steps: 2
Episode 2: total reward -> 998, steps: 3
Episode 3: total reward -> 997, steps: 4
Episode 4: total reward -> 997, steps: 4
Episode 5: total reward -> 999, steps: 2
Episode 6: total reward -> 999, steps: 2
Episode 7: total reward -> 998, steps: 3
Episode 8: total reward -> -100, steps: 1
Episode 9: total reward -> -101, steps: 2
Episode 10: total reward -> 999, steps: 2
Episode 11: total reward -> 999, steps: 2
Episode 12: total reward -> 999, steps: 2
Episode 13: total reward -> 999, steps: 2
Episode 14: total reward -> 999, steps: 2
Episode 15: total reward -> 999, steps: 2
Episode 16: total reward -> 998, steps: 3
Episode 17: total reward -> 999, steps: 2
Episode 18: total reward -> 999, steps: 2
Episode 19: total reward -> 999, steps: 2
Episode 20: total reward -> 999, steps: 2
