In [6]:
import numpy as np
import sys
from io import StringIO
from typing import Tuple
from gym.envs.toy_text.discrete import DiscreteEnv
import time
from collections import defaultdict

In [2]:
# Environment
# Feel free to play around with different setups

LEFT = 0
UP = 1
RIGHT = 2
DOWN = 3


class GridCore(DiscreteEnv):
    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, shape: Tuple[int] = (5, 10), start: Tuple[int] = (0, 0),
                 goal: Tuple[int] = (0, 9), max_steps: int = 300,
                 percentage_reward: bool = False, no_goal_rew: bool = False):
        try:
            self.shape = self._shape
        except AttributeError:
            self.shape = shape
        self.nS = np.prod(self.shape, dtype=int)  # type: int
        self.nA = 4
        self.start = start
        self.goal = goal
        self.max_steps = max_steps
        self._steps = 0
        self._pr = percentage_reward
        self._no_goal_rew = no_goal_rew
        self.total_steps = 0

        P = self._init_transition_probability()

        # We always start in state (3, 0)
        isd = np.zeros(self.nS)
        isd[np.ravel_multi_index(start, self.shape)] = 1.0

        super(GridCore, self).__init__(self.nS, self.nA, P, isd)

    def step(self, a):
        self._steps += 1
        s, r, d, i = super(GridCore, self).step(a)
        if self._steps >= self.max_steps:
            d = True
            i['early'] = True
        self.total_steps += 1
        return s, r, d, i

    def reset(self):
        self._steps = 0
        return super(GridCore, self).reset()

    def _init_transition_probability(self):
        raise NotImplementedError

    def _check_bounds(self, coord):
        coord[0] = min(coord[0], self.shape[0] - 1)
        coord[0] = max(coord[0], 0)
        coord[1] = min(coord[1], self.shape[1] - 1)
        coord[1] = max(coord[1], 0)
        return coord

    def print_T(self):
        print(self.P[self.s])

    def map_output(self, s, pos):
        if self.s == s:
            output = " x "
        elif pos == self.goal:
            output = " T "
        else:
            output = " o "
        return output

    def map_control_output(self, s, pos):
        return self.map_output(s, pos)

    def map_with_inbetween_goal(self, s, pos, in_between_goal):
        return self.map_output(s, pos)

    def render(self, mode='human', close=False, in_control=None, in_between_goal=None):
        self._render(mode, close, in_control, in_between_goal)

    def _render(self, mode='human', close=False, in_control=None, in_between_goal=None):
        if close:
            return
        outfile = StringIO() if mode == 'ansi' else sys.stdout
        if mode == 'human':
            print('\033[2;0H')

        for s in range(self.nS):
            position = np.unravel_index(s, self.shape)
            # print(self.s)
            if in_control:
                output = self.map_control_output(s, position)
            elif in_between_goal:
                output = self.map_with_inbetween_goal(s, position, in_between_goal)
            else:
                output = self.map_output(s, position)
            if position[1] == 0:
                output = output.lstrip()
            if position[1] == self.shape[1] - 1:
                output = output.rstrip()
                output += "\n"
            outfile.write(output)
        outfile.write("\n")
        if mode == 'human':
            if in_control:
                time.sleep(0.2)
            else:
                time.sleep(0.05)


class FallEnv(GridCore):
    _pits = []

    def __init__(self, **kwargs):
        super(FallEnv, self).__init__(**kwargs)

    def _calculate_transition_prob(self, current, delta, prob):
        transitions = []
        for d, p in zip(delta, prob):
            new_position = np.array(current) + np.array(d)
            new_position = self._check_bounds(new_position).astype(int)
            new_state = np.ravel_multi_index(tuple(new_position), self.shape)
            reward = 0.0
            is_done = False
            if tuple(new_position) == self.goal:
                if self._pr:
                    reward = 1 - (self._steps / self.max_steps)
                elif not self._no_goal_rew:
                    reward = 1.0
                is_done = True
            elif new_state in self._pits:
                reward = -1.
                is_done = True
            transitions.append((p, new_state, reward, is_done))
        return transitions

    def _init_transition_probability(self):
        self.afp = 0.  # todo: hotfix, check with Andre how to properly remove afp
        for idx, p in enumerate(self._pits):
            self._pits[idx] = np.ravel_multi_index(p, self.shape)
        # Calculate transition probabilities
        P = {}
        for s in range(self.nS):
            position = np.unravel_index(s, self.shape)
            P[s] = {a: [] for a in range(self.nA)}
            other_prob = self.afp / 3.

            tmp = [[UP, DOWN, LEFT, RIGHT],
                   [DOWN, LEFT, RIGHT, UP],
                   [LEFT, RIGHT, UP, DOWN],
                   [RIGHT, UP, DOWN, LEFT]]
            tmp_dirs = [[[-1, 0], [1, 0], [0, -1], [0, 1]],
                        [[1, 0], [0, -1], [0, 1], [-1, 0]],
                        [[0, -1], [0, 1], [-1, 0], [1, 0]],
                        [[0, 1], [-1, 0], [1, 0], [0, -1]]]
            tmp_pros = [[1 - self.afp, other_prob, other_prob, other_prob],
                        [1 - self.afp, other_prob, other_prob, other_prob],
                        [1 - self.afp, other_prob, other_prob, other_prob],
                        [1 - self.afp, other_prob, other_prob, other_prob], ]
            for acts, dirs, probs in zip(tmp, tmp_dirs, tmp_pros):
                P[s][acts[0]] = self._calculate_transition_prob(position, dirs, probs)
        return P

    def map_output(self, s, pos):
        if self.s == s:
            output = " \u001b[33m*\u001b[37m "
        elif pos == self.goal:
            output = " \u001b[37mX\u001b[37m "
        elif s in self._pits:
            output = " \u001b[31m.\u001b[37m "
        else:
            output = " \u001b[30mo\u001b[37m "
        return output

    def map_control_output(self, s, pos):
        if self.s == s:
            return " \u001b[34m*\u001b[37m "
        else:
            return self.map_output(s, pos)

    def map_with_inbetween_goal(self, s, pos, in_between_goal):
        if s == in_between_goal:
            return " \u001b[34mx\u001b[37m "
        else:
            return self.map_output(s, pos)


class Bridge6x10Env(FallEnv):
    _pits = [[0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7],
             [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7],
             [4, 2], [4, 3], [4, 4], [4, 5], [4, 6], [4, 7],
             [5, 2], [5, 3], [5, 4], [5, 5], [5, 6], [5, 7]]
    _shape = (6, 10)


class Pit6x10Env(FallEnv):
    _pits = [[0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7],
             [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7],
             [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7]]
             # [3, 2], [3, 3], [3, 4], [3, 5], [3, 6], [3, 7]]
    _shape = (6, 10)


class ZigZag6x10(FallEnv):
    _pits = [[0, 2], [0, 3],
             [1, 2], [1, 3],
             [2, 2], [2, 3],
             [3, 2], [3, 3],
             [5, 7], [5, 6],
             [4, 7], [4, 6],
             [3, 7], [3, 6],
             [2, 7], [2, 6],
             ]
    _shape = (6, 10)


class ZigZag6x10H(FallEnv):
    _pits = [[0, 2], [0, 3],
             [1, 2], [1, 3],
             [2, 2], [2, 3],
             [3, 2], [3, 3],
             [5, 7], [5, 6],
             [4, 7], [4, 6],
             [3, 7], [3, 6],
             [2, 7], [2, 6],
             [4, 4], [5, 2]
             ]
    _shape = (6, 10)


In [3]:
# This can be solved in other ways, as seen in the previous week
# Creating a function as policy, however, can be easier when using e.g. an epsilon greedy approach
def make_policy_function(q_function, epsilon, num_actions):
    
    def policy_fn(observation):
        policy = np.ones(num_actions) * epsilon / num_actions
        # random choice is used for tie breaking
        best_action = np.random.choice(np.flatnonzero(q_function[observation] == q_function[observation].max()))
        policy[best_action] += (1 - epsilon)
        return policy

    return policy_fn

In [4]:
# One update to the Q function
def td_update(q_function, state, action, reward, next_state, done, action_, gamma=0.9, alpha=0.1):
    td_target = reward + gamma * q_function[next_state][action_]
    if not done:
        td_delta = td_target - q_function[state][action]
    else:
        td_delta = td_target
    return q_function[state][action] + alpha * td_delta

In [67]:
environment = FallEnv()

In [68]:
# Empty Q function as dictionary
Q = defaultdict(lambda: np.zeros(environment.action_space.n))

In [69]:
def run_episode(env, q_function, verbose=False):
    policy = make_policy_function(q_function, 0.1, env.action_space.n)
    state = env.reset()
    done = False
    reward = 0
    steps = 0
    # Choose an action with probabilities given by the policy
    # This could also be done by the policy function if you wanted
    action = np.random.choice(list(range(env.action_space.n)), p=policy(state))
    while not done:
        next_state, r, done, _ = env.step(action)
        next_action = np.random.choice(list(range(env.action_space.n)), p=policy(next_state))
        reward +=r
        old_q = q_function[state][action]
        # Update Q function
        q_function[state][action] = td_update(Q, state, action, r, next_state, done, next_action)
        new_q = q_function[state][action]
        if verbose:
            env.render()
            print(f"Reward is {r}")
            print(f"Q value updated by {new_q-old_q}")
            print(f"New Q value is {new_q}")
        state = next_state
        action = next_action
        steps += 1
    print(f"Episode finished in {steps} steps, reward is {reward}")

In [70]:
run_episode(environment, Q, verbose=True)

[2;0H
[33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

In [71]:
for _ in range(10):
    run_episode(environment, Q)

Episode finished in 300 steps, reward is 0.0
Episode finished in 300 steps, reward is 0.0
Episode finished in 66 steps, reward is 1.0
Episode finished in 132 steps, reward is 1.0
Episode finished in 116 steps, reward is 1.0
Episode finished in 77 steps, reward is 1.0
Episode finished in 30 steps, reward is 1.0
Episode finished in 57 steps, reward is 1.0
Episode finished in 156 steps, reward is 1.0
Episode finished in 70 steps, reward is 1.0


In [72]:
run_episode(environment, Q, verbose=True)

[2;0H
[33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.0
New Q value is 0.0
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[

Reward is 1.0
Q value updated by 0.09999999999999998
New Q value is 0.9999999999999999
Episode finished in 24 steps, reward is 1.0


In [73]:
for _ in range(10):
    run_episode(environment, Q)

Episode finished in 23 steps, reward is 1.0
Episode finished in 15 steps, reward is 1.0
Episode finished in 88 steps, reward is 1.0
Episode finished in 43 steps, reward is 1.0
Episode finished in 12 steps, reward is 1.0
Episode finished in 11 steps, reward is 1.0
Episode finished in 11 steps, reward is 1.0
Episode finished in 14 steps, reward is 1.0
Episode finished in 13 steps, reward is 1.0
Episode finished in 13 steps, reward is 1.0


In [74]:
run_episode(environment, Q, verbose=True)

[2;0H
[30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 1.1093898829602157e-07
New Q value is 2.057235972935066e-07
[2;0H
[30mo[37m  [30mo[37m  [33m*[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  

Reward is 0.0
Q value updated by 0.009639000000000002
New Q value is 0.009639000000000002
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [33m*[37m  [37mX[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m

Reward is 0.0
Q value updated by 0.16929000000000002
New Q value is 0.27639
[2;0H
[30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [30mo[37m  [33m*[37m
[30mo[37m  [30mo[37m  [30mo[37m  [30mo