In [1]:
from mdp import *
from notebook import psource, pseudocode

In [2]:
psource(MDP)

In [3]:
t = {
    'S0': {
        'a0': {'S0':0.7, 'S1':0.3},
        'a1': {'S0':1.0},
        'a2': {'S0':0.8, 'S1':0.2}
    },
    'S1': {
        'a0': {'S1':1.0},
        'a2': {'S2':1.0}
    },
    'S2': {
        'a1': {'S0':0.8, 'S1':0.1, 'S2':0.1}
    }
}

In [4]:
init = 'S0'
terminals = None
rewards = {
    'S0': 10,
    'S1': -2,
    'S2': 20
}

In [5]:
class CustomMDP(MDP):
    
    def __init__ (self, transition_matrix, rewards, terminals, init, gamma=0.9):
        # all possible actions
        actlist = []
        for state in transition_matrix.keys():
            actlist.extend(transition_matrix[state])
        actlist = list(set(actlist))
        
        MDP.__init__(self, init, actlist, terminals=terminals, gamma=gamma)
        self.t = transition_matrix
        self.reward = rewards
        for state in self.t:
            self.states.add(state)
    
    def T(self, state, action):
        if action is None:
            return [(0.0, state)]
        else:
            return [(prob, new_state) for new_state, prob in self.t[state][action].items()]

In [6]:
mdp = CustomMDP(t, rewards, terminals, init, gamma=.9)

In [7]:
mdp.actlist

['a2', 'a0', 'a1']

In [8]:
mdp.reward

{'S0': 10, 'S1': -2, 'S2': 20}

In [9]:
mdp.t

{'S0': {'a0': {'S0': 0.7, 'S1': 0.3},
  'a1': {'S0': 1.0},
  'a2': {'S0': 0.8, 'S1': 0.2}},
 'S1': {'a0': {'S1': 1.0}, 'a2': {'S2': 1.0}},
 'S2': {'a1': {'S0': 0.8, 'S1': 0.1, 'S2': 0.1}}}

In [10]:
mdp.states

{'S0', 'S1', 'S2'}

In [11]:
mdp.terminals

In [12]:
mdp.T('S0', 'a1')

[(1.0, 'S0')]

In [13]:
mdp.T('S0', None)

[(0.0, 'S0')]

In [14]:
# GridMDP
psource(GridMDP)

In [15]:
sequential_decision_environment

<mdp.GridMDP at 0x15c69e13710>

In [16]:
psource(value_iteration)

In [17]:
value_iteration(sequential_decision_environment)

{(0, 0): 0.2962883154554812,
 (0, 1): 0.3984432178350045,
 (0, 2): 0.5093943765842497,
 (1, 0): 0.25386699846479516,
 (1, 2): 0.649585681261095,
 (2, 0): 0.3447542300124158,
 (2, 1): 0.48644001739269643,
 (2, 2): 0.7953620878466678,
 (3, 0): 0.12987274656746342,
 (3, 1): -1.0,
 (3, 2): 1.0}

In [18]:
pseudocode('Value-Iteration')

URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

In [19]:
def value_iteration_instru(mdp, iterations=20):
    U_over_time = []
    U1 = {s: 0 for s in mdp.states}
    R, T, gamma = mdp.R, mdp.T, mdp.gamma
    for _ in range(iterations):
        U = U1.copy()
        for s in mdp.states:
            U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])
                                        for a in mdp.actions(s)])
        U_over_time.append(U)
    return U_over_time

In [20]:
value_iteration_instru(sequential_decision_environment)

[{(0, 0): 0,
  (0, 1): 0,
  (0, 2): 0,
  (1, 0): 0,
  (1, 2): 0,
  (2, 0): 0,
  (2, 1): 0,
  (2, 2): 0,
  (3, 0): 0,
  (3, 1): 0,
  (3, 2): 0},
 {(0, 0): -0.04,
  (0, 1): -0.04,
  (0, 2): -0.04,
  (1, 0): -0.04,
  (1, 2): -0.04,
  (2, 0): -0.04,
  (2, 1): -0.04,
  (2, 2): -0.04,
  (3, 0): -0.04,
  (3, 1): -1.0,
  (3, 2): 1.0},
 {(0, 0): -0.07600000000000001,
  (0, 1): -0.07600000000000001,
  (0, 2): -0.07600000000000001,
  (1, 0): -0.07600000000000001,
  (1, 2): -0.07600000000000001,
  (2, 0): -0.07600000000000001,
  (2, 1): -0.07600000000000001,
  (2, 2): 0.6728000000000001,
  (3, 0): -0.07600000000000001,
  (3, 1): -1.0,
  (3, 2): 1.0},
 {(0, 0): -0.10840000000000002,
  (0, 1): -0.10840000000000002,
  (0, 2): -0.10840000000000002,
  (1, 0): -0.10840000000000002,
  (1, 2): 0.430736,
  (2, 0): -0.10840000000000002,
  (2, 1): 0.3475760000000001,
  (2, 2): 0.733712,
  (3, 0): -0.10840000000000002,
  (3, 1): -1.0,
  (3, 2): 1.0},
 {(0, 0): -0.13756000000000002,
  (0, 1): -0.13756000000000