In [1]:
class MDP:

    """A Markov Decision Process, defined by an initial state, transition model,
    and reward function. We also keep track of a gamma value, for use by
    algorithms. The transition model is represented somewhat differently from
    the text. Instead of P(s' | s, a) being a probability number for each
    state/state/action triplet, we instead have T(s, a) return a
    list of (p, s') pairs. We also keep track of the possible states,
    terminal states, and actions for each state. [page 646]"""

    def __init__(self, init, actlist, terminals, transitions={}, states=None, gamma=.9):
        if not (0 < gamma <= 1):
            raise ValueError("An MDP must have 0 < gamma <= 1")

        if states:
            self.states = states
        else:
            self.states = set()
        self.init = init
        self.actlist = actlist
        self.terminals = terminals
        self.transitions = transitions
        self.gamma = gamma
        self.reward = {}

    def R(self, state):
        """Return a numeric reward for this state."""
        return self.reward[state]

    def T(self, state, action):
        """Transition model. From a state and an action, return a list
        of (probability, result-state) pairs."""
        if(self.transitions == {}):
            raise ValueError("Transition model is missing")
        else:
            return self.transitions[state][action]

    def actions(self, state):
        """Set of actions that can be performed in this state. By default, a
        fixed list of actions, except for terminal states. Override this
        method if you need to specialize by state."""
        if state in self.terminals:
            return [None]
        else:
            return self.actlist

In [2]:
class CustomMDP(MDP):

    def __init__(self, transition_matrix, rewards, terminals, init, gamma=.9):
        # All possible actions.
        actlist = []
        for state in transition_matrix.keys():
            actlist.extend(transition_matrix[state])
        actlist = list(set(actlist))
        print(actlist)

        MDP.__init__(self, init, actlist, terminals=terminals, gamma=gamma)
        self.t = transition_matrix
        self.reward = rewards
        for state in self.t:
            self.states.add(state)

    def T(self, state, action):
        if action is None:
            return [(0.0, state)]
        else: 
            return [(prob, new_state) for new_state, prob in self.t[state][action].items()]
        
    def R(self, state, action):
        if action is None:
            return

In [5]:
class MDPext:
    
    def __init__(self, init, actlist, terminals, transitions={}, transition_rewards={}, states=None, gamma=.9):
        if not (0 < gamma <= 1):
            raise ValueError("An MDP must have 0 < gamma <= 1")

        if states:
            self.states = states
        else:
            self.states = set()
        self.init = init
        self.actlist = actlist
        self.terminals = terminals
        self.transitions = transitions
        self.transition_rewards = transition_rewards
        self.gamma = gamma
        self.reward = {}

    def R(self, state):
        """Return a numeric reward for this state."""
        return self.reward[state][action]
    
    def T_R(self, state, action):
        """Return a numeric reward for this state and this action"""
        if (self.transition_rewards == {}):
            raise ValueError("Rewards model is missing")
        else:
            return self.transition_rewards[state][action]

    def T(self, state, action):
        """Transition model. From a state and an action, return a list
        of (probability, result-state) pairs."""
        if(self.transitions == {}):
            raise ValueError("Transition model is missing")
        else:
            return self.transitions[state][action]

    def actions(self, state):
        """Set of actions that can be performed in this state. By default, a
        fixed list of actions, except for terminal states. Override this
        method if you need to specialize by state."""
        if state in self.terminals:
            return [None]
        else:
            return self.actlist

In [None]:
r = {
    'leisure': {
        'facebook': 
    }
}

In [6]:
class CustomMDPext(MDPext):
    
    def __init__(self, transition_matrix, transition_rewards, rewards, terminals, init, gamma=.9):
        actlist = []
        for state in transition_matrix.keys():
            actlist.extend(transition_matrix[state])
        actlist = list(set(actlist))
        print(actlist)
        
        MDPext.__init__(self, init, actlist, terminals=terminals, gamma=gamma)
        self.t = transition_matrix
        self.t_r = transition_rewards
        self.reward = rewards
        for state in self.t:
            self.state.add(state)
            
    def T(self, state, action):
        if action is None:
            return[(0.0, state)]
        else:
            return [(prob, new_state) for new_state, prob in self.t[state][action].items()]
        
    def T_R(self, state, action):
        return self.t[state][action]

In [7]:
t = {
    'leisure': {
                    'facebook': {'leisure':0.9, 'class1':0.1},
                    'quit': {'leisure':0.1, 'class1':0.9},
                    'study': {},
                    'sleep': {},
                    'pub': {}
               },
    'class1': {
                    'study': {'class2':0.6, 'leisure':0.4},
                    'facebook': {'class2':0.4, 'leisure':0.6},
                    'quit': {},
                    'sleep': {},
                    'pub': {}
              },
    'class2': {
                    'study': {'class3':0.5, 'end':0.5},
                    'sleep': {'end':0.5, 'class3':0.5},
                    'facebook': {},
                    'quit': {},
                    'pub': {},
              },
    'class3': {
                    'study': {'end':0.6, 'class1':0.08, 'class2':0.16, 'class3':0.16},
                    'pub': {'end':0.4, 'class1':0.12, 'class2':0.24, 'class3':0.24},
                    'facebook': {},
                    'quit': {},
                    'sleep': {}
              },
    'end': {}
}

In [8]:
t_r = {
    'leisure': {
        'facebook':-1,
        'quit':0,
        'study':0,
        'sleep':0,
        'pub':0
    },
    'class1': {
        'study':-2,
        'facebook':-1,
        'quit':0,
        'sleep':0,
        'pub':0
    },
    'class2': {
        'study':-2,
        'sleep':0,
        'facebook':0,
        'quit':0,
        'pub':0
    },
    'class3': {
        'study':10,
        'pub':1,
        'facebook':0,
        'quit':0,
        'sleep':0
    },
    'end': {
        'study':0,
        'pub':0,
        'facebook':0,
        'quit':0,
        'sleep':0
    }
}

In [9]:
rewards = {
    'class1': 4,
    'class2': 6,
    'class3': 10,
    'leisure': -1,
    'end': 0
}

In [10]:
terminals = ['end']

In [11]:
init = 'class1'

In [None]:
value_iteration_modified(mdp, epsilon=0.001):
    U1 = {s: 0 for s in mdp.states}
    T_R, R, T, gamma = mdp.T_R, mdp.R, mdp.T, mdp.gamma
    while True:
        U = U1.copy()
        delta = 0
        for s in mdp.states:
            for a in mdp.actions:

# To do: remove static rewards andonly allow dynamic (transition) rewards                

In [12]:
class dMDP:

    """A Markov Decision Process, defined by an initial state, transition model,
    and reward function. We also keep track of a gamma value, for use by
    algorithms. The transition model is represented somewhat differently from
    the text. Instead of P(s' | s, a) being a probability number for each
    state/state/action triplet, we instead have T(s, a) return a
    list of (p, s') pairs. We also keep track of the possible states,
    terminal states, and actions for each state. [page 646]"""

    def __init__(self, init, actlist, terminals, transitions={}, rewards={}, states=None, gamma=.9):
        if not (0 < gamma <= 1):
            raise ValueError("An MDP must have 0 < gamma <= 1")

        if states:
            self.states = states
        else:
            self.states = set()
        self.init = init
        self.actlist = actlist
        self.terminals = terminals
        self.transitions = transitions
        self.rewards = rewards
        self.gamma = gamma

    def R(self, state, action):
        """Return a numeric reward for this state."""
        if (self.rewards == {}):
            raise ValueError('Reward model is missing')
        else:
            return self.rewards[state][action]

    def T(self, state, action):
        """Transition model. From a state and an action, return a list
        of (probability, result-state) pairs."""
        if(self.transitions == {}):
            raise ValueError("Transition model is missing")
        else:
            return self.transitions[state][action]

    def actions(self, state):
        """Set of actions that can be performed in this state. By default, a
        fixed list of actions, except for terminal states. Override this
        method if you need to specialize by state."""
        if state in self.terminals:
            return [None]
        else:
            return self.actlist

In [13]:
class CustomdMDP(dMDP):
    
    def __init__(self, transition_matrix, rewards, terminals, init, gamma=.9):
        actlist = []
        for state in transition_matrix.keys():
            actlist.extend(transition_matrix[state])
        actlist = list(set(actlist))
        print(actlist)
        
        MDP.__init__(self, init, actlist, terminals=terminals, gamma=gamma)
        self.t = transition_matrix
        self.rewards = rewards
        for state in self.t:
            self.states.add(state)
            
            
    def T(self, state, action):
        if action is None:
            return [(0.0, state)]
        else:
            return [(prob, new_state) for new_state, prob in self.t[state][action].items()]
        
    def R(self, state, action):
        if action is None:
            return 0
        else:
            return self.rewards[state][action]

In [29]:
def value_iteration_dmdp(dmdp, epsilon=0.001):
    U1 = {s: 0 for s in dmdp.states}
    R, T, gamma = dmdp.R, dmdp.T, dmdp.gamma
    while True:
        U = U1.copy()
        delta = 0
        for s in dmdp.states:
            values = []
            for a in dmdp.actions(s):
                for (p, s1) in T(s, a):
                    print(R(s, a))
#                     values.append(sum(p*(R(s, a) + gamma*U(s1))))
#             U1[s] = max(values)
#             delta = max(delta, abs(U1[s] - U[s]))
#         if delta < epsilon * (1 - gamma) / gamma:
#             return U

In [15]:
t = {
    'leisure': {
                    'facebook': {'leisure':0.9, 'class1':0.1},
                    'quit': {'leisure':0.1, 'class1':0.9},
                    'study': {},
                    'sleep': {},
                    'pub': {}
               },
    'class1': {
                    'study': {'class2':0.6, 'leisure':0.4},
                    'facebook': {'class2':0.4, 'leisure':0.6},
                    'quit': {},
                    'sleep': {},
                    'pub': {}
              },
    'class2': {
                    'study': {'class3':0.5, 'end':0.5},
                    'sleep': {'end':0.5, 'class3':0.5},
                    'facebook': {},
                    'quit': {},
                    'pub': {},
              },
    'class3': {
                    'study': {'end':0.6, 'class1':0.08, 'class2':0.16, 'class3':0.16},
                    'pub': {'end':0.4, 'class1':0.12, 'class2':0.24, 'class3':0.24},
                    'facebook': {},
                    'quit': {},
                    'sleep': {}
              },
    'end': {}
}

In [16]:
r = {
    'leisure': {
        'facebook':-1,
        'quit':0,
        'study':0,
        'sleep':0,
        'pub':0
    },
    'class1': {
        'study':-2,
        'facebook':-1,
        'quit':0,
        'sleep':0,
        'pub':0
    },
    'class2': {
        'study':-2,
        'sleep':0,
        'facebook':0,
        'quit':0,
        'pub':0
    },
    'class3': {
        'study':10,
        'pub':1,
        'facebook':0,
        'quit':0,
        'sleep':0
    },
    'end': {
        'study':0,
        'pub':0,
        'facebook':0,
        'quit':0,
        'sleep':0
    }
}

In [17]:
terminals = ['end']

In [19]:
init = 'class1'

In [20]:
dmdp = CustomdMDP(t, r, terminals, init, gamma=.9)

['sleep', 'study', 'quit', 'pub', 'facebook']


In [None]:
value_iteration_dmdp(dmdp)

0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1
1
1
0
0
-2
-2
-2
-2
-1
-1
0
0
0
-1
-1
10
10
10
10
1
1


In [None]:
for s in 