# In [None]:
import numpy as np

# Markov Process Representation
class MP:
    """Markov Process: a collection of states plus a transition table.

    Attributes:
        state: Collection of states (described as a set by the original
            comments); only membership tests and iteration are used.
        transition: Dictionary of dictionary of float, where
            ``transition[s][s_prime]`` is P[S_t+1 = s' | S_t = s].
    """

    def __init__(self, state, transition):
        """Store the state collection and the transition table as given.

        Args:
            state: Collection of states.
            transition: Dictionary of dictionary of float. It is stored by
                reference, so ``set_transition`` mutates the caller's object.
        """
        self.state = state
        self.transition = transition

    def _check_states(self, s, s_prime):
        """Raise AssertionError('Wrong states') unless both are known states."""
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped when Python runs with -O; the exception type and message
        # are the ones the original assert statement would have produced.
        if s not in self.state or s_prime not in self.state:
            raise AssertionError('Wrong states')

    def set_transition(self, s, s_prime, p_ss_prime):
        """Set P[s][s'] = P[S_t+1 = s' | S_t = s].

        Args:
            s: Source state.
            s_prime: Destination state.
            p_ss_prime: Transition probability to store.

        Raises:
            AssertionError: If ``s`` or ``s_prime`` is not in ``self.state``.
        """
        self._check_states(s, s_prime)
        # Create the row on demand so a sparsely initialised table can be
        # filled in without a KeyError.
        self.transition.setdefault(s, {})[s_prime] = p_ss_prime

    def get_transition(self, s, s_prime):
        """Return the stored transition probability from ``s`` to ``s_prime``.

        Raises:
            AssertionError: If ``s`` or ``s_prime`` is not in ``self.state``.
            KeyError: If no probability has been stored for the pair.
        """
        self._check_states(s, s_prime)
        return self.transition[s][s_prime]

    def validation(self, tol=1e-9):
        """Check that every row of the transition table sums to 1.

        Args:
            tol: Absolute tolerance on each row sum. Floating-point addition
                is inexact (ten entries of 0.1 sum to 0.9999999999999999),
                so an exact ``== 1`` comparison would reject valid tables.

        Raises:
            AssertionError: If any row sum differs from 1 by more than ``tol``.
        """
        for s in self.state:
            # Missing rows or entries count as probability 0, so an incomplete
            # table is reported as invalid instead of raising KeyError.
            row = self.transition.get(s, {})
            p_s = sum(row.get(s_prime, 0.0) for s_prime in self.state)
            if abs(p_s - 1.0) > tol:
                raise AssertionError('Invalid transition probability matrix')

# In [None]:
# Markov Reward Process
# Add on data structures based on MP
class MRP(MP):
    """Markov Reward Process: an MP extended with rewards and a discount rate.

    Attributes:
        R: Dictionary of dictionary of float; ``R[s][s_prime]`` is the reward
            attached to the transition from ``s`` to ``s_prime``.
        gamma: Discount rate (float). It is only stored by this class.
        R_s: Dictionary filled by ``compute_reward``;
            ``R_s[s] = E[R_t+1 | S_t = s]``.
    """

    def __init__(self, state, transition, R, gamma):
        """Initialise the underlying MP and store the reward data.

        Args:
            state: Collection of states.
            transition: Dictionary of dictionary of float.
            R: Dictionary of dictionary of float (reward per transition).
            gamma: Discount rate.
        """
        # ``super`` must be called to obtain the bound proxy; the bare
        # ``super.__init__`` addresses the ``super`` type itself.
        super().__init__(state, transition)
        self.R = R
        self.gamma = gamma
        self.R_s = {}

    def set_reward(self, s, s_prime, r_ss_prime):
        """Set the reward R[s][s'].

        Raises:
            AssertionError: If ``s`` or ``s_prime`` is not in ``self.state``.
        """
        # Raised explicitly (not via ``assert``) so the check survives -O.
        if s not in self.state or s_prime not in self.state:
            raise AssertionError('Wrong states')
        # Create the row on demand so a sparse reward table can be filled in.
        self.R.setdefault(s, {})[s_prime] = r_ss_prime

    def get_reward(self, s, s_prime):
        """Return the reward R[s][s'].

        Raises:
            AssertionError: If ``s`` or ``s_prime`` is not in ``self.state``.
            KeyError: If no reward has been stored for the pair.
        """
        if s not in self.state or s_prime not in self.state:
            raise AssertionError('Wrong states')
        return self.R[s][s_prime]

    def compute_reward(self):
        """Fill ``self.R_s`` with the expected reward for each state.

        For each state s: R_s = sum over s' of P[s][s'] * R[s][s'],
        i.e. E[R_t+1 | S_t = s]. Every (s, s') pair must have both a
        transition probability and a reward stored.
        """
        for s in self.state:
            self.R_s[s] = sum(
                self.get_transition(s, s_prime) * self.get_reward(s, s_prime)
                for s_prime in self.state
            )

# In [None]:
# Markov Decision Process
# Add on data structures based on MRP
class MDP(MRP):
    """Markov Decision Process: an MRP extended with actions.

    Attributes:
        action: Collection of actions.
        reward: Nested dict ``reward[s][a][s_prime]`` written by
            ``set_reward_action``.
        t_s_a: Nested dict ``t_s_a[s][a][s_prime]`` written by ``set_t_s_a``.
        R_s_a: Nested dict ``R_s_a[s][a]`` filled by ``compute_R_s_a``.
    """

    def __init__(self, state, action, transition, R, gamma):
        """Initialise the underlying MRP and the per-action tables.

        Args:
            state: Collection of states.
            action: Collection of actions.
            transition: Dictionary of dictionary of float, read through
                ``get_transition(s, s_prime)``.
            R: Dictionary of dictionary of float, read through
                ``get_reward(s, s_prime)``.
            gamma: Discount rate.
        """
        super().__init__(state, transition, R, gamma)
        self.action = action
        # Plain empty dicts: ``{{}}`` is a set display holding a dict and
        # raises TypeError (dicts are unhashable). Inner levels are created
        # on demand by the setters.
        self.reward = {}
        self.R_s_a = {}
        self.t_s_a = {}

    def _check_state_action(self, s, a, s_prime):
        """Raise AssertionError unless the states and the action are known."""
        # Raised explicitly (not via ``assert``) so the check survives -O.
        if (s not in self.state or s_prime not in self.state
                or a not in self.action):
            raise AssertionError('Wrong states or action')

    def set_t_s_a(self, s, a, s_prime, pi):
        """Store the action-conditioned transition entry for (s, a, s').

        The stored value is ``pi * get_transition(s, s_prime)``.

        Args:
            s: Source state.
            a: Action. The original signature omitted it although the body
                indexed by it; it is placed second to match the
                ``get_t_s_a(s, a, s_prime)`` call in ``compute_R_s_a``.
            s_prime: Destination state.
            pi: Weight multiplied into the base transition probability
                (presumably the policy probability of ``a`` in ``s`` --
                TODO confirm).

        Raises:
            AssertionError: If a state or the action is unknown.
        """
        self._check_state_action(s, a, s_prime)
        row = self.t_s_a.setdefault(s, {}).setdefault(a, {})
        row[s_prime] = pi * self.get_transition(s, s_prime)

    def get_t_s_a(self, s, a, s_prime):
        """Return the entry stored by ``set_t_s_a`` for (s, a, s').

        Raises:
            AssertionError: If a state or the action is unknown.
            KeyError: If no entry has been stored for the triple.
        """
        self._check_state_action(s, a, s_prime)
        # Read the table the setter writes; ``self.transition`` only has two
        # levels (s, s') and cannot be indexed by action.
        return self.t_s_a[s][a][s_prime]

    def set_reward_action(self, s, action, s_prime, pi):
        """Set reward[s][a][s'] to ``pi * get_reward(s, s_prime)``.

        Args:
            s: Source state.
            action: The single action being set (checked against
                ``self.action``).
            s_prime: Destination state.
            pi: Weight multiplied into the base reward.

        Raises:
            AssertionError: If a state or the action is unknown.
        """
        self._check_state_action(s, action, s_prime)
        row = self.reward.setdefault(s, {}).setdefault(action, {})
        row[s_prime] = pi * self.get_reward(s, s_prime)

    def get_reward_action(self, s, action, s_prime):
        """Return reward[s][a][s'] as stored by ``set_reward_action``.

        Raises:
            AssertionError: If a state or the action is unknown.
            KeyError: If no reward has been stored for the triple.
        """
        self._check_state_action(s, action, s_prime)
        return self.reward[s][action][s_prime]

    def compute_R_s_a(self):
        """Fill ``self.R_s_a`` with the expected reward per (state, action).

        R(s, a) = sum over s' of p(s, s', a) * r(s, s', a), using the values
        stored through ``set_t_s_a`` and ``set_reward_action``; every
        (s, a, s') triple must have been set beforehand.
        """
        for s in self.state:
            per_action = self.R_s_a.setdefault(s, {})
            for a in self.action:
                # The action-conditioned reward getter is required here:
                # ``get_reward`` only accepts (s, s_prime).
                per_action[a] = sum(
                    self.get_t_s_a(s, a, s_prime)
                    * self.get_reward_action(s, a, s_prime)
                    for s_prime in self.state
                )