In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [28]:
class MDP():
    """Randomly generated finite Markov decision process.

    Attributes:
        n_s: Number of states.
        n_a: Number of actions.
        p: Transition probabilities, array of shape ``(n_s, n_a, n_s)`` where
            ``p[s, a, s_next]`` is the probability of moving to ``s_next``.
        r: Rewards, array of shape ``(n_s, n_a)`` indexed by ``[s, a]``.
        state_space: ``np.arange(n_s)``.
        action_space: ``np.arange(n_a)``.
        state: Current state index, or ``None`` until ``reset()`` is called.
        alpha: Factor stored for the solvers, which multiply the expected
            next-state value by it (i.e. it acts as the discount factor).

    All randomness comes from the global ``np.random`` state, so seed it
    before constructing the object if reproducible dynamics are needed.
    """

    def __init__(self, n_s, n_a, alpha):
        """Draw random dynamics and rewards for ``n_s`` states, ``n_a`` actions."""
        self.n_a = n_a
        self.n_s = n_s
        # Dynamics are drawn before rewards; keep this order so a given seed
        # keeps producing the same pair of draws.
        self.p = self.trasition_prob_matrix()
        self.r = self.reward_distribution(0, 1)
        self.state_space = np.arange(n_s)
        self.action_space = np.arange(n_a)
        self.state = None
        self.alpha = alpha

    # NOTE: the method name is misspelled ("trasition"); it is kept as-is so
    # that existing callers keep working.
    def trasition_prob_matrix(self):
        """Build a random transition tensor of shape ``(n_s, n_a, n_s)``.

        Each row ``tpm[s, a]`` is a randomly weighted sum of ``k = 4``
        Gaussian-shaped bumps evaluated on ``n_s`` evenly spaced grid points
        in ``[0, 1)``, then normalised to sum to one. Entries are floored at
        0.001 and renormalised, so every transition keeps a strictly positive
        probability.

        Returns:
            ``np.ndarray`` of shape ``(n_s, n_a, n_s)`` whose last axis sums to 1.
        """
        k = 4
        # One positive weight per (state, action, bump).
        weights = np.random.uniform(1, 1000, (self.n_s, self.n_a, k))
        # linspace guarantees exactly n_s grid points; arange with the float
        # step 1/n_s can gain or lose a point through rounding.
        grid = np.linspace(0, 1, self.n_s, endpoint=False)
        centers = np.arange(k) + 0.5
        # basis[j, i] = exp(-(k * grid[i] - (j + 0.5))**2), shape (k, n_s).
        basis = np.exp(-(k * grid[np.newaxis, :] - centers[:, np.newaxis]) ** 2)
        # (n_s, n_a, k) @ (k, n_s) -> (n_s, n_a, n_s)
        tpm = weights @ basis
        tpm /= tpm.sum(axis=-1, keepdims=True)
        tpm = np.clip(tpm, 0.001, 1)
        tpm /= tpm.sum(axis=-1, keepdims=True)
        return tpm

    def reward_distribution(self, r_min, r_max):
        """Draw a reward for every (state, action) pair uniformly from ``[r_min, r_max)``.

        Returns:
            ``np.ndarray`` of shape ``(n_s, n_a)``.
        """
        return np.random.uniform(r_min, r_max, size=(self.n_s, self.n_a))

    def reset(self):
        """Set the current state to a uniformly random state and return it."""
        self.state = np.random.randint(self.n_s)
        return self.state

    def step(self, action):
        """Apply ``action`` in the current state and sample the next state.

        Args:
            action: Integer action index in ``[0, n_a)``.

        Returns:
            Tuple ``(next_state, reward)`` where ``reward`` is the scalar
            ``r[s, action]`` for the state ``s`` the action was taken in.

        Raises:
            ValueError: If called before ``reset()`` (no current state).
        """
        if self.state is None:
            raise ValueError("MDP.step() called before reset(): no current state")
        s = self.state
        self.state = np.random.choice(self.state_space, p=self.p[s, action])
        # Rewards are indexed by (state, action); returning r[s] alone would
        # hand back the rewards of every action at once.
        self.reward = self.r[s, action]
        return self.state, self.reward

    def sample_action(self):
        """Return a uniformly random action index in ``[0, n_a)``."""
        # Use this instance's action count, not a notebook-level global.
        return np.random.randint(self.n_a)

In [29]:
# Problem configuration: discount factor and the sizes of the state/action sets.
alpha = 0.95
n_s = 10
n_a = 8
# Build one random MDP instance; its dynamics and rewards are drawn inside
# MDP.__init__ from the global np.random state.
# NOTE(review): no random seed is set in the visible cells, so M (and every
# number printed below) changes from run to run - consider seeding first.
M = MDP(n_s, n_a, alpha)

In [31]:
def value_iteration(MDP, tol=1e-6):
    """Solve an MDP by synchronous value iteration.

    Repeats the Bellman optimality backup
    ``V[s] <- max_a ( r[s, a] + alpha * sum_s' p[s, a, s'] * V_old[s'] )``
    until no component of ``V`` changes by ``tol`` or more in one sweep.

    Note that the stopping rule bounds the change per sweep, not the distance
    to the fixed point, so the returned values can still be off by roughly
    ``tol * alpha / (1 - alpha)``.

    Args:
        MDP: Object exposing ``n_s``, ``n_a``, ``alpha``, ``r`` (shape
            ``(n_s, n_a)``) and ``p`` (shape ``(n_s, n_a, n_s)``). The
            parameter name shadows the ``MDP`` class inside this function; it
            is kept for backward compatibility with keyword callers.
        tol: Per-sweep convergence threshold on ``|V - V_old|``.

    Returns:
        Tuple ``(policy, V)``: ``policy`` is an ``int32`` array of length
        ``n_s`` holding the greedy action per state (lowest index wins ties),
        and ``V`` is the corresponding value estimate.
    """
    V = np.zeros(MDP.n_s)
    while True:
        # Q[s, a] for every state/action at once, all computed from the
        # previous sweep's values: (n_s, n_a, n_s) @ (n_s,) -> (n_s, n_a).
        Q = MDP.r + MDP.alpha * (MDP.p @ V)
        V_new = Q.max(axis=1)
        converged = np.all(np.abs(V_new - V) < tol)
        V = V_new
        if converged:
            break
    # Greedy policy with respect to the last backup that produced V.
    policy = Q.argmax(axis=1).astype(np.int32)
    return policy, V

In [32]:
# Solve M with value iteration; binds the notebook-level `policy` and `value`.
policy, value = value_iteration(M)

In [33]:
# Show the action chosen in each state, followed by the per-state values.
print(policy, value)

[4 2 6 7 3 6 2 7 3 1] [17.06173218 17.22459486 17.35509404 17.41379582 17.41108894 17.34941335
 17.09016537 17.30173692 17.38143396 17.11773053]


In [34]:
def policy_evaluation_iterative(MDP, policy, tol=1e-6):
    """Evaluate a fixed deterministic policy by iterating its Bellman equation.

    Repeats ``V[s] <- r[s, policy[s]] + alpha * sum_s' p[s, policy[s], s'] * V_old[s']``
    until no component of ``V`` changes by ``tol`` or more in one sweep.

    Args:
        MDP: Object exposing ``n_s``, ``alpha``, ``r`` (shape ``(n_s, n_a)``)
            and ``p`` (shape ``(n_s, n_a, n_s)``).
        policy: Sequence of ``n_s`` integer action indices, one per state.
        tol: Per-sweep convergence threshold on ``|V - V_old|``.

    Returns:
        ``np.ndarray`` of length ``n_s`` with the estimated value of each state.
    """
    states = np.arange(MDP.n_s)
    actions = np.asarray(policy)
    # Restrict the model to the actions the policy actually takes.
    P_pol = MDP.p[states, actions]   # (n_s, n_s)
    r_pol = MDP.r[states, actions]   # (n_s,)
    V = np.zeros(MDP.n_s)
    while True:
        V_new = r_pol + MDP.alpha * (P_pol @ V)
        converged = np.all(np.abs(V_new - V) < tol)
        V = V_new
        if converged:
            break
    return V

In [36]:
# Re-evaluate the policy found above with iterative policy evaluation.
value_1 = policy_evaluation_iterative(M, policy)

In [37]:
# Per-state values from the iterative evaluation, for comparison with `value`.
print(value_1)

[17.06173218 17.22459486 17.35509404 17.41379582 17.41108894 17.34941335
 17.09016537 17.30173692 17.38143396 17.11773053]


In [42]:
def policy_evaluation_matrix(MDP, policy):
    """Evaluate a fixed deterministic policy exactly via a linear solve.

    Solves ``(I - alpha * P_pol) V = r_pol`` where ``P_pol[s] = p[s, policy[s]]``
    and ``r_pol[s] = r[s, policy[s]]``.

    Args:
        MDP: Object exposing ``n_s``, ``alpha``, ``r`` (shape ``(n_s, n_a)``)
            and ``p`` (shape ``(n_s, n_a, n_s)``).
        policy: Sequence of ``n_s`` integer action indices, one per state.

    Returns:
        ``np.ndarray`` of length ``n_s`` with the value of each state.

    Raises:
        numpy.linalg.LinAlgError: If ``I - alpha * P_pol`` is singular.
    """
    states = np.arange(MDP.n_s)
    actions = np.asarray(policy)
    P_pol = MDP.p[states, actions]   # (n_s, n_s)
    reward = MDP.r[states, actions]  # (n_s,)
    # Solve the linear system directly instead of forming an explicit
    # inverse: same result, cheaper and numerically better behaved.
    V = np.linalg.solve(np.eye(MDP.n_s) - MDP.alpha * P_pol, reward)
    return V

In [43]:
# Evaluate the same policy in closed form (linear system instead of iteration).
value_2 = policy_evaluation_matrix(M, policy)

In [44]:
# Closed-form values; these should agree with the iterative estimate up to
# the error left by the iteration's stopping threshold.
print(value_2)

[17.0617507  17.22461338 17.35511256 17.41381434 17.41110746 17.34943187
 17.09018389 17.30175544 17.38145248 17.11774904]


In [None]:
def policy_iteration(MDP, verbose=False, max_iter=1000):
    """Solve an MDP by policy iteration.

    Starts from a uniformly random deterministic policy and alternates

    1. policy evaluation (exact, via ``policy_evaluation_matrix``), and
    2. greedy policy improvement with respect to the evaluated values,

    until the greedy policy stops changing.

    Args:
        MDP: Object exposing ``n_s``, ``n_a``, ``alpha``, ``r`` (shape
            ``(n_s, n_a)``) and ``p`` (shape ``(n_s, n_a, n_s)``).
        verbose: If true, print the current values and policy before every
            improvement step.
        max_iter: Upper bound on the number of improvement steps; a safety
            net against cycling between numerically tied policies.

    Returns:
        Tuple ``(pol, V)``: the final policy (integer array of length
        ``n_s``) and the value of each state under that policy.
    """
    pol = np.random.randint(MDP.n_a, size=MDP.n_s)
    V = policy_evaluation_matrix(MDP, pol)
    for _ in range(max_iter):
        if verbose:
            print(V)
            print(pol)
        # Q[s, a] under the current policy's values:
        # (n_s, n_a, n_s) @ (n_s,) -> (n_s, n_a).
        Q = MDP.r + MDP.alpha * (MDP.p @ V)
        greedy = Q.argmax(axis=1)  # lowest action index wins ties
        if np.array_equal(greedy, pol):
            # Policy is greedy w.r.t. its own values: nothing left to improve.
            break
        pol = greedy
        # Re-evaluate so V always describes the policy that is returned.
        V = policy_evaluation_matrix(MDP, pol)
    return pol, V

In [None]:
# Solve M again with policy iteration.
# NOTE: this rebinds the notebook-level `policy` and `value` that the
# value-iteration cell assigned earlier, so re-running earlier cells after
# this one will see the policy-iteration results.
policy, value = policy_iteration(M)

In [None]:
# Show the policy-iteration result: action per state, then per-state values.
print(policy, value)