In [1]:
import numpy as np
import random
from tqdm.notebook import tqdm
import copy

In [2]:
class Environment:
    '''Abstract RL environment: defines the interface concrete environments fill in.'''

    def __init__(self):
        # The base class carries no state of its own.
        return None

    def reset(self):
        '''Reset hook; a no-op in the base class.'''
        return None

    def advance(self, action):
        '''
        Take one step in the environment.
        The base implementation ignores the action and returns placeholder zeros.
        Args:
            action - action to apply
        Returns:
            reward - double - reward
            newState - int - new state
            pContinue - 0/1 - flag for end of the episode
        '''
        reward, newState, pContinue = 0, 0, 0
        return reward, newState, pContinue

In [19]:
def make_riverSwim(epLen=20, nState=5):
    '''
    Makes the benchmark RiverSwim MDP: a chain of nState states with 2 actions.

    Transitions built here:
        action 0 - moves to state s-1 with probability 1 (stays in state 0)
        action 1 - interior states: 0.3 to s+1, 0.6 stay, 0.1 to s-1
                   state 0:         0.7 to state 1, 0.3 stay
                   last state:      0.9 stay, 0.1 to nState-2
    Rewards are noiseless (sd 0): 5/100 for (state 0, action 0),
    1 for (state nState-1, action 1) and 0 everywhere else.

    Args:
        epLen  - int - episode length
        nState - int - number of states in the chain, must be at least 2
    Returns:
        riverSwim - Tabular MDP environment
    Raises:
        ValueError - if nState < 2 (the chain needs two distinct end states)
    '''
    if nState < 2:
        # Fail early with a clear message instead of an incidental
        # KeyError/IndexError while filling in the tables below.
        raise ValueError('make_riverSwim requires nState >= 2, got %r' % (nState,))

    nAction = 2
    R_true = {}
    P_true = {}

    for s in range(nState):
        for a in range(nAction):
            R_true[s, a] = (0, 0)
            P_true[s, a] = np.zeros(nState)

    # Rewards, stored as (mean, sd)
    R_true[0, 0] = (5 / 100, 0)
    R_true[nState - 1, 1] = (1, 0)

    # Transitions for action 0: deterministic step towards state 0
    for s in range(nState):
        P_true[s, 0][max(0, s - 1)] = 1.

    # Transitions for action 1 in the interior of the chain
    for s in range(1, nState - 1):
        P_true[s, 1][s + 1] = 0.3
        P_true[s, 1][s] = 0.6
        P_true[s, 1][s - 1] = 0.1

    # Transitions for action 1 at the two ends of the chain
    P_true[0, 1][0] = 0.3
    P_true[0, 1][1] = 0.7
    P_true[nState - 1, 1][nState - 1] = 0.9
    P_true[nState - 1, 1][nState - 2] = 0.1

    # Replace the default tables of a fresh TabularMDP with the RiverSwim ones
    riverSwim = TabularMDP(nState, nAction, epLen)
    riverSwim.R = R_true
    riverSwim.P = P_true
    riverSwim.reset()

    return riverSwim

In [20]:
class TabularMDP(Environment):
    '''
    Episodic MDP over finite state and action sets, stored as lookup tables.
    R - dict keyed by (s,a) - R[s,a] = (meanReward, sdReward)
    P - dict keyed by (s,a) - P[s,a] = next-state probability vector of size S
    '''

    def __init__(self, nState, nAction, epLen):
        '''
        Build a tabular episodic MDP with default tables.
        Every (s,a) starts with reward (mean 1, sd 1) and a uniform
        next-state distribution.
        Args:
            nState  - int - number of states
            nAction - int - number of actions
            epLen   - int - episode length
        Returns:
            Environment object
        '''
        self.nState = nState
        self.nAction = nAction
        self.epLen = epLen

        self.timestep = 0
        self.state = 0

        # Default reward and transition tables, one entry per (state, action)
        pairs = [(s, a) for s in range(nState) for a in range(nAction)]
        self.R = {pair: (1, 1) for pair in pairs}
        self.P = {pair: np.ones(nState) / nState for pair in pairs}

    def reset(self):
        "Puts the environment back at timestep 0 in state 0"
        self.state = 0
        self.timestep = 0

    def advance(self, action):
        '''
        Take one step from the current state.
        Args:
        action - int - chosen action
        Returns:
        reward - double - reward
        newState - int - new state
        pContinue - 0/1 - 1 when this step ended the episode, else 0
        '''
        rewardStats = self.R[self.state, action]
        meanReward, sdReward = rewardStats[0], rewardStats[1]

        # A (near) zero sd means a noiseless reward: return the mean without
        # drawing from the random number generator.
        reward = meanReward if sdReward < 1e-9 else np.random.normal(loc=meanReward,
                                                                     scale=sdReward)

        nextProbs = self.P[self.state, action]
        newState = np.random.choice(self.nState, p=nextProbs)

        # Move the environment forward
        self.state = newState
        self.timestep += 1

        pContinue = 1 if self.timestep == self.epLen else 0
        if pContinue == 1:
            # Episode finished: reset internally; newState is still returned
            self.reset()

        return reward, newState, pContinue

    def argmax(self, b):
        '''Index of the largest entry of b, ties broken uniformly at random.'''
        tied = np.nonzero(b == b.max())[0]
        return np.random.choice(tied)

In [21]:
class PSRL(object):
    '''
    Posterior-sampling agent for a tabular episodic environment.

    For every (state, action) key of env.R the agent keeps:
        alpha  - observed next-state counts (Dirichlet parameters, before the
                 small constant added at sampling time)
        mu     - posterior mean of the mean reward
        sigma2 - posterior variance of the mean reward
    learn() folds one episode of buffered transitions into these statistics,
    samples a reward/transition model from them and recomputes Q by noisy
    value-iteration sweeps on the sampled model.

    The environment must expose nState, nAction, epLen, a dict R keyed by
    (state, action), and an argmax(array) method.
    '''

    def __init__(self, env):
        '''
        Args:
            env - environment the agent acts in (see class docstring)
        '''
        self.env = env
        keys = list(env.R.keys())
        self.alpha = {key: np.zeros(env.nState) for key in keys}
        self.mu = {key: 0.0 for key in keys}
        self.sigma2 = {key: 1.0 for key in keys}
        # Reward observation noise, used as a VARIANCE in the posterior update.
        # Kept tiny because the rewards observed here are deterministic.
        self.sigma_r = 0.00001
        # buffer[t] collects the transition seen at timestep t of every episode
        self.buffer = {h: [] for h in range(env.epLen)}
        self.P = {key: np.zeros(env.nState) for key in keys}
        self.R = {key: 0.0 for key in keys}
        self.Rbar = {key: 0.0 for key in keys}
        self.Q = {key: 0.0 for key in keys}
        # Defined up front so the attribute exists before the first learn()
        self.V = np.zeros(env.nState)

    def act(self, s):
        '''Returns the action with the largest Q[(s, a)], ties broken by env.argmax.'''
        x = np.array([self.Q[(s, a)] for a in range(self.env.nAction)])
        return self.env.argmax(x)

    def learn(self, l):
        '''
        Updates statistics, sampled model and value functions after an episode.
        Args:
            l - int - index of the episode just finished (position in each buffer[t])
        '''
        self.update_statistics(l)
        self.update_priors()
        self.update_value_functions()

    def update_buffer(self, s, a, r, s_, t):
        '''Stores transition (s, a, r, s_) observed at timestep t; s_ may be None.'''
        self.buffer[t].append((s, a, r, s_))

    def update_statistics(self, l):
        '''
        Folds the l-th stored transition of every timestep into alpha, mu and sigma2.
        Args:
            l - int - episode index; every buffer[t] must hold at least l+1 entries
        '''
        for transitions in self.buffer.values():
            s, a, r, s_ = transitions[l]
            # A None successor marks the terminal step: no transition is counted
            if s_ is not None:
                self.alpha[(s, a)][s_] += 1
            # Precision-weighted Normal update of the mean-reward posterior
            prior_precision = 1 / self.sigma2[(s, a)]
            obs_precision = 1 / self.sigma_r
            total_precision = prior_precision + obs_precision
            self.mu[(s, a)] = (prior_precision * self.mu[(s, a)] + obs_precision * r) / total_precision
            self.sigma2[(s, a)] = 1 / total_precision

    def update_priors(self):
        '''Samples a mean reward, a reward and a transition vector for every (s, a).'''
        for s in range(self.env.nState):
            for a in range(self.env.nAction):
                # np.random.normal expects a standard deviation as `scale`, while
                # sigma2 and sigma_r are variances, hence the square roots.
                self.Rbar[(s, a)] = np.random.normal(self.mu[(s, a)], np.sqrt(self.sigma2[(s, a)]))
                self.R[(s, a)] = np.random.normal(self.Rbar[(s, a)], np.sqrt(self.sigma_r))
                # numpy's dirichlet rejects zero parameters, so add a small constant
                self.P[(s, a)] = np.random.dirichlet(self.alpha[(s, a)] + 0.004)

    def update_value_functions(self):
        '''
        Recomputes Q and V from the sampled model self.R / self.P.
        Runs epLen + 10 in-place sweeps over all (s, a), adding zero-mean Gaussian
        noise to each backup; the noise scale shrinks with the visit count of (s, a).
        '''
        nState, nAction, epLen = self.env.nState, self.env.nAction, self.env.epLen
        Q = {key: 0.0 for key in self.env.R.keys()}
        V = np.zeros(nState)
        for _ in range(epLen + 10):
            for s in range(nState):
                for a in range(nAction):
                    visits = self.alpha[s, a].sum()
                    # NOTE(review): this value is passed as the `scale` (std) of the
                    # noise; confirm whether it was meant to be a variance.
                    w = np.random.normal(0, pow(epLen + 1, 2) / max(visits - 2, 1))
                    Q[(s, a)] = self.R[(s, a)] + np.inner(self.P[(s, a)], V) + w
                V[s] = max(Q[(s, a_)] for a_ in range(nAction))
        self.Q = Q.copy()
        self.V = V

In [22]:
# Fix the global NumPy seed so Restart & Run All reproduces the same run
SEED = 0
N_EPISODES = 1000
np.random.seed(SEED)

env = make_riverSwim(epLen = 20, nState = 5)
agent = PSRL(env)
for l in tqdm(range(N_EPISODES)):
    # No explicit reset needed: env.advance() resets the environment itself
    # on the step that ends the episode.
    done = 0
    while done != 1:
        s = env.state
        a = agent.act(s)
        t = env.timestep
        r,s_,done = env.advance(a)
        # The terminal step is stored with a None successor so the agent
        # does not count a transition for it.
        agent.update_buffer(s,a,r,s_ if done != 1 else None,t)
    # Fold episode l into the agent's statistics and re-plan
    agent.learn(l)
        

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [24]:
# Inspect the transition vectors sampled by the agent's most recent learn() call
agent.P

{(0,
  0): array([1.00000000e+000, 2.12109355e-027, 1.09353152e-084, 0.00000000e+000,
        8.94882877e-214]),
 (0, 1): array([3.17892148e-01, 6.82107852e-01, 3.93627235e-89, 8.71410467e-36,
        2.06526630e-33]),
 (1,
  0): array([1.00000000e+000, 2.25341636e-063, 9.37256278e-113, 7.17176768e-135,
        2.63427800e-256]),
 (1, 1): array([1.23222287e-01, 5.71101634e-01, 3.05676079e-01, 1.54622397e-36,
        4.94662910e-74]),
 (2,
  0): array([1.21624958e-159, 1.00000000e+000, 3.83656776e-102, 1.11355439e-039,
        2.13240020e-093]),
 (2,
  1): array([0.00000000e+000, 9.63577666e-002, 5.78189437e-001, 3.25452796e-001,
        5.60402689e-101]),
 (3,
  0): array([1.48056261e-239, 1.62592634e-249, 1.00000000e+000, 1.52680536e-081,
        7.94567871e-142]),
 (3, 1): array([6.82396734e-20, 1.43814186e-95, 9.95105924e-02, 6.03608659e-01,
        2.96880749e-01]),
 (4,
  0): array([6.83079699e-158, 2.03377595e-172, 1.84367720e-019, 1.00000000e+000,
        1.54686903e-097]),
 (4,

In [27]:
# Gather the sampled transition vectors, one per (state, action) key,
# in the dictionary's own key order.
arr = []
for key in agent.P:
    arr += [agent.P[key]]

In [30]:
# Convert the list of per-key vectors into a single NumPy array (one row per key)
arr = np.array(arr)

[[1.00000000e+000 2.12109355e-027 1.09353152e-084 0.00000000e+000
  8.94882877e-214]
 [3.17892148e-001 6.82107852e-001 3.93627235e-089 8.71410467e-036
  2.06526630e-033]
 [1.00000000e+000 2.25341636e-063 9.37256278e-113 7.17176768e-135
  2.63427800e-256]
 [1.23222287e-001 5.71101634e-001 3.05676079e-001 1.54622397e-036
  4.94662910e-074]
 [1.21624958e-159 1.00000000e+000 3.83656776e-102 1.11355439e-039
  2.13240020e-093]
 [0.00000000e+000 9.63577666e-002 5.78189437e-001 3.25452796e-001
  5.60402689e-101]
 [1.48056261e-239 1.62592634e-249 1.00000000e+000 1.52680536e-081
  7.94567871e-142]
 [6.82396734e-020 1.43814186e-095 9.95105924e-002 6.03608659e-001
  2.96880749e-001]
 [6.83079699e-158 2.03377595e-172 1.84367720e-019 1.00000000e+000
  1.54686903e-097]
 [7.61555243e-047 1.11939142e-012 2.15284901e-056 8.66955578e-002
  9.13304442e-001]]
