In [18]:
import numpy as np
import random
import copy
from scipy.stats import bernoulli
#from tqdm.notebook import tqdm

In [19]:
class Environment:
    '''Minimal interface shared by the RL environments in this notebook'''

    def __init__(self):
        '''The base class keeps no state, so there is nothing to set up.'''

    def reset(self):
        '''Puts the environment back into its starting configuration (no-op here).'''

    def advance(self, action):
        '''
        Takes a single step in the environment.
        Args:
            action - the action picked by the agent (ignored by this stub)
        Returns:
            reward - double - reward (always 0 in the base class)
            newState - int - new state (always 0 in the base class)
            pContinue - 0/1 - flag for end of the episode (always 0 in the base class)
        '''
        placeholder = (0, 0, 0)
        return placeholder

In [20]:
def make_riverSwim(epLen=20, nState=5):
    '''
    Builds the benchmark RiverSwim MDP as a TabularMDP.

    Action 0 moves deterministically one state down (state 0 stays put).
    Action 1 is the noisy move towards the last state. The only non-zero
    rewards are 5/1000 for (state 0, action 0) and 1 for
    (last state, action 1); both are noiseless (sd = 0).

    Args:
        epLen  - int - episode length handed to TabularMDP
        nState - int - number of states; the boundary rows below index
                       states 1 and nState - 2, so at least 2 states are needed
    Returns:
        riverSwim - Tabular MDP environment, already reset
    '''
    nAction = 2
    last = nState - 1

    # Enumerate (s, a) with s as the outer index: this fixes the insertion
    # order of both dictionaries.
    pairs = [(s, a) for s in range(nState) for a in range(nAction)]
    R_true = {sa: (0, 0) for sa in pairs}
    P_true = {sa: np.zeros(nState) for sa in pairs}

    # Rewards, stored as (mean, sd)
    R_true[0, 0] = (5/1000, 0)
    R_true[last, 1] = (1, 0)

    # Transitions for action 0: always one step down, clipped at state 0
    for s in range(nState):
        P_true[s, 0][max(0, s-1)] = 1.

    # Transitions for action 1 in the interior states:
    # down with 0.1, stay with 0.6, up with 0.3
    for s in range(1, last):
        P_true[s, 1][s-1:s+2] = (0.1, 0.6, 0.3)

    # Transitions for action 1 at the two boundary states
    first_row = P_true[0, 1]
    first_row[0] = 0.3
    first_row[1] = 0.7
    last_row = P_true[last, 1]
    last_row[last] = 0.9
    last_row[last - 1] = 0.1

    riverSwim = TabularMDP(nState, nAction, epLen)
    riverSwim.R = R_true
    riverSwim.P = P_true
    riverSwim.reset()

    return riverSwim

class TabularMDP(Environment):
    '''
    Episodic MDP over finite state and action sets
    R - dict by (s,a) - each R[s,a] = (meanReward, sdReward)
    P - dict by (s,a) - each P[s,a] = transition vector size S
    '''

    def __init__(self, nState, nAction, epLen):
        '''
        Sets up a tabular episodic MDP with placeholder dynamics:
        every reward is (mean 1, sd 1) and every transition row is uniform.
        Args:
            nState  - int - number of states
            nAction - int - number of actions
            epLen   - int - episode length
        Returns:
            Environment object
        '''
        self.nState, self.nAction, self.epLen = nState, nAction, epLen

        self.timestep = 0
        self.state = 0

        # Default R and P; callers typically overwrite both afterwards
        pairs = [(s, a) for s in range(nState) for a in range(nAction)]
        self.R = {sa: (1, 1) for sa in pairs}
        self.P = {sa: np.ones(nState) / nState for sa in pairs}

    def reset(self):
        "Puts the environment back at timestep 0 in state 0"
        self.timestep = 0
        self.state = 0

    def advance(self, action):
        '''
        Takes one step in the environment
        Args:
        action - int - chosen action
        Returns:
        reward - double - reward
        newState - int - new state
        episodeEnd - 0/1 - flag for end of the episode
        '''
        spec = self.R[self.state, action]
        mean, sd = spec[0], spec[1]
        # A (near-)zero standard deviation means the reward is returned as is,
        # without drawing from the normal distribution
        reward = mean if sd < 1e-9 else np.random.normal(loc=mean, scale=sd)

        newState = np.random.choice(self.nState, p=self.P[self.state, action])

        # Move the environment forward
        self.state = newState
        self.timestep += 1

        # At the horizon the environment resets itself; newState is still
        # the state that was reached by this step
        episodeEnd = int(self.timestep == self.epLen)
        if episodeEnd:
            self.reset()

        return reward, newState, episodeEnd

    def argmax(self, b):
        '''
        Returns an index at which b attains its maximum; when several
        entries are maximal one of them is drawn at random.
        Args:
        b - numpy array of values
        '''
        best = np.where(b == b.max())[0]
        return np.random.choice(best)

In [21]:
def proj(x, lo, hi):
    '''Clamps x to the [lo,hi] interval and returns the clamped value'''
    # Cap from above first, then from below (so lo wins if lo > hi)
    capped = hi if hi < x else x
    return lo if lo > capped else capped

In [22]:
class UCRL_VTR(object):
    '''
    Algorithm 1 as described in the paper Model-Based RL with
    Value-Target Regression
    The algorithm assumes that the rewards are in the [0,1] interval.

    Every environment quantity (nState, nAction, epLen, R, argmax) is read
    from self.env, i.e. the environment handed to the constructor, so the
    agent does not depend on a notebook-level variable called `env`.
    '''
    def __init__(self,env,K):
        '''
        Inputs:
            env - the environment; must expose nState, nAction, epLen, R and argmax
            K - the number of episodes, used to set delta = 1/K
        '''
        self.env = env
        self.K = K
        # Here the dimension (self.d) for the Tabular setting is |S x A x S| as stated in Appendix B
        self.d = env.nState * env.nAction * env.nState
        # In the tabular setting the basis models is just the dxd identity matrix, see Appendix B
        self.P_basis = np.identity(self.d)
        # Q-values: one (nState x nAction) numpy array per timestep of the episode
        self.Q = [np.zeros((env.nState,env.nAction)) for i in range(env.epLen)]
        # State values: one 1d numpy array per timestep, plus a terminal one
        self.V = [np.zeros(env.nState) for i in range(env.epLen+1)] # self.V[env.epLen] stays zero
        # The index of each (s,a,s') tuple, see Appendix B
        self.sigma = {}
        self.createSigma()
        # See Step 2 of algorithm 1. The regularizer is lam * I here; the
        # original author also tried epLen**2 * d * I.
        self.lam = 1.0
        self.L = 1.0
        self.M = np.identity(self.d)*self.lam
        # See Step 2
        self.w = np.zeros(self.d)
        # See Step 2
        self.theta = np.matmul(np.linalg.inv(self.M),self.w)
        # See Step 3
        self.delta = 1/self.K
        # C_theta >= the 2-norm of theta_star, see Assumption 1
        self.C_theta = 3.0

    def feature_vector(self,s,a,h):
        '''
        Returning sum_{s'} V[h+1][s'] P_dot(s'|s,a),
        with V stored in self.
        Inputs:
            s - the state
            a - the action
            h - the current timestep within the episode
        Returns:
            a numpy array of length self.d
        '''
        sums = np.zeros(self.d)
        for ss in range(self.env.nState):
            sums += self.V[h+1][ss] * self.P_basis[self.sigma[(s,a,ss)]]
        return sums

    def update_Q(self,s,a,k,h,M_inv=None):
        '''
        A function that updates both Q and V, Q is updated according to equation 4 and
        V is updated according to equation 2
        Inputs:
            s - the state
            a - the action
            k - the current episode
            h - the current timestep within the episode
            M_inv - optional precomputed inverse of self.M; when omitted it is
                    computed here. update_Qend passes it so the inverse is
                    taken once per episode instead of once per (h,s,a).
        NOTE (original author): Currently, does not properly compute the
        Q-values but it does seem to learn theta_star
        '''
        if M_inv is None:
            M_inv = np.linalg.inv(self.M)
        X = self.feature_vector(s,a,h)
        # X^T M^{-1} X is non-negative in exact arithmetic; clamp it so that
        # round-off can never hand a negative number to the square root
        width = max(np.dot(np.dot(X,M_inv),X), 0.0)
        # Here self.env.R[(s,a)][0] is the true mean reward from the environment
        optimistic = self.env.R[(s,a)][0] + np.dot(X,self.theta) \
            + self.Beta(k) * np.sqrt(width)
        self.Q[h][s,a] = proj(optimistic, 0, self.env.epLen)
        self.V[h][s] = max(self.Q[h][s,:])

    def update_Qend(self,k):
        '''
        A function that updates both Q and V at the end of each episode, see step 16 of algorithm 1
        Inputs:
            k - the current episode
        '''
        # self.M does not change during the backward pass, so invert it once
        M_inv = np.linalg.inv(self.M)
        #step 16: go backwards in time because Q[h] depends on V[h+1]
        for h in range(self.env.epLen-1,-1,-1):
            for s in range(self.env.nState):
                for a in range(self.env.nAction):
                    self.update_Q(s,a,k,h,M_inv)
                self.V[h][s] = max(self.Q[h][s,:])

    def update_stat(self,s,a,s_,h):
        '''
        A function that performs steps 9-13 of algorithm 1
        Inputs:
            s - the current state
            a - the action
            s_ - the next state
            h - the timestep within episode when s was visited (starting at zero)
        '''
        #Step 10
        X = self.feature_vector(s,a,h)
        #Step 11
        y = self.V[h+1][s_]
        #Step 12
        self.M = self.M + np.outer(X,X)
        #Step 13
        self.w = self.w + y*X

    def update_param(self):
        '''
        Updates our approximation of theta_star at the end of each episode, see
        Step 15 of algorithm1
        '''
        #Step 15
        self.theta = np.matmul(np.linalg.inv(self.M),self.w)

    def act(self,s,h):
        '''
        Returns the greedy action with respect to Q_{h,k}(s,a) for a \\in A
        see step 8 of algorithm 1
        Inputs:
            s - the current state
            h - the current timestep within the episode
        '''
        #step 8
        return self.env.argmax(self.Q[h][s,:])

    def createSigma(self):
        '''
        A simple function that creates sigma according to Appendix B.
        Here sigma is a dictionary whose key is a tuple (s,a,s') and whose
        value is the integer index to be used in our basis model P.
        Indices are assigned in the order s, then a, then s'.
        '''
        i = 0
        for s in range(self.env.nState):
            for a in range(self.env.nAction):
                for s_ in range(self.env.nState):
                    self.sigma[(s,a,s_)] = int(i)
                    i += 1

    def Beta(self,k):
        '''
        A function that return Beta_k, the width multiplier of the bonus
        (Algorithm 1, step 3). The bonus printed in step 3 of the paper is
        not used; the value returned is the confidence bound from
        Chapter 19/20 of the Bandit Algorithm book.
        Inputs:
            k - the current episode
        '''
        first = np.sqrt(self.lam)*self.L
        second = np.sqrt(2*np.log(1/self.delta) + self.d*np.log((self.d*self.lam + k*self.L*self.L)/(self.d*self.lam)))
        return first + second
        

In [23]:
# The environment draws transitions (and breaks argmax ties) with np.random,
# so fix the seed to make re-running this cell reproduce the same episodes.
SEED = 0
np.random.seed(SEED)

env = make_riverSwim(epLen = 20, nState = 4)
K = 100  # number of episodes
agent = UCRL_VTR(env,K)
# count[s, s_] = number of observed transitions from state s to state s_
count = np.zeros((env.nState,env.nState))
# R = cumulative reward collected over all K episodes
R = 0
for k in range(1,K+1):
    env.reset()
    done = 0
    while done != 1:
        # Read the state and timestep BEFORE advancing: advance() updates both
        # (and resets them when the episode ends)
        s = env.state
        h = env.timestep
        a = agent.act(s,h)
        r,s_,done = env.advance(a)
        R += r
        count[s,s_] += 1
        agent.update_stat(s,a,s_,h)
    # End of episode: refit theta, then recompute Q and V for the next episode
    agent.update_param()
    agent.update_Qend(k)

In [24]:
# Flatten the true transition vectors into one list, following the insertion
# order of env.P (s, then a, then s'), so that the entries line up with agent.theta
true_p = [prob for row in env.P.values() for prob in row]
print('The 2-norm of (P_true - theta_star) is:',np.linalg.norm(true_p-agent.theta))
# With epLen = 20, K = 100, nState = 4 the cumulative reward with the old bonus was ~12,
# with the bonus from the bandit algorithm book it is ~460, and the cumulative reward of
# the optimal policy with these parameters is ~800.
print('The total reward is:', R)


The 2-norm of (P_true - theta_star) is: 1.2643956538169365
The total reward is 527.2599999999999


In [12]:
# Display the current estimate theta: one entry per (s, a, s') index in agent.sigma
agent.theta

array([ 9.84652486e-01,  1.53622165e-02, -4.48324771e-04,  4.05006054e-04,
        4.28276410e-01,  5.69635994e-01,  9.61941613e-04,  2.99294562e-03,
        9.69299478e-01,  3.42594882e-02, -6.36210085e-03,  2.65715458e-03,
        3.16429077e-02,  7.35174128e-01,  2.24427534e-01,  1.04796513e-02,
        1.08320004e-01,  8.79548571e-01,  1.76607034e-02, -5.53039191e-03,
       -5.09963078e-01,  9.46590149e-01,  2.01348130e-01,  3.91325049e-01,
        3.48400943e-03,  2.99804743e-04,  9.80665045e-01,  1.49720128e-02,
        2.82735960e-01, -4.13678370e-01,  1.66713253e-01,  9.31155912e-01])

In [39]:
# Print the Q-value table of every timestep h = 0 .. epLen-1;
# each table has one row per state and one column per action
for z in agent.Q:
    print(z)

[[18.82404356 18.95583903]
 [19.23185577 19.51060409]
 [20.         20.        ]
 [20.         20.        ]]
[[17.70953919 17.69966203]
 [18.17218124 18.55788825]
 [20.         20.        ]
 [20.         20.        ]]
[[16.32746496 16.3141904 ]
 [16.89001908 17.19689512]
 [18.93947808 19.66207162]
 [20.         20.        ]]
[[15.05366465 15.04519871]
 [15.663916   15.82523209]
 [17.41453721 18.58130794]
 [20.         20.        ]]
[[13.80137454 13.85467144]
 [14.36223827 14.38483951]
 [15.73017725 17.20693048]
 [19.25613745 20.        ]]
[[12.46116308 12.5132217 ]
 [12.97833483 12.99998327]
 [14.23568677 15.6096709 ]
 [17.47109707 19.19369111]]
[[11.20865884 11.26136764]
 [11.68871872 11.71330642]
 [12.85499012 14.13327182]
 [15.82227312 17.48482788]]
[[10.03395217 10.08689661]
 [10.48290682 10.51313818]
 [11.57502475 12.77067551]
 [14.30847827 15.90997454]]
[[ 8.92496248  8.97835538]
 [ 9.34716152  9.38281959]
 [10.3741947  11.50747884]
 [12.91341198 14.46677476]]
[[ 7.87340233  7.92