# Has not been debugged or optimized

In [1]:
import numpy as np
import random
#from tqdm.notebook import tqdm
import copy

In [2]:
class Environment(object):
    '''General RL environment'''

    def __init__(self):
        pass

    def reset(self):
        pass

    def advance(self, action):
        '''
        Moves one step in the environment.
        Args:
            action
        Returns:
            reward - double - reward
            newState - int - new state
            pContinue - 0/1 - flag for end of the episode
        '''
        return 0, 0, 0

def make_riverSwim(epLen=20, nState=5):
    '''
    Makes the benchmark RiverSwim MDP.
    Args:
        NULL - works for default implementation
    Returns:
        riverSwim - Tabular MDP environment '''
    nAction = 2
    R_true = {}
    P_true = {}
    states = {}
    for s in range(nState):
        states[(s)] = 0.0
        for a in range(nAction):
            R_true[s, a] = (0, 0)
            P_true[s, a] = np.zeros(nState)

    # Rewards
    R_true[0, 0] = (5/1000, 0)
    R_true[nState - 1, 1] = (1, 0)

    # Transitions
    for s in range(nState):
        P_true[s, 0][max(0, s-1)] = 1.

    for s in range(1, nState - 1):
        P_true[s, 1][min(nState - 1, s + 1)] = 0.3
        P_true[s, 1][s] = 0.6
        P_true[s, 1][max(0, s-1)] = 0.1

    P_true[0, 1][0] = 0.3
    P_true[0, 1][1] = 0.7
    P_true[nState - 1, 1][nState - 1] = 0.9
    P_true[nState - 1, 1][nState - 2] = 0.1

    riverSwim = TabularMDP(nState, nAction, epLen)
    riverSwim.R = R_true
    riverSwim.P = P_true
    riverSwim.states = states
    riverSwim.reset()

    return riverSwim

class TabularMDP(Environment):
    '''
    Tabular MDP
    R - dict by (s,a) - each R[s,a] = (meanReward, sdReward)
    P - dict by (s,a) - each P[s,a] = transition vector size S
    '''

    def __init__(self, nState, nAction, epLen):
        '''
        Initialize a tabular episodic MDP
        Args:
            nState  - int - number of states
            nAction - int - number of actions
            epLen   - int - episode length
        Returns:
            Environment object
        '''

        self.nState = nState
        self.nAction = nAction
        self.epLen = epLen

        self.timestep = 0
        self.state = 0

        # Now initialize R and P
        self.R = {}
        self.P = {}
        self.states = {}
        for state in range(nState):
            for action in range(nAction):
                self.R[state, action] = (1, 1)
                self.P[state, action] = np.ones(nState) / nState

    def reset(self):
        "Resets the Environment"
        self.timestep = 0
        self.state = 0

    def advance(self,action):
        '''
        Move one step in the environment
        Args:
        action - int - chosen action
        Returns:
        reward - double - reward
        newState - int - new state
        episodeEnd - 0/1 - flag for end of the episode
        '''
        if self.R[self.state, action][1] < 1e-9:
            # Hack for no noise
            reward = self.R[self.state, action][0]
        else:
            reward = np.random.normal(loc=self.R[self.state, action][0],
                                      scale=self.R[self.state, action][1])
        #print(self.state, action, self.P[self.state, action])
        newState = np.random.choice(self.nState, p=self.P[self.state, action])

        # Update the environment
        self.state = newState
        self.timestep += 1

        episodeEnd = 0
        if self.timestep == self.epLen:
            episodeEnd = 1
            #newState = None
            self.reset()

        return reward, newState, episodeEnd

    def argmax(self,b):
        #print(b)
        return np.random.choice(np.where(b == b.max())[0])

In [95]:
class deep_sea(Environment):
    '''
    Description:
        A deep sea environment, where a diver goes
        down and each time and she needs to make a
        decision to go left or right.
        environment terminates after fixed time step

    Observation:
        [horizontal position, vertical position]

    Actions:
        2 possible actions:
        0 - left
        1 - right

    Starting State:
        start at position 0, time step 0

    Episode termination:
        Env terminates after fixed number of time steps
    '''

    def __init__(self, num_steps):
        self.num_steps = num_steps
        self.epLen = num_steps
        self.flip_mask = 2*np.random.binomial(1,0.5,(num_steps,num_steps))-1
        self.nAction = 2
        self.nState = num_steps
        self.epLen = num_steps
        self.R = {}
        self.states = {}
        for s in range(self.nState+1):
            for s_ in range(self.nState+1):
                self.R[(s,s_), 0] = (0, 0)
                self.R[(s,s_), 1] = (-0.01/self.nState, 0)
                self.states[(s,s_)] = []
        self.R[(self.num_steps-1,self.num_steps-1),1] = (0.99,0)

    def name(self):
        return  "deep sea"

    def reset(self):
        self.state = (0,0)
        self.timestep = 0
        return copy.deepcopy(self.state)

    def advance(self,action):
        assert action in [0,1], "invalid action"
        self.state_prev = self.state
        step_horizontal = (2*action-1)
        horizontal = max(self.state[0] + step_horizontal, 0)
        vertical = self.state[1] + 1
        done =  bool(vertical == self.num_steps)
        self.state = (horizontal, vertical)
        self.timestep += 1
        return self.R[self.state_prev,action][0], copy.deepcopy(self.state), done
    
    def argmax(self,b):
        return np.random.choice(np.where(b == b.max())[0])

In [99]:
class UCBVI(object):
    def __init__(self,env,K):
        self.env = env
        self.K = K
        self.delta = 1/3
        self.buffer = {h: [] for h in range(self.env.epLen)}
        self.Nxay = {(s,a,s_): 0.0 for s in self.env.states.keys() for a in range(self.env.nAction) \
                     for s_ in self.env.states.keys()}
        self.Nxa = {(s,a): 0.0 for s in self.env.states.keys() for a in range(self.env.nAction)}
        self.N_ = {(h,s,a): 0.0 for h in range(self.env.epLen+1) for s in self.env.states.keys() \
                   for a in range(self.env.nAction)}
        self.P = {(s,a,s_): 0.0 for s in self.env.states.keys() for a in \
                  range(self.env.nAction) for s_ in self.env.states.keys()}
        self.Q = {(h,s,a): self.env.epLen+1 for h in range(self.env.epLen) for s in self.env.states.keys() \
                   for a in range(self.env.nAction)}
        
    
    def update_buffer(self,s,a,r,s_,h):
        self.buffer[h].append((s,a,r,s_,h))
    
    def act(self,s,h):
        x = np.array([self.Q[(h,s,a)] for a in range(self.env.nAction)])
        return self.env.argmax(x)
    
    def learn(self,k):
        self.update_counts(k)
        self.update_probability_transition()
        self.update_value_functions(k)
    
    def update_probability_transition(self):
        for s in self.env.states.keys():
            for a in range(self.env.nAction):
                if self.Nxa[(s,a)] > 0:
                    for s_ in self.env.states.keys():
                        self.P[(s,a,s_)] = (self.Nxay[(s,a,s_)]) / (self.Nxa[(s,a)])
    
    def update_counts(self,k):
        for d in self.buffer.values():
            #print(d)
            #print(k)
            #print(d[k][0])
            s,a,r,s_,h = d[k][0],d[k][1],d[k][2],d[k][3],d[k][4]
            if s_ != None:
                self.Nxay[(s,a,s_)] += 1
            self.Nxa[(s,a)] += 1
            self.N_[(h,s,a)] += 1
    
    def update_value_functions(self,k):
        V = {(h,s): 0.0 for s in self.env.states.keys() for h in range(env.epLen + 1)}
        for h in range(self.env.epLen-1,-1,-1):
            #print(h)
            for s in self.env.states.keys():
                for a in range(self.env.nAction):
                    if self.Nxa[(s,a)] > 0:
                        #bonus = self.bonus_1(s,a)
                        PV = self.multiplyDictionaries(s,a,h,V)
                        bonus = self.bonus_2(s,a,h,V)
                        self.Q[(h,s,a)] = min(min(self.Q[(h,s,a)], self.env.epLen), self.env.R[(s,a)][0] + PV + bonus)
                    else:
                        self.Q[(h,s,a)] = self.env.epLen
                V[(h,s)] = max(np.array([self.Q[(h,s,a)] for a in range(self.env.nAction)]))
                        
    
    def bonus_1(self,s,a):
        T = self.K * self.env.epLen
        L = np.log(5 * self.env.nState * self.env.nAction  * T / self.delta)
        return 7 * env.epLen * L * np.sqrt(1 / self.Nxa[(s,a)])
    
    def bonus_2(self,s,a,h,V):
        temp = []
        T = self.K * self.env.epLen
        L = np.log(5 * self.env.nState * self.env.nAction  * T / self.delta)
        for s_ in self.env.states.keys():
            c = V[(h+1,s_)] * self.P[(s,a,s_)]
            temp.append(c)
        var = np.var(temp)
        #print(var)
        first = np.sqrt(8*L*var/self.Nxa[(s,a)])
        second = 14*self.env.epLen*L/(3*self.Nxa[(s,a)])
        #third = np.sqrt(8*sums/self.Nxa[(s,a)])
        return first + second
    
    def multiplyDictionaries(self,s,a,h,V):
        sums = 0.0
        for s_ in self.env.states.keys():
            sums += V[(h+1,s_)] * self.P[(s,a,s_)]
        return sums

In [108]:
env = deep_sea(num_steps = 5)
K = 10000
agent = UCBVI(env,K)
reward = 0.0

In [109]:
for k in tqdm(range(K)):
    env.reset()
    done = 0
    while done != 1:
        s = env.state
        h = env.timestep
        a = agent.act(s,h)
        if k == K-1:
            print(a)
        r,s_,done = env.advance(a)
        reward += r
        if done != 1:
            agent.update_buffer(s,a,r,s_,h)
        else:
            agent.update_buffer(s,a,r,s_,h)
    agent.learn(k)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

1
1
1
1
1



In [110]:
agent.Q

{(0, (0, 0), 0): 1.5737043332815208,
 (0, (0, 0), 1): 1.574066540878821,
 (0, (0, 1), 0): 5,
 (0, (0, 1), 1): 5,
 (0, (0, 2), 0): 5,
 (0, (0, 2), 1): 5,
 (0, (0, 3), 0): 5,
 (0, (0, 3), 1): 5,
 (0, (0, 4), 0): 5,
 (0, (0, 4), 1): 5,
 (0, (0, 5), 0): 5,
 (0, (0, 5), 1): 5,
 (0, (1, 0), 0): 5,
 (0, (1, 0), 1): 5,
 (0, (1, 1), 0): 5,
 (0, (1, 1), 1): 5,
 (0, (1, 2), 0): 5,
 (0, (1, 2), 1): 5,
 (0, (1, 3), 0): 5,
 (0, (1, 3), 1): 5,
 (0, (1, 4), 0): 5,
 (0, (1, 4), 1): 5,
 (0, (1, 5), 0): 5,
 (0, (1, 5), 1): 5,
 (0, (2, 0), 0): 5,
 (0, (2, 0), 1): 5,
 (0, (2, 1), 0): 5,
 (0, (2, 1), 1): 5,
 (0, (2, 2), 0): 5,
 (0, (2, 2), 1): 5,
 (0, (2, 3), 0): 5,
 (0, (2, 3), 1): 5,
 (0, (2, 4), 0): 5,
 (0, (2, 4), 1): 5,
 (0, (2, 5), 0): 5,
 (0, (2, 5), 1): 5,
 (0, (3, 0), 0): 5,
 (0, (3, 0), 1): 5,
 (0, (3, 1), 0): 5,
 (0, (3, 1), 1): 5,
 (0, (3, 2), 0): 5,
 (0, (3, 2), 1): 5,
 (0, (3, 3), 0): 5,
 (0, (3, 3), 1): 5,
 (0, (3, 4), 0): 5,
 (0, (3, 4), 1): 5,
 (0, (3, 5), 0): 5,
 (0, (3, 5), 1): 5,
 (0, (4

In [111]:
reward

3120.434000000643