In [11]:
import numpy as np

class Test:
    """Reference (``true_*``) and candidate (``_*``) discounted-reward helpers.

    Each quantity is implemented twice in different ways so that the two
    results can be compared against each other.
    """

    def __init__(self):
        # Discount factor applied once per time step.
        self.gamma = 0.5

    def true_discounted_return(self, rewards):
        """
            Helper function

            Input: sequence of rewards {r_0, r_1, ..., r_{T-1}} from a single rollout of length T

            Output: numpy array of length T in which every entry holds the same
            value, sum_{t'=0}^{T-1} gamma^t' r_{t'}
        """
        T = len(rewards)
        # discounts[i] == gamma^i
        discounts = self.gamma**np.arange(0, T)
        sum_discounted = np.sum(discounts * rewards)
        return np.repeat(sum_discounted, T)

    def true_discounted_cumsum(self, rewards):
        """
            Helper function which
            -takes a sequence of rewards {r_0, r_1, ..., r_{T-1}},
            -and returns a list where the entry at index t is
             sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}

            An empty input yields an empty list.
        """
        T = len(rewards)
        if T == 0:
            # Nothing to accumulate; avoids indexing rewards[-1] on an empty rollout.
            return []
        # Built back-to-front with the recurrence G_t = r_t + gamma * G_{t+1},
        # so position k of this list holds the reward-to-go of step T-1-k.
        rewards_to_go_reversed = [rewards[-1]]
        for t in range(1, T):
            rewards_to_go_reversed.append(
                rewards[T-t-1] + self.gamma*rewards_to_go_reversed[t-1]
            )

        return rewards_to_go_reversed[::-1]

    def _discounted_return(self, rewards):
        """
            Helper function

            Input: sequence of rewards {r_0, r_1, ..., r_{T-1}} from a single rollout of length T

            Output: numpy array of length T in which every entry holds the same
            value, sum_{t'=0}^{T-1} gamma^t' r_{t'}
        """
        T = len(rewards)
        rewards = np.asarray(rewards)
        discounts = np.array([ self.gamma ** i for i in range(T) ])
        total_sum = np.sum(discounts * rewards)
        return np.repeat(total_sum, T)

    def _discounted_cumsum(self, rewards):
        """
            Helper function which
            -takes a sequence of rewards {r_0, r_1, ..., r_{T-1}},
            -and returns a float32 numpy array where the entry at index t is
             sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}

            Each entry is computed as an explicit dot product rather than via
            the backward recurrence used by true_discounted_cumsum.
        """
        T = len(rewards)
        rewards = np.asarray(rewards)
        # NOTE: results are stored as float32, so each entry is rounded to
        # single precision on assignment.
        list_of_discounted_cumsums = np.zeros(T, dtype=np.float32)
        # The powers of gamma do not depend on t: build them once and take the
        # leading T - t entries inside the loop.
        discounts = np.array([ self.gamma ** i for i in range(T) ])
        for t in range(T):
            list_of_discounted_cumsums[t] = np.dot(discounts[:T - t], rewards[t:])
        return list_of_discounted_cumsums

In [12]:
# Instantiate the helper class; later cells call its methods through `test`.
test = Test()

In [13]:
# Fixed reward sequence used as the input for the comparisons.
rewards = np.array([4, 5, 10, 3, 2, 1, 0, 8, 9])
# Print the reference result first, then the candidate one; the two printed
# arrays are expected to be identical.
for compute_return in (test.true_discounted_return, test._discounted_return):
    print(compute_return(rewards))


[9.62890625 9.62890625 9.62890625 9.62890625 9.62890625 9.62890625
 9.62890625 9.62890625 9.62890625]
[9.62890625 9.62890625 9.62890625 9.62890625 9.62890625 9.62890625
 9.62890625 9.62890625 9.62890625]


In [15]:
# Compare element-wise with a tolerance instead of `==`: the reference returns
# double-precision values while `_discounted_cumsum` returns float32, so exact
# equality reports spurious mismatches whenever the sums are not exactly
# representable in float32.
print(np.isclose(test.true_discounted_cumsum(rewards), test._discounted_cumsum(rewards)))

[ True  True  True  True  True  True  True  True  True]
