In [52]:
import sys
# Make the parent directory importable — presumably so the `rl` package imported
# further down resolves; verify against the repository layout.
sys.path.append("../")
# NOTE(review): machine-specific absolute path into one user's conda environment.
# It has no effect (or the wrong effect) on any other machine; prefer activating
# the right environment over extending sys.path by hand.
sys.path.append("/Users/abhinavrangarajan/opt/anaconda3/envs/SBALoan/lib/python3.7/site-packages")

## Problem 1 : Monte-Carlo prediction w/ Tabular Approx

In [179]:
import numpy as np
from rl.function_approx import FunctionApprox 
from dataclasses import dataclass
from typing import TypeVar, Iterable, Tuple, Optional
from __future__ import annotations
from collections import defaultdict

X = TypeVar('X')


### Define Custom Tabular MC-Prediction

In [304]:

class TabularApprox(FunctionApprox):
    """Tabular function approximation: one running-mean estimate per distinct key.

    `count_dict[x]` is the number of samples seen for key `x` and
    `value_dict[x]` is the incremental average of the `y` values observed for
    that key. Keys that have never been updated evaluate to 0.
    """

    def __init__(self):
        # Both tables default to 0 so the first update of an unseen key needs
        # no special-casing.
        self.count_dict = defaultdict(int)
        self.value_dict = defaultdict(int)

    def evaluate(self, x_values_seq: Iterable[X]) -> np.ndarray:
        """Return the current estimate for each key in `x_values_seq`.

        Unseen keys evaluate to 0. The lookup is read-only: `.get` is used
        instead of indexing because indexing a defaultdict inserts the missing
        key, which would silently grow the table on every query.
        """
        return np.array([self.value_dict.get(x, 0) for x in x_values_seq])

    def representational_gradient(self, x_value: X) -> TabularApprox[X]:
        """Placeholder: no gradient is computed for the tabular case (returns None)."""
        return None

    def solve(
        self,
        xy_vals_seq: Iterable[Tuple[X, float]],
        error_tolerance: Optional[float] = None
    ) -> TabularApprox[X]:
        """Fit a brand-new table to `xy_vals_seq` and return it.

        `self` is left untouched. `error_tolerance` is accepted for interface
        compatibility but is not used: the per-key mean is computed in a
        single pass.
        """
        fitted = TabularApprox()
        fitted.update(xy_vals_seq=xy_vals_seq)
        return fitted

    def update(
        self,
        xy_vals_seq: Iterable[Tuple[X, float]]
    ) -> TabularApprox:
        """Fold each `(x, y)` sample into the running mean stored for key `x`.

        Mutates this instance in place and returns it so calls can be chained.
        """
        for (x, y) in xy_vals_seq:
            self.count_dict[x] += 1
            # Incremental mean: V <- V + (y - V) / n
            self.value_dict[x] += (1 / self.count_dict[x]) * (y - self.value_dict[x])
        return self

    def within(self, other: FunctionApprox[X], tolerance: float) -> bool:
        """Return True if every key of `self` exists in `other` within `tolerance`.

        Returns False when `other` is not a `TabularApprox`, when a key of
        `self` is missing from `other`, or when any pair of values differs by
        more than `tolerance` in absolute terms.
        """
        if not isinstance(other, TabularApprox):
            return False

        # Built-in `all` actually consumes the generator (np.all does not), and
        # membership is tested on the underlying dict so nothing is inserted.
        return all(
            key in other.value_dict
            and abs(other.value_dict[key] - value) <= tolerance
            for key, value in self.value_dict.items()
        )

In [181]:
from functools import reduce

# define returns mechanism
def fetch_returns(trace, gamma=1.0):
    """Compute the discounted return for every step of a reward trace.

    For rewards r_0 .. r_{T-1} the result holds G_t = r_t + gamma * G_{t+1}
    (with G_T = 0), in the same order as the input.

    Parameters
    ----------
    trace : sequence of numbers
        Rewards in chronological order.
    gamma : float, default 1.0
        Discount factor.

    Returns
    -------
    np.ndarray
        One discounted return per reward.
    """
    discounted = []
    acc = 0
    # Walk the rewards from last to first so each return reuses the one after it.
    for reward in np.flip(trace):
        acc = gamma * acc + reward
        discounted.append(acc)

    # The returns were collected back-to-front; restore chronological order.
    discounted.reverse()
    return np.array(discounted)

In [182]:
# Sanity check: rewards [1, 2, 3] with gamma=0.9 should give returns [5.23, 4.7, 3.0].
fetch_returns([1,2,3], gamma=0.9)

array([5.23, 4.7 , 3.  ])

### Define custom MRP to evaluate value function

In [183]:
# Random 10-state Markov reward process; the seed is fixed so the notebook is reproducible.
np.random.seed(10)
P = np.random.rand(10, 10)
# Normalise every row to sum to 1 so P is a valid transition matrix.
P = P / P.sum(axis=1).reshape(-1, 1)
gamma = 0.5
R = np.random.rand(10)

# Infinite-horizon values satisfy the Bellman equation (I - gamma * P) V = R.
# Solve the linear system directly rather than forming the explicit inverse:
# it is cheaper and numerically better conditioned.
true_V = np.linalg.solve(np.eye(P.shape[0]) - gamma * P, R)

In [184]:
# Finite-horizon values by repeated Bellman backups: the first iterate is the
# 1-step value (just R) and each pass adds one more step of look-ahead.
true_v_list = []
tmp = R
for _ in range(10):
    true_v_list += [tmp]
    tmp = gamma * (P @ tmp) + R

# Longest horizon first, one column per iterate: row = state, and column j
# holds the value with (10 - j) rewards still to be collected.
true_V_mat = np.array(list(reversed(true_v_list))).T

In [185]:
# Finite-horizon value table: one row per state; column 0 is the 10-step value
# and the last column is the 1-step value (R itself).
true_V_mat

array([[1.01057113, 1.00963849, 1.00777319, 1.00404258, 0.9965811 ,
        0.98165582, 0.95182311, 0.89265211, 0.77981024, 0.57813643],
       [1.39150689, 1.39057424, 1.38870894, 1.38497834, 1.37751716,
        1.36259539, 1.33275284, 1.27288034, 1.14964525, 0.85393375],
       [0.54294864, 0.54201599, 0.54015069, 0.53642009, 0.52895891,
        0.51403672, 0.48420173, 0.42456753, 0.30642069, 0.06809727],
       [0.90657442, 0.90564177, 0.90377647, 0.90004588, 0.8925845 ,
        0.87766216, 0.84780184, 0.78843259, 0.6700074 , 0.46453081],
       [1.27475554, 1.27382289, 1.27195759, 1.26822701, 1.26076584,
        1.2458407 , 1.21596934, 1.15611203, 1.03758945, 0.78194912],
       [1.18643031, 1.18549766, 1.18363236, 1.17990178, 1.17244085,
        1.15751975, 1.12766665, 1.0678046 , 0.94832317, 0.71860281],
       [1.08397023, 1.08303758, 1.08117228, 1.07744168, 1.06998046,
        1.0550596 , 1.02522543, 0.96559716, 0.84361688, 0.58602198],
       [0.49850852, 0.49757587, 0.4957105

In [186]:
# Infinite-horizon values from the closed-form Bellman solution, one per state.
true_V

array([1.01150378, 1.39243954, 0.54388129, 0.90750707, 1.27568819,
       1.18736296, 1.08490288, 0.49944117, 0.83143946, 1.06625636])

In [262]:
def generate_traces(num=100000, seed=10):
    """Generate `num` traces from `generate_trace` with a reproducible random stream.

    Parameters
    ----------
    num : int, default 100000
        Number of traces to generate.
    seed : int or None, default 10
        Seed applied to NumPy's global RNG before sampling, so repeated calls
        with the same seed return identical traces. Pass None to leave the
        global RNG state untouched.

    Returns
    -------
    list
        One `generate_trace()` result per trace.
    """
    if seed is not None:
        np.random.seed(seed)
    return [generate_trace() for _ in range(num)]
    
def generate_trace(size=10):
    """Simulate one episode of the MRP described by the globals `P` and `R`.

    Parameters
    ----------
    size : int, default 10
        Number of time steps to simulate.

    Returns
    -------
    (trace_states, trace_reward) : tuple of lists
        `trace_states[i]` is the pair `(state, i)` visited at step `i`;
        `trace_reward[i]` is `R[state]` plus Gaussian noise with standard
        deviation 0.25.
    """
    num_states = len(R)
    trace_states, trace_reward = [], []
    # Start state is drawn uniformly at random.
    current_state = np.random.choice(num_states)
    for i in range(size):
        # The time step is part of the key so values can be tracked per (state, step).
        trace_states.append((current_state, i))
        trace_reward.append(R[current_state] + np.random.randn()*0.25) # add noise to reward
        current_state = np.random.choice(num_states, p=P[current_state])
    return trace_states, trace_reward

### Compare Function Approx to Real Values 

In [279]:
# Fresh table for the Monte-Carlo estimates. Re-run this cell before re-running
# the update loop, otherwise new samples are averaged into the old counts.
tab = TabularApprox()

In [280]:
# Monte-Carlo prediction: every (state, time-step) key is averaged towards the
# discounted return actually observed from that step onwards.
for trace_states, trace_rewards in generate_traces():
    mc_reward = fetch_returns(trace_rewards, gamma=gamma)
    # `update` folds in (key, target) pairs one at a time, in order, so the
    # whole episode can be handed over in a single call.
    tab.update(zip(trace_states, mc_reward))

In [281]:
# Report every (state, time-step) key whose MC estimate is not within atol=0.01
# of the finite-horizon truth; no output means every estimate passed.
for k in tab.value_dict:
    v = tab.value_dict[k]
    if np.isclose(true_V_mat[k], v, atol=0.01):
        continue
    print(f"{k} is very different")

#### ^ As we can see above ^, the values are close to the truth within 0.01 error

## Problem 2: Temporal Difference with Tabular Function Approx.

In [319]:
# Fresh table for the temporal-difference estimates. Re-run this cell before
# re-running the update loop, otherwise new samples are averaged into the old counts.
tab2 = TabularApprox()

In [320]:
# TD(0) prediction: each step is averaged towards the one-step bootstrapped
# target r + gamma * V(next key), using the table's current estimate.
for trace_states, trace_rewards in generate_traces(100000):
    for i in range(10):
        s, r = trace_states[i], trace_rewards[i]

        if i + 1 < 10:
            v_prime = tab2.evaluate([trace_states[i+1]])[0]
        else:
            # Last step of the episode: nothing follows, so there is no bootstrap term.
            v_prime = 0.0

        tab2.update([[s, r + gamma*v_prime]])

In [321]:
# Report every (state, time-step) key whose TD estimate is not within atol=0.1
# of the finite-horizon truth; no output means every estimate passed.
for k, v in tab2.value_dict.items():
    if np.isclose(true_V_mat[k], v, atol=0.1):
        continue
    print(f"{k} is very different")

#### ^ As we can see above ^, the TD estimates are close to the truth within 0.1 error (the tolerance used in the check)