In [2]:
import sys
sys.path.append("../")
sys.path.append("/Users/abhinavrangarajan/opt/anaconda3/envs/SBALoan/lib/python3.7/site-packages")

In [3]:
import numpy as np
from rl.function_approx import FunctionApprox 
from dataclasses import dataclass
from typing import TypeVar, Iterable, Tuple, Optional
from __future__ import annotations
from collections import defaultdict

X = TypeVar('X')


## Redined Tabular Function Approx from Assignment 11

In [4]:

class TabularApprox(FunctionApprox):
    def __init__(self):
        self.count_dict = defaultdict(int)
        self.value_dict = defaultdict(int)
    
    def evaluate(self, x_values_seq: Iterable[X]) -> np.ndarray:
#         for x in x_values_seq:
#             print(x)
        return np.array([self.value_dict[x] for x in x_values_seq])
    
    def representational_gradient(self, x_value: X) -> TabularApprox[X]:
        pass
    
    def solve(
        self,
        xy_vals_seq: Iterable[Tuple[X, float]],
        error_tolerance: Optional[float] = None
    ) -> TabularApprox[X]:
        tmp = TabularApprox()
        tmp.update(xy_vals_seq=xy_vals_seq)
        return tmp
    
    def update(
        self,
        xy_vals_seq: Iterable[Tuple[X, float]]
    ) -> TabularApprox:
        for (x,y) in xy_vals_seq:
            self.count_dict[x] += 1
            self.value_dict[x] += (1 / self.count_dict[x]) * (y - self.value_dict[x])
        return self
    
    def within(self, other: FunctionApprox[X], tolerance: float) -> bool:
        if isinstance(other, TabularApprox):
            return np.all(
                (k in other) and (other.value_dict[k] == self.value_dict[k])
                for k in self.value_dict.keys()
            )

        return False


In [80]:
np.random.seed(10)
P = np.random.rand(10, 10)
P = P / P.sum(axis=1).reshape(-1, 1)
gamma = 0.5
R = np.random.rand(10)

true_V = np.linalg.inv(np.eye(10) - gamma*P) @ R


In [13]:
def generate_traces(num=100000):
    np.random.seed(10)
    return [generate_trace() for _ in range(num)]
    
def generate_trace(size=10):
    trace_states, trace_reward = [], []
    current_state = np.random.choice(10)
    for i in range(10):
        trace_states.append((current_state, i))
        trace_reward.append(R[current_state] + np.random.randn()*0.25) # add noise to reward
        current_state = np.random.choice(10, p=P[current_state])
    return trace_states, trace_reward


In [33]:
true_v_list = []
tmp = R
for _ in range(10):
    true_v_list.append(tmp)
    tmp = R + gamma * (P @ tmp)

true_V_mat = np.array(true_v_list[::-1]).T


In [67]:
true_V_mat


array([[1.01057113, 1.00963849, 1.00777319, 1.00404258, 0.9965811 ,
        0.98165582, 0.95182311, 0.89265211, 0.77981024, 0.57813643],
       [1.39150689, 1.39057424, 1.38870894, 1.38497834, 1.37751716,
        1.36259539, 1.33275284, 1.27288034, 1.14964525, 0.85393375],
       [0.54294864, 0.54201599, 0.54015069, 0.53642009, 0.52895891,
        0.51403672, 0.48420173, 0.42456753, 0.30642069, 0.06809727],
       [0.90657442, 0.90564177, 0.90377647, 0.90004588, 0.8925845 ,
        0.87766216, 0.84780184, 0.78843259, 0.6700074 , 0.46453081],
       [1.27475554, 1.27382289, 1.27195759, 1.26822701, 1.26076584,
        1.2458407 , 1.21596934, 1.15611203, 1.03758945, 0.78194912],
       [1.18643031, 1.18549766, 1.18363236, 1.17990178, 1.17244085,
        1.15751975, 1.12766665, 1.0678046 , 0.94832317, 0.71860281],
       [1.08397023, 1.08303758, 1.08117228, 1.07744168, 1.06998046,
        1.0550596 , 1.02522543, 0.96559716, 0.84361688, 0.58602198],
       [0.49850852, 0.49757587, 0.4957105

## Problem 1 (n=4)

In [54]:
tab2 = TabularApprox()
n=4

In [55]:
for trace_states, trace_rewards in generate_traces(100000):
    for i in range(10):
        s = trace_states[i]
        r = trace_rewards[i]
        
        edge = min(10, i+n)
        tmp_list = trace_rewards[i:edge] + [tab2.evaluate([trace_states[edge]])[0] if edge < 10 else 0.0]

        tmp = 0.0
        for el in reversed(tmp_list):
            tmp = gamma*tmp + el
        
        tab2.update([[s, tmp]])

In [56]:
for k, v in tab2.value_dict.items():
    if not np.isclose(true_V_mat[k], v, atol=0.01):
        print(f"{k} is very different: Truth={true_V_mat[k]}, Pred={v}")

## Problem 2 : Implementing TD($\lambda$)

In [68]:
def generate_traces_continuous(num=100000):
    np.random.seed(10)
    return [generate_trace_continuous() for _ in range(num)]
    
def generate_trace_continuous(size=10):
    trace_states, trace_reward = [], []
    current_state = np.random.choice(10)
    for i in range(10):
        trace_states.append(current_state)
        trace_reward.append(R[current_state] + np.random.randn()*0.25) # add noise to reward
        current_state = np.random.choice(10, p=P[current_state])
    return trace_states, trace_reward


In [87]:
tab3 = TabularApprox()
lam = 0.5


In [88]:
for trace_states, trace_rewards in generate_traces_continuous(100000):
    eligibility_trace = np.zeros(10)
    for i in range(9):
        s = trace_states[i]
        r = trace_rewards[i]
        
        eligibility_trace = lam*gamma*eligibility_trace + np.eye(10)[s]
        
        val_s = tab3.evaluate([s])[0]
        val_s_prime = tab3.evaluate([trace_states[i+1]])[0]
        delta = (r + gamma*val_s_prime - val_s)
        
        tab3.update([[s, val_s + delta*eligibility_trace[s]]])
        

In [93]:
[tab3.value_dict[k] for k in range(10)]

[1.0110680002984294,
 1.3908530921469693,
 0.5425206593496347,
 0.9078630076507419,
 1.275313499560883,
 1.1862447013462072,
 1.0831510529133261,
 0.49908588477255533,
 0.8279592698900636,
 1.0643666637421452]

In [95]:
list(true_V)

[1.0115037846645087,
 1.3924395401123104,
 0.5438812857303887,
 0.9075070708573852,
 1.2756881874566006,
 1.187362961060823,
 1.0849028838003623,
 0.49944116883669903,
 0.8314394588908189,
 1.0662563611339073]

In [96]:
np.array([tab3.value_dict[k] for k in range(10)]) - true_V

array([-0.00043578, -0.00158645, -0.00136063,  0.00035594, -0.00037469,
       -0.00111826, -0.00175183, -0.00035528, -0.00348019, -0.0018897 ])