# In [8]:
import tinygrad.nn
import numpy as np
from tinygrad import nn , Tensor



def layer_init(layer: nn.Linear, std=np.sqrt(2), bias_const=0.0):
    """CleanRL's default layer initialization.

    Replaces `layer.weight` with an orthogonal tensor scaled by `std` and, when
    the layer has a bias, replaces `layer.bias` with a tensor filled with
    `bias_const`.

    Args:
        layer: Linear layer whose parameters are re-initialized.
        std: Gain applied to the orthogonal weight matrix.
        bias_const: Value every bias element is set to.

    Returns:
        The same `layer` object, so the call can wrap a constructor expression.
    """
    layer.weight = tiny_orthogonal_(layer.weight, std)
    # A layer built with bias=False carries bias=None; there is nothing to fill.
    if layer.bias is not None:
        layer.bias = tiny_constant_(layer.bias, bias_const)
    return layer

from tinygrad import nn 
def tiny_orthogonal_(tensor: Tensor, gain=1, generator=None):
    """
    Build a (semi-)orthogonal tensor with the same shape as `tensor`, scaled by `gain`.

    The input is viewed as a 2-D matrix of shape (shape[0], numel // shape[0]).
    A random normal matrix of that shape is QR-factorized; the Q factor, with
    each column multiplied by the sign of the matching diagonal entry of R, is
    returned as a new tensor. `tensor` itself is not modified: only its shape
    is read.

    NOTE: Since initialization occurs only once, we are being lazy and using
    numpy linear algebra to perform certain operations.

    Args:
        tensor: Template tensor with at least 2 dimensions.
        gain: Scalar multiplier applied to the orthogonal matrix.
        generator: Unused by this implementation.

    Returns:
        A new Tensor of shape `tensor.shape`, or `tensor` itself when it has
        no elements.

    Raises:
        ValueError: If `tensor` has fewer than 2 dimensions.
    """
    if tensor.ndim < 2:
        raise ValueError("Only tensors with 2 or more dimensions are supported")

    if tensor.numel() == 0:
        return tensor  # no-op for empty tensors

    rows, cols = tensor.shape[0], tensor.numel() // tensor.shape[0]
    # TODO: figure out if this has the same device configs as the input tensor
    flattened = Tensor.randn(rows, cols)

    # QR needs a tall (or square) matrix to yield a full set of orthonormal
    # columns, so wide matrices are factorized in transposed form.
    wide = rows < cols
    if wide:
        flattened = flattened.transpose()

    # for now, we use numpy to compute the qr factorization
    q, r = np.linalg.qr(flattened.numpy())

    # Multiply each column of Q by the sign of R's diagonal so the
    # factorization (and therefore Q) is unique.
    q = q * np.sign(np.diag(r, 0))

    if wide:
        # Undo the earlier transpose. ndarray.transpose() returns a new view,
        # so the result has to be kept, otherwise the shape stays (cols, rows).
        q = q.T

    # Restore the caller's original shape (matters for inputs with >2 dims).
    return Tensor(np.ascontiguousarray(q)).reshape(tensor.shape).mul(gain)

def tiny_constant_(tensor: Tensor, val: float):
    """
    Return a new tensor shaped like `tensor` with every element equal to `val`.

    The input tensor is not modified; only its shape is read.
    """
    filled = Tensor.ones(tensor.shape)
    return filled * val
    

from tinygrad import nn 


class TinyPolicy:
    """Thin wrapper that turns a `(logits, value)` callable into an action sampler.

    The wrapped `policy` is called with an observation and must return a
    2-tuple whose second element is the value estimate.
    """

    def __init__(self, policy):
        self.policy = policy
        # Falls back to False when the wrapped object has no such attribute.
        self.is_continuous = getattr(policy, 'is_continuous', False)

    def __call__(self, x, action=None):
        """Alias for `get_action_and_value`."""
        return self.get_action_and_value(x, action=action)

    def get_value(self, x, state=None):
        """Return only the value output of the wrapped policy; `state` is unused."""
        _logits, critic_out = self.policy(x)
        return critic_out

    def get_action_and_value(self, x, action=None):
        """Run the policy and return `(action, logprob, entropy, value)`.

        The logits are handed to `sample_logits` together with `action` and
        `self.is_continuous`.
        """
        logits, value = self.policy(x)
        sampled = sample_logits(logits, action, self.is_continuous)
        action, logprob, entropy = sampled
        return action, logprob, entropy, value
    
class Critic:
    """Three-layer MLP with tanh activations on the first two layers and one output unit."""

    def __init__(self, obs_size, hidden_size):
        widths = (obs_size, hidden_size, hidden_size, 1)
        # Layers are built and initialized one after another, in order l1, l2, l3.
        self.l1, self.l2, self.l3 = (
            layer_init(nn.Linear(n_in, n_out))
            for n_in, n_out in zip(widths, widths[1:])
        )

    def __call__(self, x: Tensor):
        """Apply l1 -> tanh -> l2 -> tanh -> l3 to `x`."""
        hidden = self.l2(self.l1(x).tanh()).tanh()
        return self.l3(hidden)

class ActorEncoder:
    """Two-layer MLP with a tanh activation after each layer."""

    def __init__(self, obs_size, hidden_size):
        # Each layer is created and initialized before the next one is built.
        first = layer_init(nn.Linear(obs_size, hidden_size))
        second = layer_init(nn.Linear(hidden_size, hidden_size))
        self.l1, self.l2 = first, second

    def __call__(self, x: Tensor):
        """Apply l1 -> tanh -> l2 -> tanh to `x`."""
        hidden = self.l1(x).tanh()
        hidden = self.l2(hidden).tanh()
        return hidden

class TinyCleanRLPolicy(TinyPolicy):
    """Continuous-action policy holding a critic, an actor encoder and a mean/log-std head.

    NOTE(review): the base class is initialized with `policy=None`, so the
    inherited methods that call `self.policy(x)` cannot work until a forward
    pass is wired in. Confirm how this is meant to be completed.
    """

    def __init__(self, envs, hidden_size=64):
        # The base initializer is run only for its attribute setup.
        super().__init__(policy=None)
        self.is_continuous = True

        # TODO: derive the sizes from `envs` (single_observation_space /
        # single_action_space shapes). Observation normalization is also
        # left out for now. `envs` is currently unused.
        obs_size, action_size = 1, 1

        self.obs_size = obs_size
        self.critic = Critic(obs_size, hidden_size)
        self.actor_encoder = ActorEncoder(obs_size, hidden_size)

        mean_head = nn.Linear(hidden_size, action_size)
        self.actor_decoder_mean = layer_init(mean_head, std=0.01)
        self.actor_decoder_logstd = Tensor.zeros(1, action_size)



# The string is only a stand-in for an environment object.
policy = TinyCleanRLPolicy(envs="hypothetical env", hidden_size=64)



# In [19]:
from torch.distributions import Normal
import torch
# Batch of two univariate normals: means 1.0 and 2.0, standard deviation 1.0 each.
a = Normal(loc=torch.tensor([1.0, 2.0]), scale=torch.tensor([1.0, 1.0]))

