In [8]:
import tinygrad.nn
import numpy as np
from tinygrad import nn , Tensor



def layer_init(layer: nn.Linear, std=np.sqrt(2), bias_const=0.0):
    """CleanRL's default layer initialization"""
    layer.weight = tiny_orthogonal_(layer.weight, std)
    layer.bias = tiny_constant_(layer.bias, bias_const)
    return layer

from tinygrad import nn 
def tiny_orthogonal_(tensor: Tensor, gain=1, generator=None):
    """
    NOTE: Since initialization occurs only once, we are being lazy and using numpy linear algebra to perform certain operations.
    """
    if tensor.ndim < 2:
        raise ValueError("Only tensors with 2 or more dimensions are supported")

    if tensor.numel() == 0:
        return tensor # no-op for empty tensors

    rows, cols = tensor.shape[0], tensor.numel() // tensor.shape[0]
    flattened = Tensor.randn(rows, cols) # figure out if it has the same device configs as the input tensor

    if rows < cols:
        flattened = flattened.transpose()

    # for now, we use numpy to compute the qr factorization
    q, r = np.linalg.qr(flattened.numpy())

    d = np.diag(r, 0)
    ph = np.sign(d)
    q *= ph

    if rows < cols:
        q.transpose()

    return Tensor(q).mul(gain)

def tiny_constant_(tensor: Tensor, val: float):
    """
    """
    return Tensor.ones(tensor.shape) * val
    

from tinygrad import nn 


class TinyPolicy:
    def __init__(self, policy):
        self.policy = policy
        self.is_continuous = hasattr(policy, 'is_continuous') and policy.is_continuous
    
    def __call__(self, x, action=None):
        return self.get_action_and_value(x, action)

    def get_value(self, x, state=None):
        _, value = self.policy(x)
        return value

    def get_action_and_value(self, x, action=None):
        logits, value = self.policy(x)
        action, logprob, entropy = sample_logits(logits, action, self.is_continuous)
        return action, logprob, entropy, value
    
class Critic:
    def __init__(self, obs_size, hidden_size):
        self.l1 = layer_init(tinygrad.nn.Linear(obs_size, hidden_size))
        self.l2 = layer_init(tinygrad.nn.Linear(hidden_size, hidden_size))
        self.l3 = layer_init(tinygrad.nn.Linear(hidden_size, 1))

    def __call__(self, x: Tensor):
        x = self.l1(x).tanh()
        x = self.l2(x).tanh()
        return self.l3(x)

class ActorEncoder:
    def __init__(self, obs_size, hidden_size):
        self.l1 = layer_init(tinygrad.nn.Linear(obs_size, hidden_size))
        self.l2 = layer_init(tinygrad.nn.Linear(hidden_size, hidden_size))

    def __call__(self, x: Tensor):
        x = self.l1(x).tanh()
        return self.l2(x).tanh()

class TinyCleanRLPolicy(TinyPolicy):
    def __init__(self, envs, hidden_size=64):
        super().__init__(policy=None)  # Just to get the right init
        self.is_continuous = True

        # self.obs_size = np.array(envs.single_observation_space.shape).prod()
        # action_size = np.prod(envs.single_action_space.shape)

        ## figuring out how to normalize observations will be an important step, but leaving it out for now
        action_size = 1
        self.obs_size = 1
        self.critic = Critic(self.obs_size, hidden_size)
        self.actor_encoder = ActorEncoder(self.obs_size, hidden_size)
        self.actor_decoder_mean = layer_init(tinygrad.nn.Linear(hidden_size, action_size), std=0.01)
        self.actor_decoder_logstd = Tensor.zeros(1, action_size)



policy = TinyCleanRLPolicy("hypothetical env", hidden_size=64)



In [9]:
nn.state.get_parameters(policy)

[<Tensor <UOp METAL (64, 1) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64,) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64, 64) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64,) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64, 1) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (1,) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64, 1) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64,) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64, 64) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64,) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (64, 1) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp METAL (1,) float (<Ops.MUL: 49>, None)> on METAL with grad None>,
 <Tensor <UOp ME

In [8]:
# Policy loss
import torch
# Create dummy tensors with same shape
mb_advantages = torch.randn(10) # batch size of 10 for example
ratio = torch.ones(10) + torch.randn(10) * 0.5 # Increased variance to 0.5 to make ratios differ more from 1.0

# Fixed clip coefficient to better demonstrate clipping effects
args = type('Args', (), {'clip_coef': 0.2})() # Fixed clip coef for clearer demonstration
print(f"Using clip coefficient: {args.clip_coef}")

pg_loss1 = -mb_advantages * ratio
pg_loss2 = -mb_advantages * ratio.clamp(1 - args.clip_coef, 1 + args.clip_coef)
pg_loss_elementwise = torch.max(pg_loss1, pg_loss2)
print(f"Ratios before clipping: {ratio}")
print(f"Ratios after clipping: {ratio.clamp(1 - args.clip_coef, 1 + args.clip_coef)}")
print(f"pg_loss1: {pg_loss1}, pg_loss2: {pg_loss2}")
print(f"pg_loss_elementwise: {pg_loss_elementwise}")
pg_loss = pg_loss_elementwise.mean()
print(f"pg_loss: {pg_loss}")



Using clip coefficient: 0.2
Ratios before clipping: tensor([0.7646, 1.4082, 1.5149, 0.9010, 0.5702, 0.5831, 1.8986, 1.6615, 1.0109,
        1.0097])
Ratios after clipping: tensor([0.8000, 1.2000, 1.2000, 0.9010, 0.8000, 0.8000, 1.2000, 1.2000, 1.0109,
        1.0097])
pg_loss1: tensor([ 0.5812,  2.2191, -0.3515,  0.2989,  0.3137,  0.2744,  1.3776,  1.1436,
        -0.6129, -0.1148]), pg_loss2: tensor([ 0.6081,  1.8910, -0.2784,  0.2989,  0.4402,  0.3765,  0.8707,  0.8260,
        -0.6129, -0.1148])
pg_loss_elementwise: tensor([ 0.6081,  2.2191, -0.2784,  0.2989,  0.4402,  0.3765,  1.3776,  1.1436,
        -0.6129, -0.1148])
pg_loss: 0.545782208442688


In [12]:
from tinygrad import Tensor

a = Tensor.randn(10)
b = Tensor.randn(10)

print(f"a: {a.numpy()}" )
print(f"b: {b.numpy()}")

print(f"a.max(b): {a.maximum(b).numpy()}")



a: [ 1.2344875   0.4698439   1.2965546  -0.67969835 -1.4338986  -0.6644178
 -0.11526925 -1.2671893   2.7294126  -2.4666617 ]
b: [-1.2414773  -0.2149791  -1.0404463  -1.7843857   0.95054054  0.7913215
  2.2953053   0.7872812  -0.81729704 -0.12775072]
a.max(b): [ 1.2344875   0.4698439   1.2965546  -0.67969835  0.95054054  0.7913215
  2.2953053   0.7872812   2.7294126  -0.12775072]


In [14]:
from tinygrad import nn

layer = nn.Linear(16, 64)