In [49]:
import torch
import time

from torchrl.envs import GymEnv, StepCounter, TransformedEnv
from tensordict.nn import TensorDictModule as Mod, TensorDictSequential as Seq
from torchrl.modules import EGreedyModule, MLP, QValueModule
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyTensorStorage, ReplayBuffer
from torch.optim import Adam
from torchrl.objectives import DQNLoss, SoftUpdate
from torchrl._utils import logger as torchrl_logger
from torchrl.record import CSVLogger, VideoRecorder
from torchrl.modules import QValueActor
from torchrl.data import CompositeSpec

torch.manual_seed(0)


<torch._C.Generator at 0x7f48e401c030>

In [50]:
# Define the environment
env = TransformedEnv(GymEnv("CartPole-v1"), StepCounter())
env.set_seed(0)



795726461

In [51]:
class MICOMLPNetwork(torch.nn.Module):
    def __init__(self,
                 in_features,
                 activation_class, 
                 encoder_out_features,
                 mlp_out_features,
                 encoder_num_cells = None,
                 mlp_num_cells = None):
        super(MICOMLPNetwork, self).__init__()

        self.activation = activation_class()

        if encoder_num_cells is None:
            encoder_num_cells = []
        layers_sizes = [in_features] + encoder_num_cells + [encoder_out_features]

        self.encoder = torch.nn.ModuleList()
        for i in range(len(layers_sizes) - 1):
            self.encoder.append(torch.nn.Linear(layers_sizes[i], layers_sizes[i+1]))

        if mlp_num_cells is None:
            mlp_num_cells = []

        layers_sizes = [encoder_out_features] + mlp_num_cells + [mlp_out_features.item()]

        self.q_net = torch.nn.ModuleList()
        for i in range(len(layers_sizes) - 1):
            self.q_net.append(torch.nn.Linear(layers_sizes[i], layers_sizes[i+1]))
        
    
    def forward(self, x):
        for i in range(len(self.encoder)):
            x = self.activation(self.encoder[i](x))

        representation = x

        for i in range(len(self.q_net)-1):
            x = self.activation(self.q_net[i](x))

        return self.q_net[-1](x), representation

In [52]:
value_mlp = MICOMLPNetwork(
    in_features=4,
    activation_class=torch.nn.ReLU,
    encoder_out_features=3,
    mlp_out_features=env.action_spec.shape[-1],
    encoder_num_cells=[64],
    mlp_num_cells=[64]
)

value_mlp

MICOMLPNetwork(
  (activation): ReLU()
  (encoder): ModuleList(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=3, bias=True)
  )
  (q_net): ModuleList(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [53]:
q_values, representation = value_mlp(torch.randn(1, 4))
print(q_values)
print(representation)

tensor([[-0.0332,  0.2120]], grad_fn=<AddmmBackward0>)
tensor([[0.4680, 0.0689, 0.4621]], grad_fn=<ReluBackward0>)


In [54]:
value_net = Mod(value_mlp, 
                in_keys=["observation"], 
                out_keys=["action_value", "representation"])
value_net

TensorDictModule(
    module=MICOMLPNetwork(
      (activation): ReLU()
      (encoder): ModuleList(
        (0): Linear(in_features=4, out_features=64, bias=True)
        (1): Linear(in_features=64, out_features=3, bias=True)
      )
      (q_net): ModuleList(
        (0): Linear(in_features=3, out_features=64, bias=True)
        (1): Linear(in_features=64, out_features=2, bias=True)
      )
    ),
    device=cpu,
    in_keys=['observation'],
    out_keys=['action_value', 'representation'])

In [55]:
# policy = Seq(value_net, 
#              QValueModule(spec=env.action_spec))
# policy

policy = QValueActor(
    module=value_net,
    spec=CompositeSpec(action= env.specs["input_spec", "full_action_spec", "action"]),
    in_keys=["observation"],
)
policy

QValueActor(
    module=ModuleList(
      (0): TensorDictModule(
          module=MICOMLPNetwork(
            (activation): ReLU()
            (encoder): ModuleList(
              (0): Linear(in_features=4, out_features=64, bias=True)
              (1): Linear(in_features=64, out_features=3, bias=True)
            )
            (q_net): ModuleList(
              (0): Linear(in_features=3, out_features=64, bias=True)
              (1): Linear(in_features=64, out_features=2, bias=True)
            )
          ),
          device=cpu,
          in_keys=['observation'],
          out_keys=['action_value', 'representation'])
      (1): QValueModule()
    ),
    device=cpu,
    in_keys=['observation'],
    out_keys=['representation', 'action', 'action_value', 'chosen_action_value'])

In [56]:
# Define the exploration step (e-greedy policy)
exploration_module = EGreedyModule(
    env.action_spec, 
    annealing_num_steps=100_000, 
    eps_init=0.1,
)
policy_explore = Seq(policy, 
                     exploration_module)
policy_explore

TensorDictSequential(
    module=ModuleList(
      (0): QValueActor(
          module=ModuleList(
            (0): TensorDictModule(
                module=MICOMLPNetwork(
                  (activation): ReLU()
                  (encoder): ModuleList(
                    (0): Linear(in_features=4, out_features=64, bias=True)
                    (1): Linear(in_features=64, out_features=3, bias=True)
                  )
                  (q_net): ModuleList(
                    (0): Linear(in_features=3, out_features=64, bias=True)
                    (1): Linear(in_features=64, out_features=2, bias=True)
                  )
                ),
                device=cpu,
                in_keys=['observation'],
                out_keys=['action_value', 'representation'])
            (1): QValueModule()
          ),
          device=cpu,
          in_keys=['observation'],
          out_keys=['representation', 'action', 'action_value', 'chosen_action_value'])
      (1): EGreedyModule()
   

In [57]:
# Define how to collect the data (experiences)
init_rand_steps = 5000 # warm-up steps
frames_per_batch = 100
optim_steps = 10
replay_capacity = 100_000

# NOTE: collector will gather rollouts continously
# If the current trajectory ends, it will start a new one
# NOTE: the rollout gotten from the collector is a dictionary
# that defines the sate and next state as a tensor with a batch dimension in the begining
# for example a rollout of 10 steps will have a tensor of observation of 10 in the batch dimension
# and the next will also have 10 which are all the tensors of the next state
# Practically, next is as you will shift the tensor of observation by one step
# collector = SyncDataCollector(
#     env,
#     policy_explore,
#     frames_per_batch=frames_per_batch,
#     total_frames=500_100,
#     init_random_frames=init_rand_steps,
# )
# rb = ReplayBuffer(storage=LazyTensorStorage(replay_capacity))

In [58]:
# Define the recording and logging
path = "./training_loop"
logger = CSVLogger(exp_name="dqn", log_dir=path, video_format="mp4")
video_recorder = VideoRecorder(logger, tag="video")
record_env = TransformedEnv(
    GymEnv("CartPole-v1", from_pixels=True, pixels_only=False), video_recorder
)



In [59]:
# collector = SyncDataCollector(
#     env,
#     policy_explore,
#     frames_per_batch=10,
#     total_frames=500_100,
#     init_random_frames=10000,
# )

collector = SyncDataCollector(
    create_env_fn=env,
    policy=policy_explore,
    frames_per_batch=10,
    total_frames=100,
    device="cpu",
    storing_device="cpu",
    max_frames_per_traj=-1
)
# NOTE: IMPORTANTISIMO en las primeras iteraciones no se usa la policy, entonces representation se configura
# a zero, por lo que el primer batch de datos no tiene representation
# Tengo que hacer el warm-up de otra manera (ojo con esto)

for data in collector:
    print(data['representation'])
    break

tensor([[0.1717, 0.1063, 0.0000],
        [0.1425, 0.0952, 0.0061],
        [0.0912, 0.0871, 0.0460],
        [0.0490, 0.0752, 0.0896],
        [0.0580, 0.0906, 0.1324],
        [0.0857, 0.0904, 0.1998],
        [0.1152, 0.0904, 0.2713],
        [0.1495, 0.0832, 0.3370],
        [0.1411, 0.0853, 0.3108],
        [0.1740, 0.0766, 0.3788]])


In [60]:
data['representation']

tensor([[0.1717, 0.1063, 0.0000],
        [0.1425, 0.0952, 0.0061],
        [0.0912, 0.0871, 0.0460],
        [0.0490, 0.0752, 0.0896],
        [0.0580, 0.0906, 0.1324],
        [0.0857, 0.0904, 0.1998],
        [0.1152, 0.0904, 0.2713],
        [0.1495, 0.0832, 0.3370],
        [0.1411, 0.0853, 0.3108],
        [0.1740, 0.0766, 0.3788]])

In [62]:
from torchrl.data import SliceSampler
from torchrl.data import TensorDictReplayBuffer

size = 100
rb = TensorDictReplayBuffer(
    storage=LazyTensorStorage(size),
    sampler=SliceSampler(traj_key=("collector","traj_ids"), slice_len=2),
    batch_size=10,
)
rb

TensorDictReplayBuffer(
    storage=LazyTensorStorage(
        data=<empty>, 
        shape=None, 
        len=0, 
        max_size=100), 
    sampler=SliceSampler(num_slices=None, slice_len=2, end_key=('next', 'done'), traj_key=('collector', 'traj_ids'), truncated_key=('next', 'truncated'), strict_length=True), 
    writer=TensorDictRoundRobinWriter(cursor=0, full_storage=False), 
    batch_size=10, 
    collate_fn=<function _collate_id at 0x7f47f936bb00>)

In [63]:
data['collector','traj_ids']

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [64]:
rb.extend(data)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [65]:
sample = rb.sample(10)

In [66]:
sample

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        chosen_action_value: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([10]),
            device=cpu,
            is_shared=False),
        done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        index: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([10

In [67]:
sample['step_count']

tensor([[7],
        [8],
        [0],
        [1],
        [7],
        [8],
        [0],
        [1],
        [5],
        [6]])

In [70]:
sample["observation"]

tensor([[ 0.1190,  1.4084, -0.1088, -2.0614],
        [ 0.1472,  1.2145, -0.1500, -1.8043],
        [ 0.0313,  0.0413,  0.0107,  0.0229],
        [ 0.0322,  0.2362,  0.0111, -0.2663],
        [ 0.1190,  1.4084, -0.1088, -2.0614],
        [ 0.1472,  1.2145, -0.1500, -1.8043],
        [ 0.0313,  0.0413,  0.0107,  0.0229],
        [ 0.0322,  0.2362,  0.0111, -0.2663],
        [ 0.0745,  1.0168, -0.0451, -1.4403],
        [ 0.0948,  1.2125, -0.0739, -1.7467]])

In [72]:
sample["next", "observation"]

tensor([[ 0.1472,  1.2145, -0.1500, -1.8043],
        [ 0.1715,  1.4110, -0.1861, -2.1396],
        [ 0.0322,  0.2362,  0.0111, -0.2663],
        [ 0.0369,  0.4312,  0.0058, -0.5555],
        [ 0.1472,  1.2145, -0.1500, -1.8043],
        [ 0.1715,  1.4110, -0.1861, -2.1396],
        [ 0.0322,  0.2362,  0.0111, -0.2663],
        [ 0.0369,  0.4312,  0.0058, -0.5555],
        [ 0.0948,  1.2125, -0.0739, -1.7467],
        [ 0.1190,  1.4084, -0.1088, -2.0614]])

In [68]:
sample['representation']

tensor([[0.1495, 0.0832, 0.3370],
        [0.1411, 0.0853, 0.3108],
        [0.1717, 0.1063, 0.0000],
        [0.1425, 0.0952, 0.0061],
        [0.1495, 0.0832, 0.3370],
        [0.1411, 0.0853, 0.3108],
        [0.1717, 0.1063, 0.0000],
        [0.1425, 0.0952, 0.0061],
        [0.0857, 0.0904, 0.1998],
        [0.1152, 0.0904, 0.2713]])

In [74]:
sample

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        chosen_action_value: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([10]),
            device=cpu,
            is_shared=False),
        done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        index: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([10

In [79]:
sample['representation']

first_states = sample[0::2] # even rows
second_states = sample[1::2] # odd rows (or next states)

print(first_states)
print(second_states)

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([5, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([5, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        chosen_action_value: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([5]),
            device=cpu,
            is_shared=False),
        done: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        index: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.int64, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([5, 4]), de

In [80]:
first_states['representation']

tensor([[0.1495, 0.0832, 0.3370],
        [0.1717, 0.1063, 0.0000],
        [0.1495, 0.0832, 0.3370],
        [0.1717, 0.1063, 0.0000],
        [0.0857, 0.0904, 0.1998]])

In [81]:
second_states['representation']

tensor([[0.1411, 0.0853, 0.3108],
        [0.1425, 0.0952, 0.0061],
        [0.1411, 0.0853, 0.3108],
        [0.1425, 0.0952, 0.0061],
        [0.1152, 0.0904, 0.2713]])

In [83]:
# NOTE: Checar la distancia euclidean entre la representacion target y la representacion con
# la politica actual

# NOTE: en el repositorio de MICO, la distancia target es calculada con
# la target network (que es una copia de la politica actual) osea mis representaciones guardadas
# la distancia online por otra parte es calculada con una representacion con red actual
# y una representacion target


collector.policy(first_states['observation'])

(tensor([[0.1495, 0.0832, 0.3370],
         [0.1717, 0.1063, 0.0000],
         [0.1495, 0.0832, 0.3370],
         [0.1717, 0.1063, 0.0000],
         [0.0857, 0.0904, 0.1998]]),
 tensor([[0.0215, 0.2401],
         [0.0836, 0.2269],
         [0.0215, 0.2401],
         [0.0836, 0.2269],
         [0.0543, 0.2416]]),
 tensor([[0.2401],
         [0.2269],
         [0.2401],
         [0.2269],
         [0.2416]]),
 tensor([[0, 1],
         [0, 1],
         [0, 1],
         [0, 1],
         [0, 1]]))

In [None]:
import torch

# Create a sample tensor of shape [10, 3]
tensor = torch.randn(10, 3)

# Separate odd and even rows
even_rows = tensor[0::2]
odd_rows = tensor[1::2]

print("Even rows:\n", even_rows)
print("Odd rows:\n", odd_rows)


In [425]:
loss = DQNLoss(value_network=policy, 
               action_space=env.action_spec, 
               delay_value=True) # delay_value=True means we will use a target network
optim = Adam(loss.parameters(), lr=0.02)

# eps: will be used to update the target network as 
# \theta_t = \theta_{t-1} * \epsilon + \theta_t * (1-\epsilon)
# where eps = 1 is hard update
updater = SoftUpdate(loss, eps=0.99)

In [80]:
from tensordict import TensorDict
from torchrl.data import SliceSampler
from torchrl.data import LazyMemmapStorage

rb = TensorDictReplayBuffer(
    storage=LazyMemmapStorage(size),
    sampler=SliceSampler(traj_key="episode", num_slices=4),
    batch_size=8,
)
episode = torch.zeros(10, dtype=torch.int)
episode[:3] = 1
episode[3:5] = 2
episode[5:7] = 3
episode[7:] = 4
steps = torch.cat([torch.arange(3), torch.arange(2), torch.arange(2), torch.arange(3)])
obs = torch.randn((3, 4, 5)).expand(10, 3, 4, 5)
data = TensorDict(
    {
        "episode": episode,
        "obs": obs,
        "act": torch.randn((20,)).expand(10, 20),
        "other": torch.randn((20, 50)).expand(10, 20, 50),
        "steps": steps,
    },
    [10],
)
rb.extend(data)
sample = rb.sample()
print("episode are grouped", sample["episode"])
print("steps are successive", sample["steps"])

episode are grouped tensor([3, 3, 4, 4, 2, 2, 1, 1], dtype=torch.int32)
steps are successive tensor([0, 1, 0, 1, 0, 1, 0, 1])


In [82]:
episode

tensor([1, 1, 1, 2, 2, 3, 3, 4, 4, 4], dtype=torch.int32)

In [81]:
rb

TensorDictReplayBuffer(
    storage=LazyMemmapStorage(
        data=TensorDict(
            fields={
                act: MemoryMappedTensor(shape=torch.Size([10, 20]), device=cpu, dtype=torch.float32, is_shared=False),
                episode: MemoryMappedTensor(shape=torch.Size([10]), device=cpu, dtype=torch.int32, is_shared=False),
                index: MemoryMappedTensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False),
                obs: MemoryMappedTensor(shape=torch.Size([10, 3, 4, 5]), device=cpu, dtype=torch.float32, is_shared=False),
                other: MemoryMappedTensor(shape=torch.Size([10, 20, 50]), device=cpu, dtype=torch.float32, is_shared=False),
                steps: MemoryMappedTensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([10]),
            device=cpu,
            is_shared=False), 
        shape=torch.Size([10]), 
        len=10, 
        max_size=100), 
    sampler=S

In [234]:
import datetime

current_date = datetime.datetime.now()
date_str = current_date.strftime("%Y_%m_%d-%H_%M_%S")  # Includes date and time
date_str

'2024_07_23-17_34_50'

In [235]:
total_count = 0
total_episodes = 0
t0 = time.time()
for i, data in enumerate(collector):
    # Write data in replay buffer
    rb.extend(data)
    max_length = rb[:]["next", "step_count"].max() # From all the next steps get the max step count
    if len(rb) > init_rand_steps: # wam-up steps
        # Optim loop (we do several optim steps
        # per batch collected for efficiency)
        for _ in range(optim_steps):
            sample = rb.sample(128) # sample a batch of 128 (repetition is allowed)
            # print(sample)
            break
            loss_vals = loss(sample)
            loss_vals["loss"].backward()
            optim.step()
            optim.zero_grad()
            # Update exploration factor
            # NOTE: Why I am updating the exploration factor here? 
            # I'm considering practically that I did 100 (or n) iteractions in the environment time optim_steps
            exploration_module.step(data.numel()) # data.numel() returns the number of elements in the data
            # Update target params each optimisation step
            updater.step()
            if i % 10:
                torchrl_logger.info(f"Max num steps: {max_length}, rb length {len(rb)}")
            total_count += data.numel()
            total_episodes += data["next", "done"].sum() # sum the number of done episodes
    
    if max_length > 200:
        break

t1 = time.time()

torchrl_logger.info(
    f"solved after {total_count} steps, {total_episodes} episodes and in {t1-t0}s."
)

2024-07-23 17:34:53,433 [torchrl][INFO] solved after 0 steps, 0 episodes and in 2.57519268989563s.


In [242]:
sample[0]

TensorDict(
    fields={
        _weight: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
        action: Tensor(shape=torch.Size([2]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([2]), device=cpu, dtype=torch.float32, is_shared=False),
        chosen_action_value: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([]),
            device=cpu,
            is_shared=False),
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        index: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.int64, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_sha

In [236]:
sample

TensorDict(
    fields={
        _weight: Tensor(shape=torch.Size([128]), device=cpu, dtype=torch.float32, is_shared=False),
        action: Tensor(shape=torch.Size([128, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([128, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        chosen_action_value: Tensor(shape=torch.Size([128, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([128]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([128]),
            device=cpu,
            is_shared=False),
        done: Tensor(shape=torch.Size([128, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        index: Tensor(shape=torch.Size([128]), device=cpu, dtype=torch.int64, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([128, 1]),

In [8]:
record_env.rollout(max_steps=1000, policy=policy)
video_recorder.dump()

In [5]:
import random

# Generate and print 10 random seeds
random_seeds = [random.randint(0, 1000000) for _ in range(10)]
print(random_seeds)

[118398, 676190, 786456, 171936, 887739, 919409, 711872, 442081, 189061, 117840]
