# mDPP

In [1]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.append('../../')

from omegaconf import DictConfig
import os

import torch
from torch.utils.data import DataLoader
import lightning as L

from rl4co.envs import DPPEnv, MDPPEnv
from rl4co.data.dataset import tensordict_collate_fn, TensorDictDataset
from rl4co.models import AttentionModel, AttentionModelPolicy
from rl4co.tasks.rl4co import RL4COLitModule
from rl4co.models.rl.reinforce.baselines import CriticBaseline, RolloutBaseline, WarmupBaseline, ExponentialBaseline
from rl4co.models.rl.reinforce.critic import CriticNetwork

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

  warn(


## mDPP Environment

We declare the environment here. This will automatically download the data 

In [2]:

data_dir = "../../data/"
data_file = "mdpp/mdpp10_test_seed1234.npz" # remember to generate data first


# In the mDPP problem we train directly on the test data as we want to search for best config
# of chip placement

env = MDPPEnv(reward_type="minmax", max_decaps=20, data_dir=data_dir,val_file=data_file, test_file=data_file)

In [3]:
bs = 10


td = env.load_data(os.path.join(data_dir, data_file))
td = env.reset(td)

td_init = td.clone()

actions = []
probes = td['probe'].clone()
keepouts = td['action_mask'].clone() # this includes also the probes + decaps placed later

def random_policy(td):
    """Helper function to select a random action from available actions"""
    action = torch.multinomial(td["action_mask"].float(), 1).squeeze(-1)
    td.set("action", action)
    return td

for i in range(20):
    # pick random action from action_mask
    td = random_policy(td)
    actions.append(td['action'])
    td = env.step(td)['next']

actions_ = torch.stack(actions, dim=1)

# env.render(td[0], actions_[0])

# for i in range(3):
#     env.render(td[i], actions_[i])

## Attention Model

In [4]:
td = env.generate_data([100])
td


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


TensorDict(
    fields={
        action_mask: Tensor(shape=torch.Size([100, 100]), device=cpu, dtype=torch.bool, is_shared=False),
        locs: Tensor(shape=torch.Size([100, 100, 2]), device=cpu, dtype=torch.float32, is_shared=False),
        probe: Tensor(shape=torch.Size([100, 100]), device=cpu, dtype=torch.bool, is_shared=False)},
    batch_size=torch.Size([100]),
    device=None,
    is_shared=False)

In [5]:
# Load environment with test data

dataset = env.dataset(phase="test")

dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=False,  # no need to shuffle, we're resampling every epoch
    num_workers=0,
    collate_fn=tensordict_collate_fn,
)

policy = AttentionModelPolicy(
    env,
)

# model = torch.compile(model)

td = next(iter(dataloader))
td = env.reset(td)

out = policy(td, decode_type="greedy")

print(out)

{'reward': tensor([ 8.0394, 11.8452,  8.5976,  8.9174,  7.8083,  8.4916,  7.5874,  7.8751,
         7.9642,  6.9693, 11.1522,  7.6550, 10.5191,  7.6610,  7.7520,  8.4954,
         8.4003,  3.3802, 10.1701,  7.9172,  7.7835, 10.4013, 10.3987,  7.5925,
         8.2038, 11.6576,  8.8661,  8.0981,  7.7675,  7.4283, 10.0167,  7.5309,
        10.6784,  7.2331,  9.5805,  7.4523,  7.3269,  8.3388,  7.4043,  8.9661,
        10.8520,  7.6057,  8.8645,  7.5868,  8.7847,  9.3867,  8.6568,  7.4560,
         7.7136,  8.7763,  7.3308,  8.2870,  8.5062,  8.8314,  7.9809,  9.0053,
         7.9397,  8.5664,  7.0986,  9.0066,  7.9342,  8.3744,  7.6602,  9.5707]), 'log_likelihood': tensor([-81.3108, -43.8375, -71.3171, -75.7397, -74.5487, -72.9600, -69.3234,
        -59.2570, -63.3589, -68.9168, -56.5742, -69.1350, -67.9623, -66.7593,
        -70.8051, -57.2126, -50.5089, -70.1746, -51.8714, -67.3732, -68.8033,
        -52.8647, -71.9332, -67.2134, -56.0475, -70.5145, -76.7348, -74.3200,
        -66.5085,

In [6]:
model = AttentionModel(
    env,
    policy,
)


td = next(iter(dataloader))
td = env.reset(td)

out = model(td, decode_type="sampling")

print(out)

td = env.reset(td)
init_td = td.clone()


# out = model(td, decode_type="sampling")

# print(out)

{'reward': tensor([ 8.2368, 11.8128,  9.2849,  6.8193, 10.4362,  8.6348,  7.9403,  8.1434,
         8.5038,  5.1087, 10.4515,  7.9250,  8.6036,  8.1896, 10.5730,  9.0818,
         6.8148,  4.3167,  9.1588,  8.9759,  9.4067, 10.2980, 10.2500,  7.9008,
         8.9965,  8.3401,  9.1112,  9.5131,  7.9479,  6.8404,  9.3117,  8.1451,
         9.1867,  6.3442,  9.2355,  9.4083,  7.6151,  8.1510,  8.6830,  9.0115,
        10.3315,  8.5659,  7.1965,  9.0243,  8.5102,  8.0710,  8.5259,  7.5151,
         7.5097,  9.3952,  7.9826,  8.0221,  8.2924,  9.3855,  8.7232,  7.5561,
         8.3399,  8.5177,  7.2788,  9.3284,  7.1690,  7.7732,  9.9620,  9.5456]), 'log_likelihood': tensor([-86.5831, -57.9129, -82.1689, -81.6460, -82.9649, -82.0851, -83.9947,
        -75.3488, -70.9338, -83.0770, -76.4443, -76.4570, -71.9629, -81.8368,
        -82.0169, -69.0171, -65.1886, -78.4407, -72.7699, -77.2605, -79.3863,
        -68.5387, -86.9722, -77.7829, -77.8572, -80.9965, -84.6107, -84.7199,
        -76.3303,

In [7]:
## Plot

probes = td['probe'].clone().cpu()
keepouts = td['action_mask'].clone().cpu()

out = policy(init_td.clone(), decode_type="greedy", return_actions=True)
decaps = out['actions'].cpu()

# for i in range(3):
#     env.render(init_td[i], decaps[i])


# Main setup

In [8]:
config = DictConfig(
    {"data": {
            "train_size": 1000, #1000, # with 1 epochs, this is 1k samples
            "val_size": 100,
            # "batch_size": 64,            # "batch_size": 64,
            "batch_size": 16,
            "val_batch_size": 128,
    },
    "optimizer": {
        "lr": 1e-4,
        # "lr": 3e-5,
        "weight_decay": 1e-4,
    },
    "num_epochs": 10,
    }
)


# Change baseline (optional, defaults to RolloutBaseline)
# Recreate model (so CUDA is initialized in the model)

# baseline = ExponentialBaseline()
baseline = CriticBaseline(CriticNetwork(env))
# baseline = (WarmupBaseline(RolloutBaseline()))

# Recreate model (so CUDA is initialized in the model)
model = AttentionModel(env, policy, baseline=baseline)

# model = AttentionModel(env)

lit_module = RL4COLitModule(cfg=config, env=env, model=model)

In [9]:
# Trainer
trainer = L.Trainer(
    max_epochs=config.num_epochs, # only few epochs
    accelerator="gpu", # use GPU if available, else you can use others as "cpu"
    devices=[0], # GPU number, or multiple GPUs [0, 1, 2, ...]
    logger=None, # can replace with WandbLogger, TensorBoardLogger, etc.
    precision="16-mixed", # Lightning will handle faster training with mixed precision
    gradient_clip_val=1.0, # clip gradients to avoid exploding gradients
    reload_dataloaders_every_n_epochs=1, # necessary for sampling new data
)

# Fit the model
trainer.fit(lit_module)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
No optimizer specified, using default

  | Name  | Type           | Params
-----------------------------------------
0 | env   | MDPPEnv        | 0     
1 | model | AttentionModel | 1.4 M 
-----------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params
5.677     Total estimated model params size (MB)
2023-06-11 22:17:03.998

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  return v.detach(), -F.mse_loss(v, c.detach())
  return v.detach(), -F.mse_loss(v, c.detach())


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


## Results

In [10]:
lit_module.test_batch_size = 100 # so we load the whole test set
dl = lit_module.test_dataloader()
policy = lit_module.model.policy.to("cuda")
batch = next(iter(dl))
td = env.reset(batch).to("cuda")
out = policy(td, decode_type="greedy")
out['reward'].mean()

tensor(9.6373, device='cuda:0')

In [11]:
out['reward'].shape

torch.Size([100])

In [12]:
## Plot

init_td = env.reset(batch).to("cuda")
probes = init_td['probe'].clone().cpu()
keepouts = init_td['action_mask'].clone().cpu()

out = policy(init_td.clone(), decode_type="greedy", return_actions=True)
decaps = out['actions'].cpu()
rewards = out['reward'].cpu()

for i in [33, 50, 17]:
    print("Reward:", rewards[i])
    env.render(init_td[i], actions[i])


Reward: tensor(7.5768)


IndexError: list index out of range