In [1]:
import torch
import torch.nn.functional as F

In [2]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


In [3]:
from envs.env import D2DEnv
from algorithms.baselines import EarliestDeadlineFirstScheduler, GFAccess
from algorithms.ippo import iPPO
from algorithms.d2d_ppo import D2DPPO

In [4]:
Tf_gf = 4*(1 / 30 * 1e-3 + 2.34e-6)
1e-3 / Tf_gf

7.008035881143711

In [5]:
n_agents = 4
deadlines = np.array([7]*n_agents)
lbdas = np.array([1/14]*n_agents)
period = None
arrival_probs = None
offsets = None
#neighbourhoods = [list(range(n_agents)) for k in range(n_agents)]
neighbourhoods = [[i] for i in range(n_agents)]


In [6]:
env = D2DEnv(n_agents,
                deadlines,
                lbdas,
                period=period,
                arrival_probs=arrival_probs,
                offsets=offsets,
                episode_length=200,
                traffic_model='aperiodic',
                periodic_devices=[],
                reward_type=0,
                channel_switch=0,
                channel_decoding=1.,
                neighbourhoods=neighbourhoods, # Neighbourhoods is a list of size n_agents with the indices of the neighbours for each agent.
                verbose=False)

In [8]:
edf = EarliestDeadlineFirstScheduler(env, use_channel=False, verbose=True)


In [9]:
res_edf = edf.run(500)


Number of received packets: 71779.0
Number of channel_losses: 0


In [10]:
print(f"URLLC score: {res_edf[0]}")
print(f"Jain's index: {res_edf[1]}")
print(f"Channel errors: {res_edf[2]}")
print(f"Reward per episode: {res_edf[3]}")


URLLC score: 0.9967957201967149
Jain's index: 0.9997862827685239
Channel errors: 0
Reward per episode: 1414.96


In [11]:
gf = GFAccess(env, use_channel=False)


In [12]:
cv = gf.get_best_transmission_probs(100)
gf.transmission_prob = gf.transmission_prob_list[np.argmax(cv)]
print(f"Transmission probabilities: {gf.transmission_prob_list} \nURLLC scores: {cv}")


Transmission probabilities: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] 
URLLC scores: [0.3666482224374049, 0.5064612326043738, 0.5461776714064479, 0.5122845871686622, 0.45994993742177726, 0.38629416364541125, 0.2977648605632828, 0.23084384093113486, 0.16863864142538976, 0.11496897441260545]


In [14]:
res_gf = gf.run(500)


In [15]:
print(f"URLLC score: {res_gf[0]}")
print(f"Jain's index: {res_gf[1]}")
print(f"Channel errors: {res_gf[2]}")
print(f"Reward per episode: {res_gf[3]}")


URLLC score: 0.5337340886740447
Jain's index: 0.9426200635570335
Channel errors: 0
Reward per episode: 271.32


In [18]:
env.action_space[0].n


2

In [32]:
observations = {f"{i}": [] for i in range(n_agents)}
obs, (buffer_state, channel_state) = env.reset()
done = False
while not done:
    for i in range(n_agents):
        obs_agent = torch.tensor(obs[i], dtype=torch.float)
        observations[str(i)].append(obs_agent)
    action_agent = []
    log_prob_agent = []
    # entropy_agent = []
    value_agent = []
    actions = np.array(env.action_space.sample())
    next_obs, next_state, reward, done, _ = env.step(actions)
    obs = next_obs
    

In [37]:
torch.stack(observations['0'])


tensor([[0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 0., 1., 0.]])

In [7]:
ippo = iPPO(env)


In [8]:
ippo.test(10)

(0.9454191684465589, 0.9973912454640974, 0, 24.6)

In [9]:
ippo.train(5000, 15)

Episode: 0, mean score rollout: 0.9749857076825769 Score test: (0.9610333616487122, 0.9988356306405209, 0, 26.66)
Episode: 100, mean score rollout: 0.9937134502923978 Score test: (0.9927645670759262, 0.99970229538208, 0, 27.54)
Episode: 200, mean score rollout: 1.0 Score test: (0.9981691368788143, 0.9998990856009695, 0, 28.08)
Episode: 300, mean score rollout: 1.0 Score test: (1.0, 1.0, 0, 28.14)
Episode: 400, mean score rollout: 1.0 Score test: (1.0, 1.0, 0, 27.88)


KeyboardInterrupt: 

In [65]:
ippo = iPPO(env, useRNN=True, history_len=4)


In [66]:
ippo.test(10)

(0.6206534018161278, 0.7302501071525931, 0, 30.6)

In [67]:
res = ippo.train(5000, 10, 500)

Episode: 0, mean score rollout: 0.9007565042566789 Score test: (0.7286679489645607, 0.7696122392763901, 0, 35.66)


KeyboardInterrupt: 

In [10]:
observations = {f"{i}": [] for i in range(n_agents)}
obs, (buffer_state, channel_state) = env.reset()
done = False
while not done:
    for i in range(n_agents):
        obs_agent = torch.tensor(obs[i], dtype=torch.float)
        observations[str(i)].append(obs_agent)
    action_agent = []
    log_prob_agent = []
    # entropy_agent = []
    value_agent = []
    actions = np.array(env.action_space.sample())
    next_obs, next_state, reward, done, _ = env.step(actions)
    obs = next_obs
    

In [9]:
res = ippo.create_rollouts(10)

In [10]:
obs = res[0]

In [13]:
obs[2].shape

torch.Size([2000, 33])

In [21]:
obs_agent_rnn = ippo.preprocess_input_for_rnn(obs[2])
actions = res[1][:, 2]

In [24]:
ippo.agents[2].evaluate(obs_agent_rnn, actions)

(tensor([-0.3803, -0.2641, -0.2468,  ..., -0.2348, -1.2753, -1.0643],
        grad_fn=<SqueezeBackward1>),
 tensor([0.6241, 0.5418, 0.5253,  ..., 0.5130, 0.5923, 0.6443],
        grad_fn=<NegBackward0>))

In [28]:
ippo.agents[2].value_network(obs_agent_rnn).squeeze()

tensor([-0.0865,  0.0634, -0.1090,  ..., -0.5924, -0.6745, -0.8581],
       grad_fn=<SqueezeBackward0>)

In [58]:
obs_agent = res[0][0]

In [60]:
205%200

5

In [59]:
obs_agent[205]

tensor([[0., 0., 0.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 1., 1., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 1., 1.],
        [0., 0., 0.,  ..., 1., 1., 1.],
        [0., 0., 0.,  ..., 1., 1., 0.]])

In [19]:
ippo.agents[0].policy_network(torch.stack(observations['0'])[-4:])

tensor([[0.7539, 0.2461]], grad_fn=<SoftmaxBackward0>)

In [21]:
res = ippo.create_rollouts(10)

In [26]:
res[0][0].shape

torch.Size([2000, 33])

In [33]:
450%200

50

In [130]:
# How to create new train tensor.
history_len = 10
obsss = []
for i in range(obs_agent.size(0)):
    idx = i % 200
    if idx < history_len:
        x = obs_agent[i-idx:i+1]
        pad_len = history_len - (idx + 1)
        x = torch.cat([torch.zeros((pad_len, 33)), x])
    else:
        x = obs_agent[i+1-history_len:i+1]
    obsss.append(x)


In [132]:
obsss = torch.stack(obsss)

In [134]:
obsss.shape

torch.Size([2000, 10, 33])

In [95]:
obs_agent[7]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.])

In [99]:
obs_agent[7+1-7:8]

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 

In [94]:
obs_agent[16-10:15+1].shape

torch.Size([10, 33])

In [87]:
x.shape

torch.Size([1, 33])

In [90]:
torch.cat([torch.zeros((9, 33)), x]).shape

torch.Size([10, 33])

In [29]:
test_tensor = torch.rand(600, 33)

In [30]:
ptest_tensor = ippo.preprocess_input_for_rnn(test_tensor)

In [31]:
ptest_tensor.shape

torch.Size([600, 10, 33])

In [57]:
i=222
idx = i % 200

In [58]:
test_tensor[i+1-10:i+1]

tensor([[3.3997e-01, 2.5090e-01, 6.6909e-01, 2.1557e-01, 6.5758e-01, 5.7883e-01,
         3.2858e-01, 8.5916e-01, 2.8693e-01, 4.7698e-01, 7.1023e-01, 4.0618e-01,
         1.3866e-03, 5.1685e-01, 3.4905e-01, 2.7537e-01, 8.2082e-01, 7.2810e-01,
         8.6704e-01, 4.4474e-01, 3.9635e-01, 9.6113e-01, 2.7664e-01, 3.8474e-01,
         7.0258e-01, 8.7287e-01, 9.5606e-01, 3.8938e-01, 1.4408e-01, 5.1847e-01,
         2.1651e-01, 1.7511e-01, 8.8285e-01],
        [8.3161e-01, 3.7239e-01, 2.0540e-01, 3.3372e-01, 6.2524e-01, 4.5304e-01,
         8.8041e-01, 5.6246e-01, 2.5592e-01, 4.7346e-01, 9.1162e-01, 6.5149e-01,
         5.4818e-01, 1.4216e-01, 8.7387e-01, 5.9864e-02, 2.0658e-01, 9.7297e-01,
         6.4315e-01, 6.3057e-01, 7.7600e-01, 7.1735e-01, 4.0787e-01, 3.5411e-01,
         4.8692e-01, 7.5918e-01, 6.8027e-01, 9.6588e-01, 4.3785e-01, 6.9255e-01,
         4.7342e-01, 2.8989e-01, 3.3691e-01],
        [8.2131e-01, 2.4383e-01, 4.0129e-01, 2.3240e-01, 7.4122e-01, 7.1310e-01,
         2.9456e-

In [59]:
ptest_tensor[i,:,:]

tensor([[3.3997e-01, 2.5090e-01, 6.6909e-01, 2.1557e-01, 6.5758e-01, 5.7883e-01,
         3.2858e-01, 8.5916e-01, 2.8693e-01, 4.7698e-01, 7.1023e-01, 4.0618e-01,
         1.3866e-03, 5.1685e-01, 3.4905e-01, 2.7537e-01, 8.2082e-01, 7.2810e-01,
         8.6704e-01, 4.4474e-01, 3.9635e-01, 9.6113e-01, 2.7664e-01, 3.8474e-01,
         7.0258e-01, 8.7287e-01, 9.5606e-01, 3.8938e-01, 1.4408e-01, 5.1847e-01,
         2.1651e-01, 1.7511e-01, 8.8285e-01],
        [8.3161e-01, 3.7239e-01, 2.0540e-01, 3.3372e-01, 6.2524e-01, 4.5304e-01,
         8.8041e-01, 5.6246e-01, 2.5592e-01, 4.7346e-01, 9.1162e-01, 6.5149e-01,
         5.4818e-01, 1.4216e-01, 8.7387e-01, 5.9864e-02, 2.0658e-01, 9.7297e-01,
         6.4315e-01, 6.3057e-01, 7.7600e-01, 7.1735e-01, 4.0787e-01, 3.5411e-01,
         4.8692e-01, 7.5918e-01, 6.8027e-01, 9.6588e-01, 4.3785e-01, 6.9255e-01,
         4.7342e-01, 2.8989e-01, 3.3691e-01],
        [8.2131e-01, 2.4383e-01, 4.0129e-01, 2.3240e-01, 7.4122e-01, 7.1310e-01,
         2.9456e-

In [136]:
tt = test_tensor[:204]

In [142]:
tt[1*200:][-10:].shape

torch.Size([4, 33])

In [144]:
tt[200:]

tensor([[0.4396, 0.7082, 0.8753, 0.5915, 0.4299, 0.4796, 0.5322, 0.4846, 0.7730,
         0.6790, 0.8913, 0.0404, 0.9299, 0.2975, 0.6836, 0.3075, 0.1434, 0.5101,
         0.0113, 0.5184, 0.0421, 0.5130, 0.3983, 0.9136, 0.6495, 0.4231, 0.2541,
         0.6904, 0.3265, 0.6928, 0.1542, 0.7361, 0.7293],
        [0.7271, 0.0524, 0.4237, 0.2834, 0.1329, 0.7335, 0.6233, 0.3134, 0.2835,
         0.5443, 0.7507, 0.9358, 0.7848, 0.2041, 0.9281, 0.2749, 0.1057, 0.4959,
         0.2463, 0.2948, 0.0626, 0.0556, 0.7395, 0.4210, 0.6991, 0.8830, 0.2444,
         0.1072, 0.3953, 0.2584, 0.5111, 0.8885, 0.1013],
        [0.6505, 0.7177, 0.8398, 0.5211, 0.2072, 0.2995, 0.5196, 0.2611, 0.4052,
         0.3233, 0.8528, 0.7261, 0.1786, 0.0047, 0.9422, 0.2393, 0.4268, 0.5018,
         0.2214, 0.5232, 0.4006, 0.1108, 0.6751, 0.4846, 0.0134, 0.8402, 0.0410,
         0.7518, 0.4162, 0.3657, 0.9628, 0.1467, 0.4264],
        [0.1031, 0.5406, 0.4012, 0.7052, 0.1120, 0.6715, 0.8546, 0.5452, 0.0333,
         0.2214,

In [13]:
d2dppo = D2DPPO(env,
                hidden_size=64, 
                gamma=0.5, 
                policy_lr=1e-3,
                value_lr=1e-3,
                device=None,
                useRNN=True,
                history_len=4,
                early_stopping=True)

In [None]:
res = d2dppo.train(num_iter=5000, num_episodes=10, n_epoch=4, test_freq=100)

Episode: 0, mean score rollout: 0.8432160723476023 Score test: (0.5189580597977534, 0.5776946575762987, 0, 24.72)
Episode: 0, mean score rollout: 0.8432160723476023 Score test: (0.47230656837210394, 0.5415515959971685, 0, 24.1)
Episode: 0, mean score rollout: 0.8432160723476023 Score test: (0.3620679944765543, 0.41646154896990883, 0, 18.78)
Episode: 0, mean score rollout: 0.8432160723476023 Score test: (0.33270798521195966, 0.4304276776157867, 0, 16.4)
Episode: 100, mean score rollout: 0.9559968809912401 Score test: (0.948269703131897, 0.9949430440197036, 0, 45.22)
Episode: 100, mean score rollout: 0.9559968809912401 Score test: (0.9326634192347042, 0.9940799581918733, 0, 44.9)
Episode: 100, mean score rollout: 0.9559968809912401 Score test: (0.9526048676018298, 0.9959305811931519, 0, 45.84)
Episode: 100, mean score rollout: 0.9559968809912401 Score test: (0.9638304517698864, 0.9971888100778883, 0, 46.3)
Episode: 200, mean score rollout: 0.9406259012491329 Score test: (0.96645008794908

Episode: 1700, mean score rollout: 0.976344201625394 Score test: (0.9854345548573292, 0.999032779506075, 0, 48.26)
Episode: 1800, mean score rollout: 0.9733869235366669 Score test: (0.9803461287944915, 0.9986786850667125, 0, 49.48)
Episode: 1800, mean score rollout: 0.9733869235366669 Score test: (0.9805001774387392, 0.9984096819275083, 0, 49.86)
Episode: 1800, mean score rollout: 0.9733869235366669 Score test: (0.9834065476438009, 0.9984628593263254, 0, 49.56)
Episode: 1800, mean score rollout: 0.9733869235366669 Score test: (0.9790783226722045, 0.9985118542987058, 0, 48.8)
Episode: 1900, mean score rollout: 0.9832148883647625 Score test: (0.9858932809511433, 0.9990095919293868, 0, 48.5)
Episode: 1900, mean score rollout: 0.9832148883647625 Score test: (0.9800005297602407, 0.9988358306943357, 0, 49.48)
Episode: 1900, mean score rollout: 0.9832148883647625 Score test: (0.9821639786707024, 0.9983334734930548, 0, 48.46)
Episode: 1900, mean score rollout: 0.9832148883647625 Score test: (0

In [12]:
d2dppo.test(100)

(0.9736676050550482, 0.9970913584478833, 0, 48.02)

In [8]:
obs, states, actions, log_probs_old, rewards, returns, scores, dones = d2dppo.create_rollouts(10)

In [10]:
def compute_gae(rewards, dones, values, gamma, lbda=0.95):
    gae = 0
    adv = [rewards[-1] - values[-1]]
    for step in reversed(range(len(rewards)-1)):
        delta = rewards[step] + gamma * values[step + 1] * (1-dones[step]) - values[step]
        gae = delta + gamma * lbda * (1-dones[step]) * gae
        adv.insert(0, gae + values[step])
    adv = np.array(adv)
    if (adv.std(0) > 0).all():
        adv = (adv - adv.mean(0)) / adv.std(0)
    return torch.tensor(adv, dtype=torch.float)


cycle = np.arange(4)
np.random.shuffle(cycle)

# Compute global advantage estimation at the BS
values = d2dppo.value_network(states.to(d2dppo.device)).squeeze()
advantages = compute_gae(rewards, dones, values.cpu().detach().numpy(), d2dppo.gamma, 0.97)
M = advantages

ploss_agents = []

In [37]:
ploss_agents = []

i = 0
obs_agent_rnn = d2dppo.preprocess_input_for_rnn(obs[i])
ploss, M = d2dppo.agents[i].train_step(obs_agent_rnn, actions[:,i], log_probs_old[:, i], M.detach())



In [35]:
ploss

-0.006541317794471979

In [36]:
M

tensor([-0.3867, -0.2582, -0.1011,  ..., -0.5232, -0.6139, -1.5107],
       grad_fn=<MulBackward0>)

In [33]:
res[4].mean(1) 

tensor([ 0.9702, -0.6729, -0.6734,  ..., -0.6724, -0.6724, -0.6724])

In [25]:
d2dppo.value_network(res[1]).squeeze()

tensor([-0.0406, -0.1818, -0.1940,  ..., -0.1940, -0.1940, -0.1940],
       grad_fn=<SqueezeBackward0>)

In [38]:
np.random.choice(np.arange(10), replace=False, size=(10))

array([1, 8, 6, 7, 2, 5, 4, 3, 9, 0])

In [39]:
x = np.arange(10)

In [40]:
np.random.shuffle(x)

In [41]:
x

array([5, 8, 6, 4, 2, 0, 3, 1, 9, 7])