In [1]:
import numpy as np
import pickle
import random
import torch

In [2]:
from algorithms.noma_ppo import *
from nomaenv import NomaEnv

In [None]:
# Seed the three random number generators used in this notebook (Python's
# `random`, NumPy's legacy global generator, and PyTorch) with one shared value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
# Scenario configuration for the NOMA environment.
# `k` is passed as the first positional argument to NomaEnv
# (presumably the number of devices -- TODO confirm against NomaEnv).
k = 12
# One deadline entry per index in range(k), all equal to 5.
deadlines = np.full(k, 5)
# These three options are explicitly left unset for this scenario.
period = None
offsets = None
arrival_probs = None

# Keyword options forwarded unchanged to NomaEnv; kept in one mapping so the
# whole scenario can be read (and tweaked) in a single place.
env_options = dict(
    lbda=1/9.3,
    period=period,
    arrival_probs=arrival_probs,
    offsets=offsets,
    episode_length=200,
    max_simultaneous_devices=3,
    traffic_model='aperiodic',
    channel_model='collision',
    distances=None,
    path_loss=False,
    shadowing=False,
    fast_fading=False,
    verbose=False,
)

env = NomaEnv(k, deadlines, **env_options)

In [4]:
# Hyper-parameters for the NomaPPO agent, gathered in one mapping and
# forwarded as keyword arguments.
ppo_hyperparams = {
    "lr_actor": 1e-4,
    "lr_critic": 1e-3,
    "hidden_size": 128,
    "gamma": 0.4,
    "K_epochs": 4,
    "eps_clip": 0.1,
    "prior_weight": 0.6,
}

# Build the agent on top of the environment created above.
nomappo = NomaPPO(env, **ppo_hyperparams)

In [5]:
# Baseline: evaluate the agent before any call to `learn`, passing 50 as the
# test length. The result is echoed as the cell's last expression.
untrained_test_score = nomappo.test(50)
untrained_test_score

0.5256325450636679

In [6]:
# Train the agent. The first positional argument is 10000 (presumably the
# number of training episodes -- the log below is indexed by "Episode").
# `res` keeps whatever `learn` returns for later inspection.
learn_options = {"update_frequency": 20, "test_length": 10}
res = nomappo.learn(10000, **learn_options)

Episode : 0, Reward : 0.995, Train scores: 0.7991967871485943, Test score : 0.5474837662337662, policy lr: 0.0001
Episode : 20, Reward : 0.99775, Train scores: 0.8134027202135364, Test score : 0.5631385369840621, policy lr: 0.0001
Episode : 40, Reward : 0.975, Train scores: 0.8188276811888896, Test score : 0.5835777126099706, policy lr: 0.0001
Episode : 60, Reward : 0.99775, Train scores: 0.8451440164190689, Test score : 0.6063059224541969, policy lr: 0.0001
Episode : 80, Reward : 1.01125, Train scores: 0.847156014797744, Test score : 0.6034691407825736, policy lr: 0.0001
Episode : 100, Reward : 1.0145, Train scores: 0.8462250357950735, Test score : 0.653416149068323, policy lr: 0.0001
Episode : 120, Reward : 1.03575, Train scores: 0.8756571531806833, Test score : 0.6761433868974043, policy lr: 0.0001
Episode : 140, Reward : 1.0275, Train scores: 0.8763097430035918, Test score : 0.6794975688816856, policy lr: 0.0001
Episode : 160, Reward : 1.027, Train scores: 0.8775430810984727, Test 

Episode : 1400, Reward : 1.1455, Train scores: 0.9554792388973737, Test score : 0.9541359541359541, policy lr: 0.0001
Episode : 1420, Reward : 1.09275, Train scores: 0.9549524518634831, Test score : 0.9688498402555911, policy lr: 0.0001
Episode : 1440, Reward : 1.13875, Train scores: 0.9571547427962482, Test score : 0.9757966922146026, policy lr: 0.0001
Episode : 1460, Reward : 1.13625, Train scores: 0.9638995033486963, Test score : 0.9181708784596871, policy lr: 0.0001
Episode : 1480, Reward : 1.1085, Train scores: 0.9602565970193787, Test score : 0.9496433067561897, policy lr: 0.0001
Episode : 1500, Reward : 1.141, Train scores: 0.9656534356513268, Test score : 0.9637243047158404, policy lr: 0.0001
Episode : 1520, Reward : 1.112, Train scores: 0.9578598635428024, Test score : 0.9201171058134672, policy lr: 0.0001
Episode : 1540, Reward : 1.10625, Train scores: 0.9664336376449434, Test score : 0.9460016488046167, policy lr: 0.0001
Episode : 1560, Reward : 1.13025, Train scores: 0.9682

Episode : 2800, Reward : 1.12075, Train scores: 0.9572177534483488, Test score : 0.9718426501035197, policy lr: 0.0001
Episode : 2820, Reward : 1.134, Train scores: 0.9596890189727137, Test score : 0.9709090909090909, policy lr: 0.0001
Episode : 2840, Reward : 1.13775, Train scores: 0.9606452902957455, Test score : 0.9592424866200082, policy lr: 0.0001
Episode : 2860, Reward : 1.1445, Train scores: 0.9625328497928247, Test score : 0.9614767255216693, policy lr: 0.0001
Episode : 2880, Reward : 1.11975, Train scores: 0.957421543573477, Test score : 0.9421182266009852, policy lr: 0.0001
Episode : 2900, Reward : 1.1395, Train scores: 0.9586490078198672, Test score : 0.9656274980015987, policy lr: 0.0001
Episode : 2920, Reward : 1.1455, Train scores: 0.9711014175273167, Test score : 0.9606741573033708, policy lr: 0.0001
Episode : 2940, Reward : 1.12525, Train scores: 0.9625341111992087, Test score : 0.956997971602434, policy lr: 0.0001
Episode : 2960, Reward : 1.164, Train scores: 0.9713514

Episode : 4200, Reward : 1.13975, Train scores: 0.9693798233969094, Test score : 0.9535076795350768, policy lr: 0.0001
Episode : 4220, Reward : 1.13475, Train scores: 0.9711794669590171, Test score : 0.9783761729906161, policy lr: 0.0001
Episode : 4240, Reward : 1.1425, Train scores: 0.9671331991294961, Test score : 0.96347757046447, policy lr: 0.0001
Episode : 4260, Reward : 1.136, Train scores: 0.9715339606159672, Test score : 0.9931754315535929, policy lr: 0.0001
Episode : 4280, Reward : 1.1305, Train scores: 0.9716622973587905, Test score : 0.9770491803278688, policy lr: 0.0001
Episode : 4300, Reward : 1.153, Train scores: 0.9571831181877439, Test score : 0.9808631921824105, policy lr: 0.0001
Episode : 4320, Reward : 1.15425, Train scores: 0.96677268309556, Test score : 0.9746572496884088, policy lr: 0.0001
Episode : 4340, Reward : 1.14525, Train scores: 0.964558512122809, Test score : 0.9739625711960944, policy lr: 0.0001
Episode : 4360, Reward : 1.14525, Train scores: 0.967091380

In [7]:
# Re-evaluate the agent after training, this time passing 100 as the test
# length, and echo the result as the cell's last expression.
trained_test_score = nomappo.test(100)
trained_test_score

0.9810219573912333

In [9]:
# Run a single verbose test so the per-timestep trace (state, observation,
# prior, action, reward) is printed, then echo the returned result.
verbose_test_score = nomappo.test(1, verbose=True)
verbose_test_score

Timestep: 0
Current state: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
obs: tensor([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.]), Prior: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Action: tensor([1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1.])
reward: 0.0


Timestep: 1
Current state: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
obs: tensor([-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
        -1.0000, -1.0000, -1.0000, -1.0000,  1.0000,  0.5000,  1.0000,  1.0000,
         0.5000,  1.0000,  0.5000,  0.5000,  0.5000,  0.

obs: tensor([-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
        -1.0000, -1.0000, -1.0000, -1.0000,  1.0000,  0.5000,  1.0000,  1.0000,
         0.5000,  1.0000,  0.5000,  1.0000,  1.0000,  1.0000,  0.5000,  1.0000,
         1.0000]), Prior: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
Action: tensor([1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.])
reward: 2.0


Timestep: 132
Current state: [[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
obs: tensor([-1.0000, -1.0000,  4.0000, -1.0000, -1.0000, -1.0000, -1.0000,  4.0000,
        -1.0000, -1.0000, -1.0000, -1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
         1.0000,  0.5000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
         2.0000]), Prior: tensor([0.6000, 0.6000, 1.0000, 0.6000, 0.6000, 0.6000, 0.6000, 1.0000, 0.60

0.9529411764705882