# TL DQN - Training phase
* shared network: 32, 32, 15

### Import required packages, configure the simulation and generate traffic data

In [1]:
##################### import required packages ##################### 


import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os

import sys
if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.slicing_env import SlicingEnvironment
from lib.agents import dqn #qlearning
from lib import utils

matplotlib.style.use('ggplot')


##################### configure the simulation #####################
# Seed NumPy's global RNG before anything else so the run is reproducible.
# NOTE(review): only NumPy is seeded here; whether the agent library draws
# from a different RNG is not visible in this notebook -- confirm.
np.random.seed(2023)

# --- episode layout ---
max_episode_timesteps = 100   # DRL agent timesteps per episode
total_data_episodes = 1
total_episodes = 50           # DRL agent episodes (kept small for clearer results visualization)

# --- agent selection ---
agent_name = 'dqn'            # dqn or mtdqn or mdqn
learning_type = 'non_accelerated'

# --- sigmoid reward function coefficients, one (c1, c2) pair per slice ---
c1_volte, c2_volte = 0.5, 10
c1_urllc, c2_urllc = 2, 3
c1_video, c2_video = 1, 7

# --- DQN agent hyper-parameters ---
exploration = 'e_greedy'      # 'e_greedy' or 'boltzmann'
discount_factor = 0.3         # handed to the agent as `gamma`
alpha = 0.1                   # handed to the agent as `lr`
epsilon = 0.9
epsilon_decay = 0.9
decay_steps = 50

# policy reuse flag
loaded_qnet = 'no'

# --- slicing configuration ---
# number of users per slice, ordered VoLTE, Video, URLLC
num_users = [46 // 4, 46 // 4, 8 // 4]

# per-slice user-count series: a single row of 200 entries, every entry 1
poisson_volte = np.full(shape=(1, 200), fill_value=1)
poisson_video = np.full(shape=(1, 200), fill_value=1)
poisson_urllc = np.full(shape=(1, 200), fill_value=1)

# largest value of each series, ordered VoLTE, Video, URLLC
max_num_users = [max(series[0]) for series in (poisson_volte, poisson_video, poisson_urllc)]

# NOTE(review): this list is ordered Video, VoLTE, URLLC, unlike max_num_users
# above. All three series are identical here, so it makes no difference today;
# confirm which order SlicingEnvironment expects before changing the series.
num_users_poisson = [poisson_video[0], poisson_volte[0], poisson_urllc[0]]

num_slices = 3
num_action_lvls = 15
sl_win_size = 40
time_quantum = 1
max_size_per_tti = 40
max_trans_per_tti = 6
max_num_packets = 0
max_traffic_percentage = 1

##################### generate sample traffic data #####################

# One traffic sample per slicing-window step across a whole episode, with a
# fresh 0..n-1 index.
traffic_df = utils.generate_data(
    *max_num_users, sl_win_size * max_episode_timesteps
).reset_index(drop=True)

## Training phase

In [2]:

# Output directory for the trained networks. It is created once, before any
# training starts, so a filesystem problem surfaces immediately instead of
# after a complete training run.
path = 'saved_models/base/'
os.makedirs(path, exist_ok=True)

# Train one DQN agent per reward-weight configuration (indices 0..15) and save
# each trained network's parameters under `path`.
for i in range(0, 16):

    # set the weights of the reward function (look them up once per configuration)
    reward_weights = utils.get_reward_weights(i)
    w_volte = reward_weights[0]
    w_urllc = reward_weights[1]
    w_video = reward_weights[2]

    print('i=%d: w_volte %f, w_urllc %f, w_video %f' %(i, w_volte, w_urllc, w_video))

    # initialize the OpenAI gym-compatible environment using the configured simulation parameters
    enviro = SlicingEnvironment(traffic_df, max_num_packets, max_size_per_tti, num_action_lvls,
                                num_slices, max_episode_timesteps, sl_win_size, time_quantum, total_data_episodes,
                                num_users_poisson, max_traffic_percentage, max_trans_per_tti, w_volte, w_urllc,
                                w_video, c1_volte, c1_urllc, c1_video, c2_volte, c2_urllc, c2_video)

    # both names are kept: later cells may refer to either one
    env = enviro

    ##################### train the basic expert agents from scratch given the configured reward function weights
    # `loaded_qnet` is passed from the configuration cell (rather than a
    # hard-coded literal) so the value used for training always matches the
    # value recorded in the log dictionary below.
    qnet, stats = dqn.dqn(env, num_episodes=total_episodes,
                          exploration=exploration,
                          gamma=discount_factor,
                          lr=alpha,
                          epsilon=epsilon,
                          epsilon_decay=epsilon_decay,
                          decay_steps=decay_steps,
                          loaded_qnet=loaded_qnet)

    # log the trained agents' data: run configuration, rewards and the KPI
    # series collected by the environment
    dictionary = {'config': {'generic': {'max_episode_timesteps': max_episode_timesteps, 'total_episodes': total_episodes,
                         'agent_name': agent_name, 'max_size_per_tti': max_size_per_tti,
                         'max_traffic_percentage': max_traffic_percentage, 'num_action_lvls': num_action_lvls,
                         'num_slices': num_slices, 'sl_win_size': sl_win_size, 'max_trans_per_tti': max_trans_per_tti,
                         'w_volte': w_volte, 'w_urllc': w_urllc, 'w_video': w_video, 'c1_volte': c1_volte,
                         'c2_volte': c2_volte, 'c1_urllc': c1_urllc, 'c2_urllc': c2_urllc,
                         'c1_video': c1_video, 'c2_video': c2_video, 'learning_type': learning_type},
                         'agent_specific': {'discount_factor': discount_factor, 'alpha': alpha,
                                            'epsilon': epsilon, 'epsilon_decay': epsilon_decay,
                                            'decay_steps': decay_steps, 'loaded_qnet': loaded_qnet}
                        },
              'rewards': {'steps': env.step_rewards, 'episodes': list(stats[1])},
              'KPIs': {'delay': env.total_avg_waiting_times,
                       'throughput': env.total_throughputs, 'finished_throughput': env.finished_throughputs,
                       'remaining_sizes_sum': env.remaining_sizes_sum, 'remaining_sizes': env.remaining_sizes,
                       'remaining_times_sum': env.remaining_times_sum, 'remaining_times': env.remaining_times,
                       'total_p_numbers': env.total_p_numbers, 'done_p_numbers': env.done_p_numbers
                     }}

    # File-name suffix shared by both output files: the three weights as
    # integer percentages, concatenated without a separator.
    # NOTE(review): int() truncates, so a product such as 0.29*100
    # (28.999999999999996) would become 28. Left unchanged so the file names
    # stay identical to what other notebooks may expect -- confirm before
    # switching to round().
    weight_tag = str(int(w_volte*100)) + str(int(w_urllc*100)) + str(int(w_video*100))

    net_para_file = path + 'net.params_' + \
                    str(exploration) + '_' + \
                    str(learning_type) + '_' + \
                    str(agent_name) + '_' + \
                    weight_tag

    file_name = path + str(learning_type) + '_' + \
                str(agent_name) + '_' + weight_tag + '_ep.npy'

    # save training data to file
    qnet.save_parameters(net_para_file)
    # Saving the log dictionary is currently disabled; `dictionary` and
    # `file_name` are still built so the line below can be re-enabled as is.
#     np.save(file_name, dictionary)

i=0: w_volte 0.100000, w_urllc 0.800000, w_video 0.100000
new qnet
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 33,egreedy 0.810000
eps 1,reward 36,egreedy 0.729000
eps 2,reward 35,egreedy 0.656100
eps 3,reward 37,egreedy 0.590490
eps 4,reward 43,egreedy 0.531441
eps 5,reward 52,egreedy 0.478297
eps 6,reward 53,egreedy 0.430467
eps 7,reward 58,egreedy 0.387420
eps 8,reward 57,egreedy 0.348678
target network parameters replaced
eps 9,reward 62,egreedy 0.313811
eps 10,reward 60,egreedy 0.282430
eps 11,reward 60,egreedy 0.254187
eps 12,reward 59,egreedy 0.228768
eps 13,reward 64,egreedy 0.205891
eps 14,reward 65,egreedy 0.185302
eps 15,reward 63,egreedy 0.166772
eps 16,reward 68,egreedy 0.150095
eps 17,reward 67,egreedy 0.135085
eps 18,reward 72,egreedy 0.121577
target network parameters replaced
eps 19,reward 73,egreedy 0.109419
eps 20,reward 74,egre

eps 45,reward 63,egreedy 0.007070
eps 46,reward 64,egreedy 0.006363
eps 47,reward 62,egreedy 0.005726
eps 48,reward 62,egreedy 0.005154
target network parameters replaced
eps 49,reward 64,egreedy 0.004638
i=4: w_volte 0.100000, w_urllc 0.200000, w_video 0.700000
new qnet
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 28,egreedy 0.810000
eps 1,reward 26,egreedy 0.729000
eps 2,reward 29,egreedy 0.656100
eps 3,reward 27,egreedy 0.590490
eps 4,reward 24,egreedy 0.531441
eps 5,reward 50,egreedy 0.478297
eps 6,reward 53,egreedy 0.430467
eps 7,reward 53,egreedy 0.387420
eps 8,reward 55,egreedy 0.348678
target network parameters replaced
eps 9,reward 59,egreedy 0.313811
eps 10,reward 58,egreedy 0.282430
eps 11,reward 62,egreedy 0.254187
eps 12,reward 58,egreedy 0.228768
eps 13,reward 66,egreedy 0.205891
eps 14,reward 63,egreedy 0.185302
eps 15,reward 59,egre

eps 40,reward 49,egreedy 0.011973
eps 41,reward 48,egreedy 0.010775
eps 42,reward 46,egreedy 0.009698
eps 43,reward 49,egreedy 0.008728
eps 44,reward 49,egreedy 0.007855
eps 45,reward 49,egreedy 0.007070
eps 46,reward 45,egreedy 0.006363
eps 47,reward 51,egreedy 0.005726
eps 48,reward 46,egreedy 0.005154
target network parameters replaced
eps 49,reward 50,egreedy 0.004638
i=8: w_volte 0.100000, w_urllc 0.400000, w_video 0.500000
new qnet
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 29,egreedy 0.810000
eps 1,reward 29,egreedy 0.729000
eps 2,reward 31,egreedy 0.656100
eps 3,reward 30,egreedy 0.590490
eps 4,reward 30,egreedy 0.531441
eps 5,reward 41,egreedy 0.478297
eps 6,reward 45,egreedy 0.430467
eps 7,reward 45,egreedy 0.387420
eps 8,reward 46,egreedy 0.348678
target network parameters replaced
eps 9,reward 47,egreedy 0.313811
eps 10,reward 48,egre

eps 35,reward 62,egreedy 0.020276
eps 36,reward 59,egreedy 0.018248
eps 37,reward 67,egreedy 0.016423
eps 38,reward 66,egreedy 0.014781
target network parameters replaced
eps 39,reward 67,egreedy 0.013303
eps 40,reward 66,egreedy 0.011973
eps 41,reward 64,egreedy 0.010775
eps 42,reward 60,egreedy 0.009698
eps 43,reward 65,egreedy 0.008728
eps 44,reward 65,egreedy 0.007855
eps 45,reward 66,egreedy 0.007070
eps 46,reward 64,egreedy 0.006363
eps 47,reward 58,egreedy 0.005726
eps 48,reward 63,egreedy 0.005154
target network parameters replaced
eps 49,reward 67,egreedy 0.004638
i=12: w_volte 0.400000, w_urllc 0.400000, w_video 0.200000
new qnet
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 46,egreedy 0.810000
eps 1,reward 47,egreedy 0.729000
eps 2,reward 48,egreedy 0.656100
eps 3,reward 51,egreedy 0.590490
eps 4,reward 53,egreedy 0.531441
eps 5,reward 50

eps 30,reward 64,egreedy 0.034337
eps 31,reward 64,egreedy 0.030903
eps 32,reward 64,egreedy 0.027813
eps 33,reward 60,egreedy 0.025032
eps 34,reward 63,egreedy 0.022528
eps 35,reward 62,egreedy 0.020276
eps 36,reward 61,egreedy 0.018248
eps 37,reward 61,egreedy 0.016423
eps 38,reward 63,egreedy 0.014781
target network parameters replaced
eps 39,reward 63,egreedy 0.013303
eps 40,reward 62,egreedy 0.011973
eps 41,reward 63,egreedy 0.010775
eps 42,reward 60,egreedy 0.009698
eps 43,reward 63,egreedy 0.008728
eps 44,reward 64,egreedy 0.007855
eps 45,reward 62,egreedy 0.007070
eps 46,reward 62,egreedy 0.006363
eps 47,reward 60,egreedy 0.005726
eps 48,reward 63,egreedy 0.005154
target network parameters replaced
eps 49,reward 62,egreedy 0.004638
