# Meta DQN
* meta learning rate (metalr) = 0.5
* task-specific network: 32, 32, 15
* model = 444444


* learner alpha=0.01

### import required packages

In [1]:
##################### import required packages #####################


import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os

import sys
if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.slicing_env import SlicingEnvironment
from lib.agents import medqn 
from lib import utils

# switch matplotlib to its 'ggplot' style sheet
matplotlib.style.use('ggplot')

##################### configure the simulation #####################

# seed NumPy's global random generator for reproducibility
np.random.seed(2023)

# --- episode layout ---
max_episode_timesteps = 100  # DRL agent timesteps per episode
total_data_episodes = 1
total_episodes = 50          # DRL agent episodes (kept small for better results visualization)

# --- agent identity (both strings end up in the saved file names) ---
agent_name = 'medqn'
learning_type = 'accelerated'

# --- sigmoid reward function configuration: (c1, c2) per slice ---
c1_volte, c2_volte = 0.5, 10
c1_urllc, c2_urllc = 2, 3
c1_video, c2_video = 1, 7

# --- agent configuration used by the training phase ---
exploration = 'e_greedy'  # 'e_greedy' or 'boltzmann'
discount_factor = 0.3
alpha = 0.1
epsilon = 0.9
epsilon_decay = 0.9
decay_steps = 50

# --- slicing configuration ---
# number of users per slice in the following order: VoLTE, Video, URLLC
num_users = [46 // 4, 46 // 4, 8 // 4]

# one (1, 200) integer array per slice, every entry equal to 1
poisson_volte, poisson_video, poisson_urllc = (np.full((1, 200), 1) for _ in range(3))

# largest value of each array, in the order VoLTE, Video, URLLC
max_num_users = [trace[0].max() for trace in (poisson_volte, poisson_video, poisson_urllc)]

# NOTE(review): this list is ordered Video, VoLTE, URLLC, unlike max_num_users
# above (VoLTE, Video, URLLC). All three arrays are identical today, so the
# order has no effect yet -- confirm which order SlicingEnvironment expects.
num_users_poisson = [poisson_video[0], poisson_volte[0], poisson_urllc[0]]

max_size_per_tti = 40
max_num_packets = 0
max_traffic_percentage = 1
num_action_lvls = 15
num_slices = 3
sl_win_size = 40
time_quantum = 1
max_trans_per_tti = 6

### generate sample traffic data

In [2]:
# Build the sample traffic data from the per-slice maxima (VoLTE, Video, URLLC)
# and sl_win_size * max_episode_timesteps, then renumber the rows from zero,
# discarding whatever index generate_data produced.
traffic_df = utils.generate_data(
    *max_num_users[:3],
    sl_win_size * max_episode_timesteps,
).reset_index(drop=True)

## Training phase

In [None]:
# Training phase: run medqn over 16 reward-weight configurations in sequence.
# The first run starts with no network (qNet=None, initial_run=True); every
# later run receives the network returned by the previous run. The network
# returned by the last run is saved to disk for the adaptation phase.
initial_run = True
mnet_para = None  # no network exists before the first run

for i in range(0, 16):
    # set the weights of the reward function (looked up once per configuration)
    reward_weights = utils.get_reward_weights(i)
    w_volte, w_urllc, w_video = reward_weights[0], reward_weights[1], reward_weights[2]

    print('i=%d: w_volte %f, w_urllc %f, w_video %f' %(i, w_volte, w_urllc, w_video))

    # initialize the OpenAI gym-compatible environment using the configured simulation parameters
    enviro = SlicingEnvironment(traffic_df, max_num_packets, max_size_per_tti, num_action_lvls,
                                num_slices, max_episode_timesteps, sl_win_size, time_quantum,
                                total_data_episodes, num_users_poisson, max_traffic_percentage,
                                max_trans_per_tti, w_volte, w_urllc, w_video,
                                c1_volte, c1_urllc, c1_video, c2_volte, c2_urllc, c2_video)
    env = enviro

    if not initial_run:
        print('initial run:  %s' %(initial_run))

    # Single call site for both cases: on the first run mnet_para is None and
    # initial_run is True; afterwards the previous run's network is passed in.
    qnet, stats = medqn.medqn(env,
                              num_episodes=total_episodes,
                              exploration=exploration,
                              gamma=discount_factor,
                              lr=alpha,
                              epsilon=epsilon,
                              epsilon_decay=epsilon_decay,
                              decay_steps=decay_steps,
                              initial_run=initial_run,
                              qNet=mnet_para)
    initial_run = False
    mnet_para = qnet

# save the parameters of the final network to file
path = 'saved_models/medqn/'
# exist_ok avoids the race between checking for the directory and creating it
os.makedirs(path, exist_ok=True)

mnet_para_file = path + 'medqn.params_' + \
                str(exploration) +'_'+ \
                str(learning_type) + '_' + \
                str(agent_name)
mnet_para.save_parameters(mnet_para_file)

## Adaptation phase

In [3]:
from mxnet import gluon

def build_net(env, net):
    """Append the Q-network layers to ``net`` and return it.

    Inside ``net.name_scope()`` three dense layers are added in order: two
    hidden layers of 32 units with sigmoid activation, then an output layer
    with ``env.action_space.n`` units and no activation argument.

    Parameters
    ----------
    env : object exposing ``action_space.n``, used as the output layer width.
    net : container exposing ``name_scope()`` and ``add()``; the caller in
        this notebook passes a fresh ``gluon.nn.Sequential()``.

    Returns
    -------
    The same ``net`` object, with the layers added.
    """
    hidden_widths = (32, 32)
    with net.name_scope():
        for width in hidden_widths:
            net.add(gluon.nn.Dense(width, activation='sigmoid'))
        # one output unit per action
        net.add(gluon.nn.Dense(env.action_space.n))
    return net

In [4]:
# Adaptation phase: for each of 10 learner reward-weight configurations,
# rebuild the network, load the parameters saved by the training phase, run
# medqn with the settings below, and store config, rewards and KPIs in one
# .npy file per configuration.

# q-learning agent configurations (these override the training-phase values)
discount_factor = 0.3
alpha = 0.01
epsilon = 0.1
epsilon_decay = 0.5
decay_steps = 10
exploration = 'e_greedy'
initial_run = False

# values logged under the 'by_w_*' keys and embedded in the output file name
by_w_volte = 0.22
by_w_urllc = 0.22
by_w_video = 0.22

# parameter file written by the training phase; identical for every iteration
model_dir = 'saved_models/medqn/'
mnet_para_file = model_dir + 'medqn.params_' + \
                str(exploration) +'_'+ \
                str(learning_type) + '_' + \
                str(agent_name)

# results directory, created once up front (exist_ok avoids the
# check-then-create race and surfaces permission problems before training)
path = 'saved_models/me_accelerated/'
os.makedirs(path, exist_ok=True)

order = 0
for i in range(0, 10):
    # set the weights of the learner agent's reward function (looked up once per configuration)
    acc_weights = utils.get_reward_weights_acc(i, order)
    w_volte, w_urllc, w_video = acc_weights[0], acc_weights[1], acc_weights[2]

    print('i=%d: w_volte %f, w_urllc %f, w_video %f' %(i, w_volte, w_urllc, w_video))

    # initialize the OpenAI gym-compatible environment using the configured simulation parameters
    enviro = SlicingEnvironment(traffic_df, max_num_packets, max_size_per_tti, num_action_lvls,
                                num_slices, max_episode_timesteps, sl_win_size, time_quantum,
                                total_data_episodes, num_users_poisson, max_traffic_percentage,
                                max_trans_per_tti, w_volte, w_urllc, w_video,
                                c1_volte, c1_urllc, c1_video, c2_volte, c2_urllc, c2_video)
    env = enviro

    # build a new network and load the saved parameters for every
    # configuration, so each run starts from the same stored state
    mnet_para = build_net(env, gluon.nn.Sequential())
    mnet_para.load_parameters(mnet_para_file)

    qnet, stats = medqn.medqn(env,
                              num_episodes=total_episodes,
                              exploration=exploration,
                              gamma=discount_factor,
                              lr=alpha,
                              epsilon=epsilon,
                              epsilon_decay=epsilon_decay,
                              decay_steps=decay_steps,
                              initial_run=initial_run,
                              qNet=mnet_para)

    # log the trained agents' data
    dictionary = {'config': {'generic': {'max_episode_timesteps': max_episode_timesteps, 'total_episodes': total_episodes,
                         'agent_name': agent_name, 'max_size_per_tti': max_size_per_tti,
                         'max_traffic_percentage': max_traffic_percentage, 'num_action_lvls': num_action_lvls,
                         'num_slices': num_slices, 'sl_win_size': sl_win_size, 'max_trans_per_tti': max_trans_per_tti,
                         'w_volte': w_volte, 'w_urllc': w_urllc, 'w_video': w_video, 'by_w_volte': by_w_volte,
                         'by_w_urllc': by_w_urllc, 'by_w_video': by_w_video,
                         'c1_volte': c1_volte,'c2_volte': c2_volte, 'c1_urllc': c1_urllc, 'c2_urllc': c2_urllc,
                         'c1_video': c1_video, 'c2_video': c2_video,
                         'learning_type': learning_type},
                         'agent_specific': {'discount_factor': discount_factor, 'alpha': alpha,
                                            'epsilon': epsilon, 'epsilon_decay': epsilon_decay,
                                            'decay_steps': decay_steps}
                        },
              'rewards': {'steps': env.step_rewards, 'episodes': list(stats[1])},
              'KPIs': {'delay': env.total_avg_waiting_times,
                       'throughput': env.total_throughputs, 'finished_throughput': env.finished_throughputs,
                       'remaining_sizes_sum': env.remaining_sizes_sum, 'remaining_sizes': env.remaining_sizes,
                       'remaining_times_sum': env.remaining_times_sum, 'remaining_times': env.remaining_times,
                       'total_p_numbers': env.total_p_numbers, 'done_p_numbers': env.done_p_numbers
                     }}

    # save training data to file
    # NOTE(review): int(w*100) truncates, so a weight such as 0.29 is written
    # as 28, and the three numbers are concatenated without a separator. Kept
    # as-is so existing file names stay loadable -- confirm before changing.
    file_name = path + str(learning_type) + '_' + str(agent_name) + '_' + \
                str(int(w_volte*100)) + str(int(w_urllc*100)) + str(int(w_video*100)) + '_by_' + \
                str(int(by_w_volte*100)) + str(int(by_w_urllc*100)) + str(int(by_w_video*100)) + '_ep.npy'
    # np.save pickles the dict inside a 0-d object array; read it back with
    # np.load(file_name, allow_pickle=True).item()
    np.save(file_name, dictionary)

i=0: w_volte 0.500000, w_urllc 0.400000, w_video 0.100000
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 47,egreedy 0.050000
eps 1,reward 47,egreedy 0.025000
eps 2,reward 46,egreedy 0.012500
eps 3,reward 46,egreedy 0.006250
eps 4,reward 46,egreedy 0.003125
eps 5,reward 65,egreedy 0.001563
eps 6,reward 70,egreedy 0.000781
eps 7,reward 71,egreedy 0.000391
eps 8,reward 71,egreedy 0.000195
target network parameters replaced
eps 9,reward 71,egreedy 0.000098
eps 10,reward 71,egreedy 0.000098
eps 11,reward 71,egreedy 0.000098
eps 12,reward 71,egreedy 0.000098
eps 13,reward 71,egreedy 0.000098
eps 14,reward 71,egreedy 0.000098
eps 15,reward 71,egreedy 0.000098
eps 16,reward 71,egreedy 0.000098
eps 17,reward 71,egreedy 0.000098
eps 18,reward 71,egreedy 0.000098
target network parameters replaced
eps 19,reward 71,egreedy 0.000098
eps 20,reward 71,egreedy 0.000

eps 46,reward 83,egreedy 0.000098
eps 47,reward 84,egreedy 0.000098
eps 48,reward 85,egreedy 0.000098
target network parameters replaced
eps 49,reward 85,egreedy 0.000098
i=4: w_volte 0.800000, w_urllc 0.150000, w_video 0.050000
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 31,egreedy 0.050000
eps 1,reward 32,egreedy 0.025000
eps 2,reward 29,egreedy 0.012500
eps 3,reward 28,egreedy 0.006250
eps 4,reward 28,egreedy 0.003125
eps 5,reward 78,egreedy 0.001563
eps 6,reward 84,egreedy 0.000781
eps 7,reward 85,egreedy 0.000391
eps 8,reward 85,egreedy 0.000195
target network parameters replaced
eps 9,reward 85,egreedy 0.000098
eps 10,reward 85,egreedy 0.000098
eps 11,reward 85,egreedy 0.000098
eps 12,reward 85,egreedy 0.000098
eps 13,reward 85,egreedy 0.000098
eps 14,reward 85,egreedy 0.000098
eps 15,reward 85,egreedy 0.000098
eps 16,reward 85,egreedy 0.000

eps 41,reward 67,egreedy 0.000098
eps 42,reward 67,egreedy 0.000098
eps 43,reward 67,egreedy 0.000098
eps 44,reward 67,egreedy 0.000098
eps 45,reward 67,egreedy 0.000098
eps 46,reward 67,egreedy 0.000098
eps 47,reward 67,egreedy 0.000098
eps 48,reward 67,egreedy 0.000098
target network parameters replaced
eps 49,reward 67,egreedy 0.000098
i=8: w_volte 0.400000, w_urllc 0.350000, w_video 0.250000
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 40,egreedy 0.050000
eps 1,reward 40,egreedy 0.025000
eps 2,reward 40,egreedy 0.012500
eps 3,reward 40,egreedy 0.006250
eps 4,reward 40,egreedy 0.003125
eps 5,reward 57,egreedy 0.001563
eps 6,reward 60,egreedy 0.000781
eps 7,reward 60,egreedy 0.000391
eps 8,reward 61,egreedy 0.000195
target network parameters replaced
eps 9,reward 60,egreedy 0.000098
eps 10,reward 61,egreedy 0.000098
eps 11,reward 61,egreedy 0.000

eps 37,reward 55,egreedy 0.000098
eps 38,reward 55,egreedy 0.000098
target network parameters replaced
eps 39,reward 55,egreedy 0.000098
eps 40,reward 55,egreedy 0.000098
eps 41,reward 55,egreedy 0.000098
eps 42,reward 55,egreedy 0.000098
eps 43,reward 55,egreedy 0.000098
eps 44,reward 55,egreedy 0.000098
eps 45,reward 55,egreedy 0.000098
eps 46,reward 55,egreedy 0.000098
eps 47,reward 55,egreedy 0.000098
eps 48,reward 55,egreedy 0.000098
target network parameters replaced
eps 49,reward 55,egreedy 0.000098
i=2: w_volte 0.400000, w_urllc 0.250000, w_video 0.350000
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 32,egreedy 0.050000
eps 1,reward 32,egreedy 0.025000
eps 2,reward 31,egreedy 0.012500
eps 3,reward 30,egreedy 0.006250
eps 4,reward 30,egreedy 0.003125
eps 5,reward 63,egreedy 0.001563
eps 6,reward 69,egreedy 0.000781
eps 7,reward 69,egreedy 0.0

eps 32,reward 72,egreedy 0.000098
eps 33,reward 72,egreedy 0.000098
eps 34,reward 72,egreedy 0.000098
eps 35,reward 72,egreedy 0.000098
eps 36,reward 72,egreedy 0.000098
eps 37,reward 72,egreedy 0.000098
eps 38,reward 72,egreedy 0.000098
target network parameters replaced
eps 39,reward 72,egreedy 0.000098
eps 40,reward 72,egreedy 0.000098
eps 41,reward 72,egreedy 0.000098
eps 42,reward 72,egreedy 0.000098
eps 43,reward 72,egreedy 0.000098
eps 44,reward 72,egreedy 0.000098
eps 45,reward 72,egreedy 0.000098
eps 46,reward 63,egreedy 0.000098
eps 47,reward 72,egreedy 0.000098
eps 48,reward 72,egreedy 0.000098
target network parameters replaced
eps 49,reward 72,egreedy 0.000098
i=6: w_volte 0.200000, w_urllc 0.150000, w_video 0.650000
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz):500
batch_size(bs):32
Target_update(tu):1000
------------------------------------
eps 0,reward 20,egreedy 0.050000
eps 1,reward 18,egreedy 0.025000
eps 2,reward 18,egreed

eps 28,reward 82,egreedy 0.000098
target network parameters replaced
eps 29,reward 82,egreedy 0.000098
eps 30,reward 82,egreedy 0.000098
eps 31,reward 82,egreedy 0.000098
eps 32,reward 82,egreedy 0.000098
eps 33,reward 82,egreedy 0.000098
eps 34,reward 82,egreedy 0.000098
eps 35,reward 82,egreedy 0.000098
eps 36,reward 82,egreedy 0.000098
eps 37,reward 82,egreedy 0.000098
eps 38,reward 82,egreedy 0.000098
target network parameters replaced
eps 39,reward 81,egreedy 0.000098
eps 40,reward 82,egreedy 0.000098
eps 41,reward 82,egreedy 0.000098
eps 42,reward 82,egreedy 0.000098
eps 43,reward 81,egreedy 0.000098
eps 44,reward 82,egreedy 0.000098
eps 45,reward 82,egreedy 0.000098
eps 46,reward 81,egreedy 0.000098
eps 47,reward 82,egreedy 0.000098
eps 48,reward 82,egreedy 0.000098
target network parameters replaced
eps 49,reward 81,egreedy 0.000098
i=0: w_volte 0.100000, w_urllc 0.350000, w_video 0.550000
------------------------------------
replay_buffer_size(rbz):20000
replay_start_size(rsz)

eps 23,reward 68,egreedy 0.000098
eps 24,reward 68,egreedy 0.000098
eps 25,reward 68,egreedy 0.000098
eps 26,reward 68,egreedy 0.000098
eps 27,reward 68,egreedy 0.000098
eps 28,reward 68,egreedy 0.000098
target network parameters replaced
eps 29,reward 68,egreedy 0.000098
eps 30,reward 68,egreedy 0.000098
eps 31,reward 68,egreedy 0.000098
eps 32,reward 68,egreedy 0.000098
eps 33,reward 67,egreedy 0.000098
eps 34,reward 68,egreedy 0.000098
eps 35,reward 67,egreedy 0.000098
eps 36,reward 67,egreedy 0.000098
eps 37,reward 68,egreedy 0.000098
eps 38,reward 67,egreedy 0.000098
target network parameters replaced
eps 39,reward 70,egreedy 0.000098
eps 40,reward 76,egreedy 0.000098
eps 41,reward 76,egreedy 0.000098
eps 42,reward 76,egreedy 0.000098
eps 43,reward 76,egreedy 0.000098
eps 44,reward 76,egreedy 0.000098
eps 45,reward 76,egreedy 0.000098
eps 46,reward 76,egreedy 0.000098
eps 47,reward 76,egreedy 0.000098
eps 48,reward 76,egreedy 0.000098
target network parameters replaced
eps 49,rewa

target network parameters replaced
eps 19,reward 68,egreedy 0.000098
eps 20,reward 68,egreedy 0.000098
eps 21,reward 68,egreedy 0.000098
eps 22,reward 68,egreedy 0.000098
eps 23,reward 68,egreedy 0.000098
eps 24,reward 68,egreedy 0.000098
eps 25,reward 68,egreedy 0.000098
eps 26,reward 68,egreedy 0.000098
eps 27,reward 68,egreedy 0.000098
eps 28,reward 68,egreedy 0.000098
target network parameters replaced
eps 29,reward 68,egreedy 0.000098
eps 30,reward 68,egreedy 0.000098
eps 31,reward 68,egreedy 0.000098
eps 32,reward 68,egreedy 0.000098
eps 33,reward 68,egreedy 0.000098
eps 34,reward 68,egreedy 0.000098
eps 35,reward 68,egreedy 0.000098
eps 36,reward 68,egreedy 0.000098
eps 37,reward 68,egreedy 0.000098
eps 38,reward 68,egreedy 0.000098
target network parameters replaced
eps 39,reward 68,egreedy 0.000098
eps 40,reward 68,egreedy 0.000098
eps 41,reward 68,egreedy 0.000098
eps 42,reward 66,egreedy 0.000098
eps 43,reward 68,egreedy 0.000098
eps 44,reward 68,egreedy 0.000098
eps 45,rewa

eps 14,reward 53,egreedy 0.000098
eps 15,reward 53,egreedy 0.000098
eps 16,reward 53,egreedy 0.000098
eps 17,reward 53,egreedy 0.000098
eps 18,reward 53,egreedy 0.000098
target network parameters replaced
eps 19,reward 53,egreedy 0.000098
eps 20,reward 53,egreedy 0.000098
eps 21,reward 53,egreedy 0.000098
eps 22,reward 53,egreedy 0.000098
eps 23,reward 53,egreedy 0.000098
eps 24,reward 53,egreedy 0.000098
eps 25,reward 53,egreedy 0.000098
eps 26,reward 53,egreedy 0.000098
eps 27,reward 53,egreedy 0.000098
eps 28,reward 53,egreedy 0.000098
target network parameters replaced
eps 29,reward 53,egreedy 0.000098
eps 30,reward 53,egreedy 0.000098
eps 31,reward 53,egreedy 0.000098
eps 32,reward 53,egreedy 0.000098
eps 33,reward 53,egreedy 0.000098
eps 34,reward 53,egreedy 0.000098
eps 35,reward 53,egreedy 0.000098
eps 36,reward 52,egreedy 0.000098
eps 37,reward 52,egreedy 0.000098
eps 38,reward 53,egreedy 0.000098
target network parameters replaced
eps 39,reward 53,egreedy 0.000098
eps 40,rewa

eps 10,reward 79,egreedy 0.000098
eps 11,reward 79,egreedy 0.000098
eps 12,reward 79,egreedy 0.000098
eps 13,reward 79,egreedy 0.000098
eps 14,reward 79,egreedy 0.000098
eps 15,reward 79,egreedy 0.000098
eps 16,reward 79,egreedy 0.000098
eps 17,reward 79,egreedy 0.000098
eps 18,reward 79,egreedy 0.000098
target network parameters replaced
eps 19,reward 79,egreedy 0.000098
eps 20,reward 79,egreedy 0.000098
eps 21,reward 79,egreedy 0.000098
eps 22,reward 79,egreedy 0.000098
eps 23,reward 79,egreedy 0.000098
eps 24,reward 79,egreedy 0.000098
eps 25,reward 79,egreedy 0.000098
eps 26,reward 79,egreedy 0.000098
eps 27,reward 79,egreedy 0.000098
eps 28,reward 79,egreedy 0.000098
target network parameters replaced
eps 29,reward 79,egreedy 0.000098
eps 30,reward 79,egreedy 0.000098
eps 31,reward 79,egreedy 0.000098
eps 32,reward 79,egreedy 0.000098
eps 33,reward 79,egreedy 0.000098
eps 34,reward 79,egreedy 0.000098
eps 35,reward 79,egreedy 0.000098
eps 36,reward 79,egreedy 0.000098
eps 37,rewar

eps 6,reward 74,egreedy 0.000781
eps 7,reward 77,egreedy 0.000391
eps 8,reward 76,egreedy 0.000195
target network parameters replaced
eps 9,reward 77,egreedy 0.000098
eps 10,reward 76,egreedy 0.000098
eps 11,reward 77,egreedy 0.000098
eps 12,reward 77,egreedy 0.000098
eps 13,reward 76,egreedy 0.000098
eps 14,reward 76,egreedy 0.000098
eps 15,reward 76,egreedy 0.000098
eps 16,reward 76,egreedy 0.000098
eps 17,reward 76,egreedy 0.000098
eps 18,reward 77,egreedy 0.000098
target network parameters replaced
eps 19,reward 77,egreedy 0.000098
eps 20,reward 77,egreedy 0.000098
eps 21,reward 77,egreedy 0.000098
eps 22,reward 76,egreedy 0.000098
eps 23,reward 76,egreedy 0.000098
eps 24,reward 76,egreedy 0.000098
eps 25,reward 76,egreedy 0.000098
eps 26,reward 77,egreedy 0.000098
eps 27,reward 76,egreedy 0.000098
eps 28,reward 76,egreedy 0.000098
target network parameters replaced
eps 29,reward 76,egreedy 0.000098
eps 30,reward 76,egreedy 0.000098
eps 31,reward 76,egreedy 0.000098
eps 32,reward 7

eps 1,reward 44,egreedy 0.025000
eps 2,reward 43,egreedy 0.012500
eps 3,reward 44,egreedy 0.006250
eps 4,reward 44,egreedy 0.003125
eps 5,reward 55,egreedy 0.001563
eps 6,reward 57,egreedy 0.000781
eps 7,reward 57,egreedy 0.000391
eps 8,reward 56,egreedy 0.000195
target network parameters replaced
eps 9,reward 57,egreedy 0.000098
eps 10,reward 57,egreedy 0.000098
eps 11,reward 57,egreedy 0.000098
eps 12,reward 57,egreedy 0.000098
eps 13,reward 57,egreedy 0.000098
eps 14,reward 57,egreedy 0.000098
eps 15,reward 57,egreedy 0.000098
eps 16,reward 57,egreedy 0.000098
eps 17,reward 57,egreedy 0.000098
eps 18,reward 57,egreedy 0.000098
target network parameters replaced
eps 19,reward 57,egreedy 0.000098
eps 20,reward 57,egreedy 0.000098
eps 21,reward 57,egreedy 0.000098
eps 22,reward 57,egreedy 0.000098
eps 23,reward 57,egreedy 0.000098
eps 24,reward 57,egreedy 0.000098
eps 25,reward 57,egreedy 0.000098
eps 26,reward 57,egreedy 0.000098
eps 27,reward 57,egreedy 0.000098
eps 28,reward 57,egre

eps 0,reward 46,egreedy 0.050000
eps 1,reward 46,egreedy 0.025000
eps 2,reward 45,egreedy 0.012500
eps 3,reward 46,egreedy 0.006250
eps 4,reward 46,egreedy 0.003125
eps 5,reward 62,egreedy 0.001563
eps 6,reward 67,egreedy 0.000781
eps 7,reward 66,egreedy 0.000391
eps 8,reward 67,egreedy 0.000195
target network parameters replaced
eps 9,reward 67,egreedy 0.000098
eps 10,reward 66,egreedy 0.000098
eps 11,reward 66,egreedy 0.000098
eps 12,reward 66,egreedy 0.000098
eps 13,reward 66,egreedy 0.000098
eps 14,reward 67,egreedy 0.000098
eps 15,reward 66,egreedy 0.000098
eps 16,reward 66,egreedy 0.000098
eps 17,reward 66,egreedy 0.000098
eps 18,reward 65,egreedy 0.000098
target network parameters replaced
eps 19,reward 66,egreedy 0.000098
eps 20,reward 67,egreedy 0.000098
eps 21,reward 67,egreedy 0.000098
eps 22,reward 67,egreedy 0.000098
eps 23,reward 67,egreedy 0.000098
eps 24,reward 67,egreedy 0.000098
eps 25,reward 67,egreedy 0.000098
eps 26,reward 67,egreedy 0.000098
eps 27,reward 67,egree

eps 0,reward 67,egreedy 0.050000
eps 1,reward 68,egreedy 0.025000
eps 2,reward 69,egreedy 0.012500
eps 3,reward 70,egreedy 0.006250
eps 4,reward 70,egreedy 0.003125
eps 5,reward 69,egreedy 0.001563
eps 6,reward 67,egreedy 0.000781
eps 7,reward 62,egreedy 0.000391
eps 8,reward 69,egreedy 0.000195
target network parameters replaced
eps 9,reward 70,egreedy 0.000098
eps 10,reward 70,egreedy 0.000098
eps 11,reward 70,egreedy 0.000098
eps 12,reward 70,egreedy 0.000098
eps 13,reward 69,egreedy 0.000098
eps 14,reward 70,egreedy 0.000098
eps 15,reward 70,egreedy 0.000098
eps 16,reward 70,egreedy 0.000098
eps 17,reward 70,egreedy 0.000098
eps 18,reward 70,egreedy 0.000098
target network parameters replaced
eps 19,reward 70,egreedy 0.000098
eps 20,reward 70,egreedy 0.000098
eps 21,reward 70,egreedy 0.000098
eps 22,reward 70,egreedy 0.000098
eps 23,reward 69,egreedy 0.000098
eps 24,reward 70,egreedy 0.000098
eps 25,reward 70,egreedy 0.000098
eps 26,reward 70,egreedy 0.000098
eps 27,reward 70,egree

eps 0,reward 57,egreedy 0.050000
eps 1,reward 57,egreedy 0.025000
eps 2,reward 58,egreedy 0.012500
eps 3,reward 59,egreedy 0.006250
eps 4,reward 59,egreedy 0.003125
eps 5,reward 50,egreedy 0.001563
eps 6,reward 59,egreedy 0.000781
eps 7,reward 53,egreedy 0.000391
eps 8,reward 59,egreedy 0.000195
target network parameters replaced
eps 9,reward 57,egreedy 0.000098
eps 10,reward 59,egreedy 0.000098
eps 11,reward 59,egreedy 0.000098
eps 12,reward 59,egreedy 0.000098
eps 13,reward 59,egreedy 0.000098
eps 14,reward 59,egreedy 0.000098
eps 15,reward 59,egreedy 0.000098
eps 16,reward 59,egreedy 0.000098
eps 17,reward 59,egreedy 0.000098
eps 18,reward 59,egreedy 0.000098
target network parameters replaced
eps 19,reward 59,egreedy 0.000098
eps 20,reward 59,egreedy 0.000098
eps 21,reward 59,egreedy 0.000098
eps 22,reward 59,egreedy 0.000098
eps 23,reward 59,egreedy 0.000098
eps 24,reward 58,egreedy 0.000098
eps 25,reward 59,egreedy 0.000098
eps 26,reward 59,egreedy 0.000098
eps 27,reward 58,egree

eps 0,reward 24,egreedy 0.050000
eps 1,reward 24,egreedy 0.025000
eps 2,reward 22,egreedy 0.012500
eps 3,reward 21,egreedy 0.006250
eps 4,reward 21,egreedy 0.003125
eps 5,reward 66,egreedy 0.001563
eps 6,reward 85,egreedy 0.000781
eps 7,reward 85,egreedy 0.000391
eps 8,reward 85,egreedy 0.000195
target network parameters replaced
eps 9,reward 85,egreedy 0.000098
eps 10,reward 85,egreedy 0.000098
eps 11,reward 85,egreedy 0.000098
eps 12,reward 85,egreedy 0.000098
eps 13,reward 85,egreedy 0.000098
eps 14,reward 85,egreedy 0.000098
eps 15,reward 85,egreedy 0.000098
eps 16,reward 85,egreedy 0.000098
eps 17,reward 85,egreedy 0.000098
eps 18,reward 85,egreedy 0.000098
target network parameters replaced
eps 19,reward 85,egreedy 0.000098
eps 20,reward 85,egreedy 0.000098
eps 21,reward 85,egreedy 0.000098
eps 22,reward 85,egreedy 0.000098
eps 23,reward 85,egreedy 0.000098
eps 24,reward 85,egreedy 0.000098
eps 25,reward 85,egreedy 0.000098
eps 26,reward 85,egreedy 0.000098
eps 27,reward 85,egree

eps 0,reward 30,egreedy 0.050000
eps 1,reward 31,egreedy 0.025000
eps 2,reward 29,egreedy 0.012500
eps 3,reward 28,egreedy 0.006250
eps 4,reward 28,egreedy 0.003125
eps 5,reward 62,egreedy 0.001563
eps 6,reward 65,egreedy 0.000781
eps 7,reward 67,egreedy 0.000391
eps 8,reward 67,egreedy 0.000195
target network parameters replaced
eps 9,reward 66,egreedy 0.000098
eps 10,reward 66,egreedy 0.000098
eps 11,reward 66,egreedy 0.000098
eps 12,reward 66,egreedy 0.000098
eps 13,reward 67,egreedy 0.000098
eps 14,reward 66,egreedy 0.000098
eps 15,reward 66,egreedy 0.000098
eps 16,reward 66,egreedy 0.000098
eps 17,reward 66,egreedy 0.000098
eps 18,reward 66,egreedy 0.000098
target network parameters replaced
eps 19,reward 66,egreedy 0.000098
eps 20,reward 67,egreedy 0.000098
eps 21,reward 66,egreedy 0.000098
eps 22,reward 66,egreedy 0.000098
eps 23,reward 66,egreedy 0.000098
eps 24,reward 66,egreedy 0.000098
eps 25,reward 67,egreedy 0.000098
eps 26,reward 67,egreedy 0.000098
eps 27,reward 66,egree

eps 0,reward 20,egreedy 0.050000
eps 1,reward 20,egreedy 0.025000
eps 2,reward 18,egreedy 0.012500
eps 3,reward 17,egreedy 0.006250
eps 4,reward 17,egreedy 0.003125
eps 5,reward 70,egreedy 0.001563
eps 6,reward 76,egreedy 0.000781
eps 7,reward 83,egreedy 0.000391
eps 8,reward 83,egreedy 0.000195
target network parameters replaced
eps 9,reward 83,egreedy 0.000098
eps 10,reward 83,egreedy 0.000098
eps 11,reward 83,egreedy 0.000098
eps 12,reward 83,egreedy 0.000098
eps 13,reward 83,egreedy 0.000098
eps 14,reward 83,egreedy 0.000098
eps 15,reward 83,egreedy 0.000098
eps 16,reward 83,egreedy 0.000098
eps 17,reward 83,egreedy 0.000098
eps 18,reward 83,egreedy 0.000098
target network parameters replaced
eps 19,reward 83,egreedy 0.000098
eps 20,reward 83,egreedy 0.000098
eps 21,reward 83,egreedy 0.000098
eps 22,reward 83,egreedy 0.000098
eps 23,reward 83,egreedy 0.000098
eps 24,reward 83,egreedy 0.000098
eps 25,reward 83,egreedy 0.000098
eps 26,reward 83,egreedy 0.000098
eps 27,reward 83,egree

IndexError: list index out of range