In [None]:
# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sys
# Add the parent directory to the import path (only once) so that the `lib`
# imports below can be resolved.
# NOTE(review): assumes the notebook is run from a direct sub-directory of the
# directory that contains `lib` — confirm.
if "../" not in sys.path:
  sys.path.append("../")
from lib.envs.slicing_env import SlicingEnvironment
from lib.agents.tforce import TensorforceAgent
from tensorforce import Environment, Agent
from lib import utils
from scipy.stats import poisson
# Global plotting style for every figure created after this point.
matplotlib.style.use('ggplot')

In [None]:
# Configuring the simulation
#
# Every tunable value used by the later cells is defined here.

# Single source of truth for the seed: it seeds NumPy's global RNG here and is
# also handed to TensorforceAgent when the agent is built.
random_seed = 2021
np.random.seed(random_seed)

# Number of DRL agent timesteps per episode
max_episode_timesteps = 2

total_data_episodes = 1

# Number of DRL agent episodes (episodes are used only to make the results
# easier to visualize)
total_episodes = 50000

# Number of users per slice in the following order: VoLTE, Video, URLLC
# (integer division: 46 // 4 == 11, 8 // 4 == 2)
num_users = [46 // 4, 46 // 4, 8 // 4]

# Per-slice user-count series: constant arrays of shape (1, 200) filled with 1.
poisson_volte = np.full((1, 200), 1)
poisson_video = np.full((1, 200), 1)
poisson_urllc = np.full((1, 200), 1)

# Largest value of each series, in the order VoLTE, Video, URLLC.
max_num_users = [max(poisson_volte[0]), max(poisson_video[0]), max(poisson_urllc[0])]

# NOTE(review): this list is ordered Video, VoLTE, URLLC, which differs from
# `max_num_users` above. It is harmless while all three series are identical,
# but confirm the order SlicingEnvironment expects before making them differ.
num_users_poisson = [poisson_video[0], poisson_volte[0], poisson_urllc[0]]

# --- Agent selection --------------------------------------------------------
agent_name = 'ppo'
learning_type = 'non_accelerated'

# --- Environment parameters (passed positionally to SlicingEnvironment) -----
max_traffic_percentage = 1
num_action_lvls = 15
num_slices = 3
sl_win_size = 40
time_quantum = 1

max_size_per_tti = 40
max_num_packets = 0

max_trans_per_tti = 6

c1_volte = 0.5
c2_volte = 10
c1_urllc = 2
c2_urllc = 3
c1_video = 1
c2_video = 7

num_traffic_var = 2

reward_function_type = 'simple'
traffic_pattern = 4

# --- Agent hyper-parameters (passed positionally to TensorforceAgent) -------
discount_factor = 0.5
loaded_qtable = 'no'
batch_size = 4
memory = 20000
device = 'CPU'
learning_rate = 0.001

# --- Epsilon-greedy schedule used by the training loop ----------------------
# Start value, multiplicative decay factor, and the value used once
# `decay_steps` episodes have elapsed.
epsilon = 0.9
epsilon_decay = 0.99
final_value = 0.01
decay_steps = 16000
# Decay is applied on episodes that are a multiple of this value.
epsilon_every_x_steps = 50
# From this episode onwards the training loop sets epsilon to 0.
max_epsilon_steps = 18000

In [None]:
# Generate the traffic data. The first three arguments are the per-slice
# maxima from `max_num_users` (VoLTE, Video, URLLC); the fourth is the product
# sl_win_size * max_episode_timesteps. The result is re-indexed from 0.
volte_max, video_max, urllc_max = max_num_users
trace_length = sl_win_size * max_episode_timesteps
traffic_df = utils.generate_data(
    volte_max, video_max, urllc_max, trace_length, traffic_pattern
).reset_index(drop=True)

In [None]:
# Preview the first rows whose 'type' column equals 'volte'.
is_volte = traffic_df['type'] == 'volte'
traffic_df.loc[is_volte].head()

In [None]:
# Per-slice weights handed to the environment (they sum to 1.0).
w_volte, w_urllc, w_video = 0.1, 0.7, 0.2

# Build the custom slicing environment. All arguments are positional, so the
# order below must stay exactly as the SlicingEnvironment constructor expects.
enviro = SlicingEnvironment(
    traffic_df,
    max_num_packets,
    max_size_per_tti,
    num_action_lvls,
    num_slices,
    max_episode_timesteps,
    sl_win_size,
    time_quantum,
    total_data_episodes,
    num_users_poisson,
    max_traffic_percentage,
    max_trans_per_tti,
    w_volte, w_urllc, w_video,
    c1_volte, c1_urllc, c1_video,
    c2_volte, c2_urllc, c2_video,
    num_traffic_var,
    reward_function_type,
)

# Wrap it as a Tensorforce environment with a fixed episode length.
environment = Environment.create(
    environment=enviro,
    max_episode_timesteps=max_episode_timesteps,
)

# Choose the algorithm (via `agent_name`) and create the Tensorforce agent.
# Positional arguments again: keep the order unchanged.
slicing_agent = TensorforceAgent(
    agent_name,
    environment,
    batch_size,
    memory,
    epsilon,
    epsilon_decay,
    loaded_qtable,
    decay_steps,
    device,
    learning_rate,
    final_value,
    random_seed,
    discount_factor,
)

# Bookkeeping consumed by the training loop: total reward per episode and a
# running count of environment steps.
ep_rewards = dict()
step = 0

In [None]:
# Display the seed stored in the underlying agent's config, as a quick check
# against the `random_seed` passed to TensorforceAgent.
slicing_agent.agent.config.seed

In [None]:
# Training loop.
#
# Each episode is rolled out with independent `act` calls, the transitions are
# recorded, and the whole episode is then fed back through `experience` and
# `update`. An extra epsilon-greedy layer may replace the agent's action with a
# uniformly random one.
#
# All loop state is initialised here, and the configured `epsilon` is copied
# into a local variable, not mutated. Re-running this cell therefore
# restarts the exploration schedule from its configured start value; it used to
# continue from whatever a previous run left behind (0 after a full run).
# The agent itself is NOT re-created here, so its learned weights persist.
exploration_eps = epsilon
ep_rewards = {}
step = 0

for episode in range(total_episodes):
    # Hard overrides of the schedule: a fixed floor once `decay_steps` episodes
    # have elapsed, then no exploration at all from `max_epsilon_steps` onwards.
    if episode >= decay_steps:
        exploration_eps = final_value
    if episode >= max_epsilon_steps:
        exploration_eps = 0

    # Record episode experience
    episode_states = list()
    episode_internals = list()
    episode_actions = list()
    episode_terminal = list()
    episode_reward = list()
    states = environment.reset()

    # Episode using independent-act and agent.initial_internals()
    internals = slicing_agent.agent.initial_internals()
    terminal = False
    sum_rewards = 0.0

    while not terminal:
        print('Episode: ' + str(episode) + ', Step: ' + str(step))
        print("---current observation: ", states)
        print("---agent algorithm: ", slicing_agent.agent_algorithm)
        episode_states.append(states)
        episode_internals.append(internals)
        actions, internals = slicing_agent.agent.act(
            states=states, internals=internals, independent=True, deterministic=False
        )

        # Epsilon-greedy: with probability `exploration_eps`, discard the
        # agent's action and draw a random one.
        # NOTE(review): the random action is a single integer in
        # [0, num_action_lvls); confirm this matches the environment's action
        # specification.
        print('epsilon is: ', exploration_eps)
        p = np.random.random()
        if p < exploration_eps:
            actions = np.random.choice(num_action_lvls)

        # Multiplicative decay on every `epsilon_every_x_steps`-th episode
        # (episode 0 excluded) while still above the floor.
        # NOTE(review): this check sits inside the timestep loop, so on a
        # qualifying episode the decay is applied once per timestep — confirm
        # that per-timestep (not per-episode) decay is intended.
        if (0 < episode < decay_steps
                and episode % epsilon_every_x_steps == 0
                and exploration_eps > final_value):
            exploration_eps = exploration_eps * epsilon_decay

        print("---agent action: ", actions)
        episode_actions.append(actions)
        states, terminal, reward = environment.execute(actions=actions)
        print("---reward, done: ", reward, terminal)
        episode_terminal.append(terminal)
        episode_reward.append(reward)
        sum_rewards += reward
        print('cumulative episode reward: ', sum_rewards)
        step += 1

    print('Episode {}: {}'.format(episode, sum_rewards))
    ep_rewards[episode] = sum_rewards
    print("End episode: ", episode)
    print('episode total reward: ', ep_rewards[episode])

    # Feed recorded experience to agent
    slicing_agent.agent.experience(
        states=episode_states, internals=episode_internals, actions=episode_actions,
        terminal=episode_terminal, reward=episode_reward
    )

    # Perform update
    slicing_agent.agent.update()

In [None]:
# Per-episode total rewards, in the order the episodes were recorded.
reward_list = [*ep_rewards.values()]

In [None]:
# Sum the per-episode rewards over consecutive buckets of 1000 episodes
# (the final bucket may hold fewer).
slicing_rewards = []
for bucket_start in range(0, len(reward_list), 1000):
    slicing_rewards.append(sum(reward_list[bucket_start:bucket_start + 1000]))

# Pad the series with 400 repetitions of the last bucket's value.
slicing_rewards_temp = slicing_rewards + [slicing_rewards[-1]] * 400

# Express every bucket as a percentage of the largest bucket sum.
# NOTE(review): this scaling is only meaningful when that maximum is positive.
largest_bucket = max(slicing_rewards_temp)
temp_np_slicing = np.array(slicing_rewards_temp) / largest_bucket
normalized_np_slicing = temp_np_slicing * 100

In [None]:
# Learning curve: normalised reward for the first 20 buckets of 1000 episodes.

# Matplotlib renamed the bundled 'seaborn' style to 'seaborn-v0_8' in 3.6 and
# removed the old name in 3.8, so pick whichever one this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
# The style is applied BEFORE the figure is created: style settings only affect
# artists created afterwards, so selecting it after plt.subplots() would leave
# the figure and axes in the previously active style.
plt.style.use(seaborn_style)

fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title('Learning Performance (' + str(slicing_agent.agent_algorithm) + ' agent)')

# x positions are bucket indices 1..20 scaled by the bucket size (1000 episodes).
bucket_positions = np.arange(1, len(normalized_np_slicing) + 1)[0:20] * 1000
ax.plot(bucket_positions, normalized_np_slicing[0:20], label='Reward', marker="", linestyle="-")

ax.set_xlabel('Learning Step')
ax.set_ylabel('Rewards')
ax.legend(prop={'size': 12})