### 1. Start the Environment

In [1]:
import numpy as np
from collections import defaultdict
import random
# !pip3 install git+https://github.com/slremy/netsapi --user --upgrade
from netsapi.challenge import * 

In [2]:
env = ChallengeSeqDecEnvironment()

### 2. Examine the State and Action Spaces

In [3]:
num_agents=1
states = env.state
states = np.array([states]).reshape(1, 1)
state_size = states.shape[1]
action_size = 2

In [4]:
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states)

Size of each action: 2
There are 1 agents. Each observes a state with length: 1
The state for the first agent looks like: [[1]]


### 3. Take Random Actions in the Environment

In [5]:
env.reset()     # reset the environment    
states = np.array([env.state]).reshape(1, 1) # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
while True:
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, 0, 1)                  # all actions between -1 and 1
    print(actions)
 
    next_states, reward, done, _ = env.evaluateAction(actions[0])           # send all actions to tne environment
    scores += reward                        # update the score (for each agent)
    print("reward=",reward)
    states = next_states                              # roll over states to next time step
    if np.any(done): 
        print(done)
        # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))

[[0.41783116 0.        ]]
105  Evaluations Remaining
reward= 15.722238142315206
[[1.         0.56580304]]
104  Evaluations Remaining
reward= -0.9405778599244843
[[0. 0.]]
103  Evaluations Remaining
reward= 0.2450055236874551
[[0.         0.57310681]]
102  Evaluations Remaining
reward= 34.03852430593001
[[0.         0.93473647]]
101  Evaluations Remaining
reward= 15.078005187860356
True
Total score (averaged over agents) this episode: 64.14319529986855


In [6]:
env.reset()     # reset the environment    
i = 0
policies = []
policy = {}
while True:
    
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, 0, 1)                  # all actions between -1 and 1
    i +=1
    next_states, reward, done, _ = env.evaluateAction(actions[0])           # send all actions to tne environment
    policy[str(i)]=list(actions[0])
    policies.append(policy)
    if done: 
        break
    
print(policies)
reward = env.evaluatePolicy(policies)            
print('results=',reward)
print('sum(results)=',sum(reward))

100  Evaluations Remaining
99  Evaluations Remaining
98  Evaluations Remaining
97  Evaluations Remaining
96  Evaluations Remaining
[{'1': [0.0, 0.0], '2': [0.0, 0.0], '3': [0.0, 0.5839210770664475], '4': [0.8993832428287231, 0.0], '5': [0.5537942725063796, 0.0]}, {'1': [0.0, 0.0], '2': [0.0, 0.0], '3': [0.0, 0.5839210770664475], '4': [0.8993832428287231, 0.0], '5': [0.5537942725063796, 0.0]}, {'1': [0.0, 0.0], '2': [0.0, 0.0], '3': [0.0, 0.5839210770664475], '4': [0.8993832428287231, 0.0], '5': [0.5537942725063796, 0.0]}, {'1': [0.0, 0.0], '2': [0.0, 0.0], '3': [0.0, 0.5839210770664475], '4': [0.8993832428287231, 0.0], '5': [0.5537942725063796, 0.0]}, {'1': [0.0, 0.0], '2': [0.0, 0.0], '3': [0.0, 0.5839210770664475], '4': [0.8993832428287231, 0.0], '5': [0.5537942725063796, 0.0]}]
95  Evaluations Remaining
results= [116.40601806230859, 120.88554933986012, 118.24023935399201, 109.2531951985678, 111.23733421225187]
sum(results)= 576.0223361669804


In [7]:
actions = np.random.randn(1, 2)

### 4. Take Actions with DDPG

In [8]:
a = np.random.randn(num_agents, action_size) 
a[0]

array([-0.33292663,  0.88433108])

In [9]:
np.append(a[0],[4])

array([-0.33292663,  0.88433108,  4.        ])

In [10]:
import torch
from collections import deque
from ddpg_agent import Agent
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
state_size = 1
action_size = 1

In [12]:
class DDPG_Agent:
    def __init__(self, environment):
        self.env = environment
        self.agent = Agent(state_size, action_size, random_seed=0)
    
    def train(self):
        
        for _ in range(20): #Do not change
            self.agent.reset()
            self.env.reset() 
            next_states = np.array([self.env.state]).reshape(1,1)
            t = 0
            while True:
                state = next_states
                actions = np.array(self.agent.act(state),dtype="float64") 
                actions = np.clip(actions, 0, 1)
                print("actions1=",actions)
                next_states, rewards, dones, _ = self.env.evaluateAction([actions[0][0],1-actions[0][0]])  # send the action to the environment  
                next_states = np.array([next_states]).reshape(1,1)
                self.agent.step(t,state, actions, rewards, next_states, dones) 
                t = t+1
                if dones:
                    break
                

    def generate(self):
        best_policy = None
        best_reward = -float('Inf')
        self.train()
        best_policy = {}
        for state in range(1,6):
            states = np.array([state]).reshape(1,1)
            actions = np.array(self.agent.act(states),dtype="float64") 
            actions = np.clip(actions, 0, 1)
            best_policy[state] = list([actions[0][0],1-actions[0][0]])

        best_reward = self.env.evaluatePolicy(best_policy)
        
        print(best_policy, best_reward)
    
        return best_policy, best_reward

In [13]:
EvaluateChallengeSubmission(ChallengeSeqDecEnvironment, DDPG_Agent, "ddpg_submission.csv")

Initialising ReplayBuffer
actions1= [[0.43000054]]
105  Evaluations Remaining
actions1= [[0.55639189]]
104  Evaluations Remaining
actions1= [[0.59640002]]
103  Evaluations Remaining
actions1= [[0.59813493]]
102  Evaluations Remaining
actions1= [[0.65002561]]
101  Evaluations Remaining
actions1= [[0.342103]]
100  Evaluations Remaining
actions1= [[0.48684779]]
99  Evaluations Remaining
actions1= [[0.51383579]]
98  Evaluations Remaining
actions1= [[0.57149142]]
97  Evaluations Remaining
actions1= [[0.64180005]]
96  Evaluations Remaining
actions1= [[0.44273874]]
95  Evaluations Remaining
actions1= [[0.51656586]]
94  Evaluations Remaining
actions1= [[0.53480113]]
93  Evaluations Remaining
actions1= [[0.6451534]]
92  Evaluations Remaining
actions1= [[0.71141011]]
91  Evaluations Remaining
actions1= [[0.31121746]]
90  Evaluations Remaining
actions1= [[0.48578459]]
89  Evaluations Remaining
actions1= [[0.6488266]]
88  Evaluations Remaining
actions1= [[0.75295764]]
87  Evaluations Remaining
act

actions1= [[0.28959772]]
50  Evaluations Remaining
actions1= [[0.36197585]]
49  Evaluations Remaining
actions1= [[0.43350282]]
48  Evaluations Remaining
actions1= [[0.58483601]]
47  Evaluations Remaining
actions1= [[0.64277422]]
46  Evaluations Remaining
actions1= [[0.44795841]]
45  Evaluations Remaining
actions1= [[0.51343739]]
44  Evaluations Remaining
actions1= [[0.56704217]]
43  Evaluations Remaining
actions1= [[0.63875574]]
42  Evaluations Remaining
actions1= [[0.60283077]]
41  Evaluations Remaining
actions1= [[0.48977226]]
40  Evaluations Remaining
actions1= [[0.67399514]]
39  Evaluations Remaining
actions1= [[0.68166381]]
38  Evaluations Remaining
actions1= [[0.72178102]]
37  Evaluations Remaining
actions1= [[0.72412288]]
36  Evaluations Remaining
actions1= [[0.49654377]]
35  Evaluations Remaining
actions1= [[0.53780282]]
34  Evaluations Remaining
actions1= [[0.66206229]]
33  Evaluations Remaining
actions1= [[0.77423358]]
32  Evaluations Remaining
actions1= [[0.73225391]]
31  Ev

actions1= [[1.]]
91  Evaluations Remaining
actions1= [[0.81173885]]
90  Evaluations Remaining
actions1= [[0.91583782]]
89  Evaluations Remaining
actions1= [[0.93693095]]
88  Evaluations Remaining
actions1= [[0.99153191]]
87  Evaluations Remaining
actions1= [[1.]]
86  Evaluations Remaining
actions1= [[0.70913321]]
85  Evaluations Remaining
actions1= [[0.93646449]]
84  Evaluations Remaining
actions1= [[1.]]
83  Evaluations Remaining
actions1= [[1.]]
82  Evaluations Remaining
actions1= [[1.]]
81  Evaluations Remaining
actions1= [[0.68901783]]
80  Evaluations Remaining
actions1= [[0.93943501]]
79  Evaluations Remaining
actions1= [[1.]]
78  Evaluations Remaining
actions1= [[1.]]
77  Evaluations Remaining
actions1= [[1.]]
76  Evaluations Remaining
actions1= [[0.69166833]]
75  Evaluations Remaining
actions1= [[0.93908769]]
74  Evaluations Remaining
actions1= [[1.]]
73  Evaluations Remaining
actions1= [[1.]]
72  Evaluations Remaining
actions1= [[1.]]
71  Evaluations Remaining
actions1= [[0.753

actions1= [[1.]]
11  Evaluations Remaining
actions1= [[1.]]
10  Evaluations Remaining
actions1= [[1.]]
9  Evaluations Remaining
actions1= [[1.]]
8  Evaluations Remaining
actions1= [[1.]]
7  Evaluations Remaining
actions1= [[1.]]
6  Evaluations Remaining
5  Evaluations Remaining
{1: [1.0, 0.0], 2: [1.0, 0.0], 3: [1.0, 0.0], 4: [1.0, 0.0], 5: [1.0, 0.0]} 93.7754434259049
actions1= [[1.]]
105  Evaluations Remaining
actions1= [[1.]]
104  Evaluations Remaining
actions1= [[1.]]
103  Evaluations Remaining
actions1= [[1.]]
102  Evaluations Remaining
actions1= [[1.]]
101  Evaluations Remaining
actions1= [[1.]]
100  Evaluations Remaining
actions1= [[1.]]
99  Evaluations Remaining
actions1= [[1.]]
98  Evaluations Remaining
actions1= [[1.]]
97  Evaluations Remaining
actions1= [[1.]]
96  Evaluations Remaining
actions1= [[1.]]
95  Evaluations Remaining
actions1= [[1.]]
94  Evaluations Remaining
actions1= [[1.]]
93  Evaluations Remaining
actions1= [[1.]]
92  Evaluations Remaining
actions1= [[1.]]
91 

actions1= [[1.]]
27  Evaluations Remaining
actions1= [[1.]]
26  Evaluations Remaining
actions1= [[1.]]
25  Evaluations Remaining
actions1= [[1.]]
24  Evaluations Remaining
actions1= [[1.]]
23  Evaluations Remaining
actions1= [[1.]]
22  Evaluations Remaining
actions1= [[1.]]
21  Evaluations Remaining
actions1= [[1.]]
20  Evaluations Remaining
actions1= [[1.]]
19  Evaluations Remaining
actions1= [[1.]]
18  Evaluations Remaining
actions1= [[1.]]
17  Evaluations Remaining
actions1= [[1.]]
16  Evaluations Remaining
actions1= [[1.]]
15  Evaluations Remaining
actions1= [[1.]]
14  Evaluations Remaining
actions1= [[1.]]
13  Evaluations Remaining
actions1= [[1.]]
12  Evaluations Remaining
actions1= [[1.]]
11  Evaluations Remaining
actions1= [[1.]]
10  Evaluations Remaining
actions1= [[1.]]
9  Evaluations Remaining
actions1= [[1.]]
8  Evaluations Remaining
actions1= [[1.]]
7  Evaluations Remaining
actions1= [[1.]]
6  Evaluations Remaining
5  Evaluations Remaining
{1: [1.0, 0.0], 2: [1.0, 0.0], 3:

actions1= [[1.]]
42  Evaluations Remaining
actions1= [[1.]]
41  Evaluations Remaining
actions1= [[1.]]
40  Evaluations Remaining
actions1= [[1.]]
39  Evaluations Remaining
actions1= [[1.]]
38  Evaluations Remaining
actions1= [[1.]]
37  Evaluations Remaining
actions1= [[1.]]
36  Evaluations Remaining
actions1= [[1.]]
35  Evaluations Remaining
actions1= [[1.]]
34  Evaluations Remaining
actions1= [[1.]]
33  Evaluations Remaining
actions1= [[1.]]
32  Evaluations Remaining
actions1= [[1.]]
31  Evaluations Remaining
actions1= [[1.]]
30  Evaluations Remaining
actions1= [[1.]]
29  Evaluations Remaining
actions1= [[1.]]
28  Evaluations Remaining
actions1= [[1.]]
27  Evaluations Remaining
actions1= [[1.]]
26  Evaluations Remaining
actions1= [[1.]]
25  Evaluations Remaining
actions1= [[1.]]
24  Evaluations Remaining
actions1= [[1.]]
23  Evaluations Remaining
actions1= [[1.]]
22  Evaluations Remaining
actions1= [[1.]]
21  Evaluations Remaining
actions1= [[1.]]
20  Evaluations Remaining
actions1= [

<netsapi.challenge.EvaluateChallengeSubmission at 0x2ba602602e8>