In [1]:
from sys import exit, exc_info, argv
import numpy as np
import pandas as pd

!pip3 install git+https://github.com/slremy/netsapi --user --upgrade

from netsapi.challenge import *
from netsapi.visualisation import *


Collecting git+https://github.com/slremy/netsapi
  Cloning https://github.com/slremy/netsapi to /private/var/folders/qj/6zzv4hhx5fzbkm01x5qr69800000gn/T/pip-req-build-vfuuuy4q
  Running command git clone -q https://github.com/slremy/netsapi /private/var/folders/qj/6zzv4hhx5fzbkm01x5qr69800000gn/T/pip-req-build-vfuuuy4q
Building wheels for collected packages: netsapi
  Building wheel for netsapi (setup.py) ... [?25ldone
[?25h  Stored in directory: /private/var/folders/qj/6zzv4hhx5fzbkm01x5qr69800000gn/T/pip-ephem-wheel-cache-a88h15lo/wheels/9e/73/c9/86a9cc2460e11b3ce5b0a5ebd2d9d332a68afe0941659967fa
Successfully built netsapi
Installing collected packages: netsapi
  Found existing installation: netsapi 1.1
    Uninstalling netsapi-1.1:
      Successfully uninstalled netsapi-1.1
Successfully installed netsapi-1.1


In [2]:
from collections import defaultdict

# Environment instance that the tabular Q-learning loop further down interacts with.
env = ChallengeSeqDecEnvironment()

# Lookup tables keyed by (state, action-index) tuples.
Q = defaultdict(float)         # action-value estimates; unseen keys read as 0.0
n = defaultdict(lambda: 1.0)   # per-key counter; unseen keys read as 1.0

def actionSpace(resolution):
    """Enumerate every [x, y] pair on a square grid with spacing ``resolution``.

    Both coordinates take the values of ``np.arange(0, 1 + resolution, resolution)``.
    Pairs are ordered with the first coordinate varying fastest, rounded to two
    decimals, and returned as a plain list of two-element lists.

    NOTE(review): when ``resolution`` does not divide 1 evenly (e.g. 0.3) the
    arange above also produces a grid value greater than 1 -- confirm whether
    callers rely on values staying within [0, 1].
    """
    axis = np.arange(0, 1 + resolution, resolution)
    size = len(axis)
    first = np.tile(axis, size)      # whole axis repeated end-to-end (fast-varying)
    second = np.repeat(axis, size)   # each axis value held for one full sweep
    pairs = np.column_stack((first, second))
    return pairs.round(2).tolist()

#HyperParameters
epsilon = 0.1            # probability of picking a random action instead of the greedy one
gamma = 0.8              # weight of the next state's best Q-value in the update target
action_resolution = 0.2  # grid spacing handed to actionSpace()
episode_number = 20 #for submission this is fixed as 20


#Set-up
actions = actionSpace(action_resolution)
# Index *every* action. The earlier `range(len(actions)-1)` left out the last
# grid point, so it could never be explored or chosen greedily.
actionspace = range(len(actions))


def greedy_action(s):
    """Return the action index with the highest Q-value in state `s` (first index wins ties)."""
    return max(actionspace, key=lambda a: Q[(s, a)])


def max_q(sp):
    """Return the highest Q-value over all actions in state `sp`."""
    return max(Q[(sp, a)] for a in actionspace)


#Training of Q Table
for _ in range(episode_number):
    env.reset()
    nextstate = env.state
    while True:
        state = nextstate

        # Epsilon-Greedy
        if epsilon > random.random():
            action = random.choice(actionspace)
            print('random_action', action)
        else:
            action = greedy_action(state)

        env_action = actions[action] #convert to ITN/IRS
        print('env_action', env_action)
        nextstate, reward, done, _info = env.evaluateAction(env_action)

        # Q-learning: move Q towards the target with step size 1/n.
        key = (state, action)
        if done:
            target = reward  # terminal step: no successor value to bootstrap from
        else:
            target = reward + gamma * max_q(nextstate)
        step_size = 1. / n[key]
        Q[key] = Q[key] + step_size * (target - Q[key])
        # Count the visit. Without this the counter stayed at its default of 1,
        # so every update simply overwrote Q with the newest target.
        n[key] += 1

        if done:
            break

#Greedy Policy Learnt from Q Table
# NOTE(review): the Q-table is keyed by the env.state values seen during training,
# while this lookup uses `state-1`. Confirm env.state is 0-based; otherwise the
# policy is read from Q-values shifted by one step.
best_policy = {state: list(actions[greedy_action(state-1)]) for state in range(1,6)}
best_reward = env.evaluatePolicy(best_policy)
print(best_policy, best_reward)

env_action [0.0, 0.0]
105  Evaluations Remaining
env_action [0.0, 0.0]
104  Evaluations Remaining


KeyboardInterrupt: 

In [None]:
class Q_Agent():
    """Tabular epsilon-greedy Q-learning agent.

    The environment passed in must provide ``reset()``, a ``state`` attribute,
    ``evaluateAction(action)`` returning ``(next_state, reward, done, info)``
    and ``evaluatePolicy(policy)``; those are the only members used here.

    The class keeps all of its state on ``self`` and does not read any
    notebook-level variables, so it works on a fresh kernel.
    """

    EPISODES = 20  # Do not change

    def __init__(self, environment, epsilon=0.1, gamma=0.8, action_resolution=0.2):
        """Store the environment and hyperparameters and build the action table.

        Args:
            environment: environment object (see class docstring for the members used).
            epsilon: probability of taking a random action during training.
            gamma: weight of the next state's best Q-value in the update target.
            action_resolution: grid spacing used by ``actionSpace``.
        """
        #Hyperparameters
        self.env = environment
        self.epsilon = epsilon
        self.gamma = gamma
        self.action_resolution = action_resolution
        self.Q = defaultdict(lambda : 0.) # Q-function, keyed by (state, action index)
        self.n = defaultdict(lambda : 1.) # number of visits, keyed like Q
        # Use the agent's own grid builder rather than a same-named notebook global.
        self.actions = self.actionSpace()
        # Index every action; `len(...) - 1` would make the last one unreachable.
        self.actionspace = range(len(self.actions))

    def actionSpace(self):
        """Return all [x, y] pairs on a grid with spacing ``self.action_resolution``.

        Values are rounded to two decimals; the first coordinate varies fastest.
        """
        axis = np.arange(0, 1 + self.action_resolution, self.action_resolution)
        x, y = np.meshgrid(axis, axis)
        xy = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), axis=1)
        return xy.round(2).tolist()

    def _greedy_action(self, state):
        """Return the action index with the highest Q-value in ``state`` (first index wins ties)."""
        return max(self.actionspace, key=lambda a: self.Q[(state, a)])

    def _max_q(self, state):
        """Return the highest Q-value over all actions in ``state``."""
        return max(self.Q[(state, a)] for a in self.actionspace)

    def train(self):
        """Run ``EPISODES`` epsilon-greedy episodes, updating ``self.Q`` in place.

        Returns:
            The Q-table (the same object as ``self.Q``).
        """
        Q = self.Q
        n = self.n

        for _ in range(self.EPISODES):

            self.env.reset()
            nextstate = self.env.state

            while True:
                state = nextstate

                # Epsilon-Greedy Action Selection
                if self.epsilon > random.random():
                    action = random.choice(self.actionspace)
                else:
                    action = self._greedy_action(state)

                env_action = self.actions[action] #convert to ITN/IRS
                print('env_action', env_action)
                nextstate, reward, done, _info = self.env.evaluateAction(env_action)

                # Q-learning: move Q towards the target with step size 1/n.
                key = (state, action)
                if done:
                    target = reward  # terminal step: nothing to bootstrap from
                else:
                    target = reward + self.gamma * self._max_q(nextstate)
                step_size = 1. / n[key]
                Q[key] = Q[key] + step_size * (target - Q[key])
                # Count the visit so the step size shrinks on repeat visits.
                n[key] += 1

                if done:
                    break

        return Q

    def generate(self):
        """Train, derive the greedy five-step policy, and score it.

        Returns:
            ``(best_policy, best_reward)`` where ``best_policy`` maps the
            integers 1..5 to ``[x, y]`` actions and ``best_reward`` is whatever
            ``environment.evaluatePolicy`` returns for it.
        """
        self.train()

        # NOTE(review): Q is keyed by the env.state values seen during training,
        # while this lookup uses `state - 1`. Confirm env.state is 0-based;
        # otherwise the policy is read from Q-values shifted by one step.
        best_policy = {
            state: list(self.actions[self._greedy_action(state - 1)])
            for state in range(1, 6)
        }
        best_reward = self.env.evaluatePolicy(best_policy)

        print(best_policy, best_reward)

        return best_policy, best_reward

In [None]:
# Pass the environment class and the Q_Agent class (not instances) to netsapi's
# EvaluateChallengeSubmission, together with the string "Q_submission.csv".
# NOTE(review): the last argument is presumably the output file name -- confirm against netsapi.
EvaluateChallengeSubmission(ChallengeSeqDecEnvironment, Q_Agent, "Q_submission.csv")

In [9]:
from netsapi.challenge import *

class CustomAgent:
    """Baseline agent that draws 20 random five-step policies and keeps the best-scoring one."""

    def __init__(self, environment):
        self.environment = environment

    def generate(self):
        """Build 20 random candidate policies, score them in one call, return the best.

        Each policy maps the string keys '1'..'5' to either [1, 0] (when the
        random draw exceeds 0.4) or [0, 1]. If a KeyboardInterrupt or SystemExit
        is raised along the way, the exception info is printed and the initial
        values (None, -inf) are returned.
        """
        best_policy = None
        best_reward = -float('Inf')
        candidates = []
        try:
            # Agents should make use of 20 episodes in each training run, if making sequential decisions
            for _episode in range(20):
                self.environment.reset()
                policy = {}
                for step in range(1, 6):  # episode length
                    pick_first = random.random() > 0.4
                    policy[str(step)] = [1, 0] if pick_first else [0, 1]
                candidates.append(policy)

            rewards = self.environment.evaluatePolicy(candidates)
            winner = np.argmax(rewards)
            best_policy = candidates[winner]
            best_reward = rewards[winner]

        except (KeyboardInterrupt, SystemExit):
            print(exc_info())

        return best_policy, best_reward

In [10]:
# Pass the environment class and the CustomAgent class (not instances) to netsapi's
# EvaluateChallengeSubmission, together with the string "example.csv".
# NOTE(review): the last argument is presumably the output file name -- confirm against netsapi.
EvaluateChallengeSubmission(ChallengeSeqDecEnvironment, CustomAgent, "example.csv")

105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
105  Evaluations Remaining
492.1173918073655


<netsapi.challenge.EvaluateChallengeSubmission at 0x11cac82e8>