In [1]:
# The basics
%matplotlib inline
import pickle 

import matplotlib.pyplot as plt
import time
import itertools
import matplotlib
import pickle
import pandas as pd

import numpy as np
import sys
import os
import collections
from collections import namedtuple

#import environment
import sys
sys.path.append(r'../virl')
import virl

from policy_search_ds import *

Using TensorFlow backend.


3.6.12 |Anaconda, Inc.| (default, Sep  9 2020, 00:29:25) [MSC v.1916 64 bit (AMD64)]


In [2]:
class NeuralNetworkPolicyEstimator():
    """ 
    A very basic MLP neural network approximator and estimator for poliy search    
    
    The only tricky thing is the traning/loss function and the specific neural network arch
    """
    
    def __init__(self, alpha, n_actions, d_states, nn_config, verbose=False):        
        self.alpha = alpha    
        self.nn_config = nn_config                   
        self.n_actions = n_actions        
        self.d_states = d_states
        self.verbose=verbose # Print debug information        
        self.n_layers = len(nn_config)  # number of hidden layers        
        self.model = []
        self.__build_network(d_states, n_actions)
        self.__build_train_fn()
             

    def __build_network(self, input_dim, output_dim):
        """Create a base network usig the Keras functional API"""
        self.X = layers.Input(shape=(input_dim,))
        net = self.X
        for h_dim in nn_config:
            net = layers.Dense(h_dim)(net)
            net = layers.Activation("relu")(net)
        net = layers.Dense(output_dim, kernel_initializer=initializers.Zeros())(net)
        net = layers.Activation("softmax")(net)
        self.model = Model(inputs=self.X, outputs=net)

    def __build_train_fn(self):
        """ Create a custom train function
        It replaces `model.fit(X, y)` because we use the output of model and use it for training.        
        Called using self.train_fn([state, action_one_hot, target])`
        which would train the model. 
        
        Hint: you can think of K. as np.
        
        """
        # predefine a few variables
        action_onehot_placeholder   = K.placeholder(shape=(None, self.n_actions),name="action_onehot") # define a variable
        target                      = K.placeholder(shape=(None,), name="target") # define a variable       
        
        # this part defines the loss and is very important!
        action_prob        = self.model.output # the outlout of the neural network        
        action_selected_prob        = K.sum(action_prob * action_onehot_placeholder, axis=1) # probability of the selcted action        
        log_action_prob             = K.log(action_selected_prob) # take the log
        loss = -log_action_prob * target # the loss we are trying to minimise
        loss = K.mean(loss)
        
        # defining the speific optimizer to use
        adam = optimizers.Adam(lr=self.alpha)# clipnorm=1000.0) # let's use a kereas optimiser called Adam
        updates = adam.get_updates(params=self.model.trainable_weights,loss=loss) # what gradient updates to we parse to Adam
            
        # create a handle to the optimiser function    
        self.train_fn = K.function(inputs=[self.model.input,action_onehot_placeholder,target],
                                   outputs=[],
                                   updates=updates) # return a function which, when called, takes a gradient step
      
    
    def predict(self, s, a=None):              
        if a==None:            
            return self._predict_nn(s)
        else:                        
            return self._predict_nn(s)[a]
        
    def _predict_nn(self,state_hat):                          
        """
        Implements a basic MLP with tanh activations except for the final layer (linear)               
        """                
        x = self.model.predict(state_hat)                                                    
        return x
  
    def update(self, states, actions, target):  
        """
            states: a interger number repsenting the discrete state
            actions: a interger number repsenting the discrete action
            target: a real number representing the discount furture reward, reward to go
            
        """
        action_onehot = np_utils.to_categorical(actions, num_classes=self.n_actions) # encodes the state as one-hot
        self.train_fn([states, action_onehot, target]) # call the custom optimiser which takes a gradient step
        return 
        
    def new_episode(self):        
        self.t_episode  = 0. 


        
    

In [3]:
tabular = {}
for i in range(1, 8):
    for j in range(1, 8):
        for k in range(1, 8):
            for n in range(1, 8):
                tabular[(i, j, k, n)]=len(tabular)
                
f = open("./data/tabular.dat", 'wb')
pickle.dump(tabular, f)
f.close()

In [4]:
"Train the model and store with every problem_id "
for problem_id in range(10):
    env = virl.Epidemic(problem_id=problem_id, stochastic=False, noisy=False)

    # Instantiate a PolicyEstimator (i.e. the function-based approximation)
    alpha      = 0.002  
    n_states = len(tabular)
    n_actions = env.action_space.n
    nn_config  = [] # number of neurons in the hidden layers, should be [] as default

    policy_estimator = NeuralNetworkPolicyEstimator(alpha, n_actions, n_states, nn_config)
    policy_estimator.model.save_weights("./data/model/model{}.h5".format(problem_id))

    stats = reinforce(env, policy_estimator, 1200, tabular, discount_factor=0.99)
    stats_storage = Stats_storage(stats)

    f = open("./data/data_train/train{}".format(problem_id), 'wb')
    pickle.dump(stats_storage, f)
    f.close()













Step 51 @ Episode 1200/1200 (-1.3827911189310096))

In [5]:
"TEST when S=1 N=0"
for problem_id in range(10):
       
    env = virl.Epidemic(problem_id=problem_id, stochastic=True, noisy=False)

    # Instantiate a PolicyEstimator (i.e. the function-based approximation)
    alpha      = 0  
    n_states = len(tabular)
    n_actions = env.action_space.n
    nn_config  = [] # number of neurons in the hidden layers, should be [] as default

    policy_estimator = NeuralNetworkPolicyEstimator(alpha, n_actions, n_states, nn_config)
    policy_estimator.model.load_weights
    
    stats = reinforce(env, policy_estimator, 200, tabular, discount_factor=0.99)
    stats_storage = Stats_storage(stats)

    f = open("./data/data_test/test_S1_N0{}".format(problem_id), 'wb')
    pickle.dump(stats_storage, f)
    f.close()

Step 51 @ Episode 200/200 (-2.707947622380828)))

In [6]:
"TEST when S=1 N=1"
for problem_id in range(10):
       
    env = virl.Epidemic(problem_id=problem_id, stochastic=True, noisy=True)

    # Instantiate a PolicyEstimator (i.e. the function-based approximation)
    alpha      = 0  
    n_states = len(tabular)
    n_actions = env.action_space.n
    nn_config  = [] # number of neurons in the hidden layers, should be [] as default

    policy_estimator = NeuralNetworkPolicyEstimator(alpha, n_actions, n_states, nn_config)
    policy_estimator.model.load_weights
    
    stats = reinforce(env, policy_estimator, 200, tabular, discount_factor=0.99)
    stats_storage = Stats_storage(stats)

    f = open("./data/data_test/test_S1_N1{}".format(problem_id), 'wb')
    pickle.dump(stats_storage, f)
    f.close()

Step 51 @ Episode 200/200 (-0.7582693500564679)

In [7]:
"TEST when S=0 N=1"
for problem_id in range(10):
       
    env = virl.Epidemic(problem_id=problem_id, stochastic=False, noisy=True)

    # Instantiate a PolicyEstimator (i.e. the function-based approximation)
    alpha      = 0  
    n_states = len(tabular)
    n_actions = env.action_space.n
    nn_config  = [] # number of neurons in the hidden layers, should be [] as default

    policy_estimator = NeuralNetworkPolicyEstimator(alpha, n_actions, n_states, nn_config)
    policy_estimator.model.load_weights
    
    stats = reinforce(env, policy_estimator, 200, tabular, discount_factor=0.99)
    stats_storage = Stats_storage(stats)

    f = open("./data/data_test/test_S0_N1{}".format(problem_id), 'wb')
    pickle.dump(stats_storage, f)
    f.close()

Step 51 @ Episode 200/200 (-2.1089504019081216))

In [8]:
"TEST when S=0 N=0"
for problem_id in range(10):
       
    env = virl.Epidemic(problem_id=problem_id, stochastic=False, noisy=False)

    # Instantiate a PolicyEstimator (i.e. the function-based approximation)
    alpha      = 0  
    n_states = len(tabular)
    n_actions = env.action_space.n
    nn_config  = [] # number of neurons in the hidden layers, should be [] as default

    policy_estimator = NeuralNetworkPolicyEstimator(alpha, n_actions, n_states, nn_config)
    policy_estimator.model.load_weights
    
    stats = reinforce(env, policy_estimator, 200, tabular, discount_factor=0.99)
    stats_storage = Stats_storage(stats)

    f = open("./data/data_test/test_S0_N0{}".format(problem_id), 'wb')
    pickle.dump(stats_storage, f)
    f.close()

Step 51 @ Episode 200/200 (-2.1341014432138006))