In [None]:
import shutil
import os
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf

import sys
sys.path.append(os.path.abspath("../framework"))
sys.path.append(os.path.abspath("../concrete"))

In [None]:
from ConcAction import ConcAction
from ConcObservation import ConcObservation
from ConcAgentFactory import ConcAgentFactory
from ConcBuildOrder import ConcBuildOrder
from ConcEnvrionmentFactory import ConcEnvironmentFactory
from ConcEnvironment import ConcEnvironment
from ConcRewardGiverFactory import ConcRewardGiverFactory
from ConcTrainerFactory import ConcTrainerFactory
from ConcValueFunctionApproximatorFactory import ConcValueFunctionApproximatorFactory
from framework import Trainer, ObservationSequence

# Check the history over the simulation

In [None]:
def retrieveYURfromTrainer(trainer):
    """Collect the recorded observations, actions and rewards of a trainer.

    Args:
        trainer: object exposing ``historyObservationSequences``,
            ``historyActions`` and ``historyRewards``. Every stored item
            must provide ``getValue()`` returning an array; for the
            observation sequences only the last element (index ``-1``)
            of each sequence is read.

    Returns:
        Tuple ``(Y, U, R)``: the per-step values concatenated along
        axis 0. According to the original annotations the shapes are
        (*, nPv), (*, nMv) and (*,) respectively.
    """
    # Latest observation of every stored sequence, each noted as (1, nPv).
    Y = np.concatenate(
        [sequence[-1].getValue() for sequence in trainer.historyObservationSequences],
        axis=0,
    )

    # Action values, each noted as (1, nMv).
    U = np.concatenate(
        [act.getValue() for act in trainer.historyActions],
        axis=0,
    )

    # Reward values, each noted as (1,).
    R = np.concatenate(
        [rew.getValue() for rew in trainer.historyRewards],
        axis=0,
    )

    return Y, U, R

In [None]:
# Settings for the simulation-history check.
# NOTE(review): sdPolicy=0.0 and amplitudeDv=0.0 presumably switch off policy
# noise and the disturbance input - confirm against ConcBuildOrder.
buildOrder = ConcBuildOrder(
    nIteration=100,
    nSeq=1,
    nHorizonValueOptimization=100,
    nIntervalPolicyOptimization=100,
    nBatchPolicyOptimization=2**5,
    nSaveInterval=2**5,
    description="test",
    tConstant=10,
    nHiddenValueApproximator=2**5,
    sdPolicy=0.0,
    nActionsSampledFromPolicy=2**0,
    amplitudeDv=0.0,
)

# Every component is created from the same build order.
agent = ConcAgentFactory().create(buildOrder)
environment = ConcEnvironmentFactory().create(buildOrder)
valueFunctionApproximator = ConcValueFunctionApproximatorFactory().create(buildOrder)
rewardGiver = ConcRewardGiverFactory().create(buildOrder)

# The trainer ties the four components together.
trainerFactory = ConcTrainerFactory()
trainer = trainerFactory.create(
    agent,
    environment,
    valueFunctionApproximator,
    rewardGiver,
    buildOrder,
)

In [None]:
trainer.init()

# A single iteration is run before the weights are overwritten.
# NOTE(review): presumably this is what creates agent.gainP's variables - confirm.
trainer.train(1)

# Reset the policy gain (weights[0]) and bias (weights[1]) to zero.
agent.gainP.weights[0].assign(np.zeros(shape=(1, ConcEnvironment.nMv)))
agent.gainP.weights[1].assign(np.zeros(shape=(ConcEnvironment.nMv,)))

# Continue for another 64 iterations with the zeroed policy.
trainer.train(2**6)

In [None]:
Y, U, R = retrieveYURfromTrainer(trainer)

# One panel per recorded signal. Each panel is labelled so the figure can be
# read on its own (the unlabelled version gave no hint which panel was which).
fig, axes = plt.subplots(3, 1)
for ax, (label, series) in zip(axes, (("Y", Y), ("U", U), ("R", R))):
    ax.plot(series)
    ax.set_ylabel(label)
# The x axis is simply the position within the trainer's history lists.
axes[-1].set_xlabel("history index")
plt.tight_layout()

# Check the history of ValueFunction without policy update

In [None]:
# Number of training iterations for the value-function check.
Nitr = 2**12

# nIntervalPolicyOptimization is set to the full iteration count.
# NOTE(review): presumably this keeps the policy from being updated during
# the run, matching the section title - confirm against the trainer.
buildOrder = ConcBuildOrder(
    nIteration=Nitr,
    nSeq=1,
    nHorizonValueOptimization=2**3,
    nIntervalPolicyOptimization=Nitr,
    nBatchPolicyOptimization=2**5,
    nSaveInterval=2**5,
    description="test",
    tConstant=10,
    nHiddenValueApproximator=2**3,
    sdPolicy=1.0,
    nActionsSampledFromPolicy=2**0,
)

# Every component is created from the same build order.
agent = ConcAgentFactory().create(buildOrder)
environment = ConcEnvironmentFactory().create(buildOrder)
valueFunctionApproximator = ConcValueFunctionApproximatorFactory().create(buildOrder)
rewardGiver = ConcRewardGiverFactory().create(buildOrder)

# The trainer ties the four components together.
trainerFactory = ConcTrainerFactory()
trainer = trainerFactory.create(
    agent,
    environment,
    valueFunctionApproximator,
    rewardGiver,
    buildOrder,
)

In [None]:
trainer.init()

# A single iteration is run before the weights are overwritten.
# NOTE(review): presumably this is what creates agent.gainP's variables - confirm.
trainer.train(1)

# Reset the policy gain (weights[0]) and bias (weights[1]) to zero.
agent.gainP.weights[0].assign(np.zeros(shape=(1, ConcEnvironment.nMv)))
agent.gainP.weights[1].assign(np.zeros(shape=(ConcEnvironment.nMv,)))

# Run the number of iterations configured in the build order.
trainer.train(buildOrder.nIteration)

In [None]:
# Probe actions, shape (1, nMv), default float dtype.
u0 = np.zeros(shape=(1, ConcEnvironment.nMv))
u1p = np.full((1, ConcEnvironment.nMv), 0.5)
u1n = np.full((1, ConcEnvironment.nMv), -0.5)

# Probe observations, shape (1, nPv), stored as float32.
y0 = np.zeros((1, ConcEnvironment.nPv), dtype=np.float32)
y1p = np.full((1, ConcEnvironment.nPv), 0.5, dtype=np.float32)
y1n = np.full((1, ConcEnvironment.nPv), -0.5, dtype=np.float32)

Under u = 0, compare the value for y = 0, +0.5, and -0.5, respectively.

In [None]:
# Evaluate the value function approximator at observation y0 and action u0.
observation = ConcObservation(y0)
observationSequence = ObservationSequence()
observationSequence.add(observation)
action = ConcAction(u0)

value = valueFunctionApproximator(observationSequence, action)

# getValue() yields a pair; the original note gives the shape as (1, 1).
_aValue, _sValue = value.getValue()
print(_aValue, _sValue)

In [None]:
# Evaluate the value function approximator at observation y1p and action u0.
observation = ConcObservation(y1p)
observationSequence = ObservationSequence()
observationSequence.add(observation)
action = ConcAction(u0)

value = valueFunctionApproximator(observationSequence, action)

# getValue() yields a pair; the original note gives the shape as (1, 1).
_aValue, _sValue = value.getValue()
print(_aValue, _sValue)

In [None]:
# Evaluate the value function approximator at observation y1n and action u0.
observation = ConcObservation(y1n)
observationSequence = ObservationSequence()
observationSequence.add(observation)
action = ConcAction(u0)

value = valueFunctionApproximator(observationSequence, action)

# getValue() yields a pair; the original note gives the shape as (1, 1).
_aValue, _sValue = value.getValue()
print(_aValue, _sValue)

# Check the history of policy update

In [None]:
# Interval passed to the build order; the value-optimization horizon is set
# to half of it.
nIntervalPolicyOptimization = 2**3

buildOrder = ConcBuildOrder(
    nIteration=2**10,
    nSeq=1,
    nHorizonValueOptimization=nIntervalPolicyOptimization // 2,
    nIntervalPolicyOptimization=nIntervalPolicyOptimization,
    nBatchPolicyOptimization=2**5,
    nSaveInterval=2**5,
    description="test",
    tConstant=10,
    nHiddenValueApproximator=2**3,
    sdPolicy=0.1,
    nActionsSampledFromPolicy=2**0,
)

# Every component is created from the same build order.
agent = ConcAgentFactory().create(buildOrder)
environment = ConcEnvironmentFactory().create(buildOrder)
valueFunctionApproximator = ConcValueFunctionApproximatorFactory().create(buildOrder)
rewardGiver = ConcRewardGiverFactory().create(buildOrder)

# The trainer ties the four components together.
trainerFactory = ConcTrainerFactory()
trainer = trainerFactory.create(
    agent,
    environment,
    valueFunctionApproximator,
    rewardGiver,
    buildOrder,
)

In [None]:
trainer.init()

# A single iteration is run before the weights are overwritten.
# NOTE(review): presumably this is what creates agent.gainP's variables - confirm.
trainer.train(1)

# Start from a zero policy gain (weights[0]) and bias (weights[1]).
agent.gainP.weights[0].assign(np.zeros(shape=(1, ConcEnvironment.nMv)))
agent.gainP.weights[1].assign(np.zeros(shape=(ConcEnvironment.nMv,)))

# Snapshots are taken before each training chunk, so the first entry holds
# the zeroed weights.
Gain, Bias = [], []
for k1 in range(2**10):
    # Carriage return keeps the progress counter on one line.
    sys.stdout.write('\r%04d' % k1)
    gain = agent.gainP.weights[0].numpy()  # noted as (1, nMv)
    bias = agent.gainP.weights[1].numpy()  # noted as (nMv,)
    Gain.append(gain)
    Bias.append(bias)
    trainer.train(nIntervalPolicyOptimization)

In [None]:
# Turn the recorded snapshots into 2-D arrays of shape (*, nMv).
# np.vstack is used for Gain instead of np.concatenate(axis=0): on the list of
# (1, nMv) snapshots both give the same (*, nMv) array, but if this cell is
# executed a second time np.concatenate would flatten the already-stacked
# array to 1-D, whereas np.vstack returns it unchanged.
Gain = np.vstack(Gain)  # (*, nMv)
# np.stack over the (nMv,) snapshots is already stable under re-execution.
Bias = np.stack(Bias, axis=0)  # (*, nMv)

In [None]:
# Gain on top, bias below. Explicit axes with labels so the figure can be
# read on its own (the panels were previously unlabelled).
figGainBias, (axGain, axBias) = plt.subplots(2, 1)

axGain.plot(Gain)
axGain.set_ylabel("Gain")

axBias.plot(Bias)
axBias.set_ylabel("Bias")
# One sample was recorded per pass of the recording loop.
axBias.set_xlabel("recording index")

plt.tight_layout()