# In [1]:
import numpy as np
import sympy as sp
import import_ipynb

# import ipdb
# from sklearn.preprocessing import StandardScaler
# import tensorflow as tf
# import tensorflow.compat.v1 as tfc
# tf.compat.v1.disable_eager_execution()
# from tensorflow.keras import optimizers
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Activation
# import tensorflow.keras.backend as K
# import tensorflow.compat.v1.keras.backend as Kc

# from Environment import Simple1Room
# from Environment import offline_sim



class Stochastic_AC_PG:
    """
    Actor-critic stochastic policy-gradient agent with eligibility traces.

    The policy is a Gaussian over a single scalar action expressed in
    normalized coordinates, where ``l1_lowerbound`` maps to -1 and
    ``l1_upperbound`` maps to +1 (see ``normalize_action_func``). The
    policy mean is ``theta_mu`` and its standard deviation is
    ``exp(theta_sigma)``. The critic is a single scalar ``w_valuefunc``
    that is returned as the value of every state; the ``state``
    arguments of the methods are accepted but not used.

    ...
    Parameters
    ----------
    l1_upperbound: float
        raw action value that is mapped to +1 by the normalization
    l1_lowerbound: float
        raw action value that is mapped to -1 by the normalization
    theta_mu0: float or None
        initial policy mean in raw (un-normalized) action units. It is
        normalized before being stored. If None, the raw value 1.0 is
        used (default None)
    learning_rate_policy_mu: float
        learning rate for the policy mean (default 0.01)
    learning_rate_policy_sigma: float
        learning rate for the policy log-standard-deviation
        (default 0.01)
    learning_rate_valuefunc: float
        learning rate for the state-value parameter (default 0.01)
    learning_rate_avereward: float
        learning rate for the average reward (default 0.01)
    elig_decay_policy: float
        eligibility decay parameter for the policy traces (default 0.2)
    elig_decay_valuefunc: float
        eligibility decay parameter for the state-value trace
        (default 0.2)
    policy_update_freq: int
        the policy parameters are updated only on calls to ``learn``
        whose running count is a multiple of this value (default 1)
    PerTimeStep: bool
        stored as ``self.PerTimeStep`` but not read by any method of
        this class; ``learn`` always uses the per-step average-reward
        update (default True)
    reward_decay: float
        stored as ``self.gamma`` but not read by any method of this
        class (default 1)
    n_features: int
        stored as ``self.n_features`` but not read by any method of
        this class (default 2)

    Attributes
    ----------
    theta_mu, theta_sigma: float
        policy mean and log-standard-deviation (set by ``initialize``)
    w_valuefunc: float
        state-value parameter (set by ``initialize``)
    Rave: float
        running estimate of the average reward (set by ``initialize``)
    z_theta_mu, z_theta_sigma, z_w: float
        eligibility traces (set by ``initialize``)
    policy_update_ind: int
        number of calls to ``learn`` since ``initialize``

    Methods
    -------
    initialize():
        resets all the learned parameters and eligibility traces
    choose_action(state):
        samples a normalized action from the Gaussian policy
    learn(S, A, Sp, R, delta_time):
        performs one actor-critic update
    value_func(state):
        evaluates the state-value function
    value_func_grad(state):
        evaluates the state-value function gradient wrt its parameter
    policy_func_grad(state, A):
        evaluates the gradient of the log-policy wrt theta_mu and
        theta_sigma at a given action
    normalize_action_func(x):
        maps a raw action to the normalized action scale

    """

    def __init__(self,
                 l1_upperbound,
                 l1_lowerbound,
                 theta_mu0=None,
                 learning_rate_policy_mu=0.01,
                 learning_rate_policy_sigma=0.01,
                 learning_rate_valuefunc=0.01,
                 learning_rate_avereward=0.01,
                 elig_decay_policy=0.2,
                 elig_decay_valuefunc=0.2,
                 policy_update_freq=1,
                 PerTimeStep=True,
                 reward_decay=1,
                 n_features=2):

        self.l1up = l1_upperbound
        self.l1low = l1_lowerbound
        self.alpha_theta_mu = learning_rate_policy_mu
        self.alpha_theta_sigma = learning_rate_policy_sigma
        self.alpha_w = learning_rate_valuefunc
        self.eta = learning_rate_avereward
        self.gamma = reward_decay
        self.lambda_theta = elig_decay_policy
        self.lambda_w = elig_decay_valuefunc
        self.PerTimeStep = PerTimeStep
        self.n_features = n_features
        self.policy_update_freq = policy_update_freq

        # The initial mean is given in raw action units and stored in
        # normalized units; the bounds must be set before normalizing.
        raw_mu0 = 1.0 if theta_mu0 is None else theta_mu0
        self.theta_mu0 = self.normalize_action_func(raw_mu0)

    def initialize(self):
        """
        resets all the learned parameters and eligibility traces.

        initialized values:
        theta_sigma = 0.0 (i.e. a policy standard deviation of 1)
        theta_mu = theta_mu0 (the normalized initial mean)
        w_valuefunc = np.random.uniform(low=-1, high=1.0)
        Rave = 0
        policy_update_ind = 0

        all the eligibility traces are set to 0.0

        Returns
        -------
        tuple
            (Rave, theta_mu, theta_sigma, w_valuefunc)
        """

        # policy parameters
        self.theta_sigma = 0.0
        self.theta_mu = self.theta_mu0

        # eligibility traces
        self.z_theta_mu = 0.0
        self.z_theta_sigma = 0.0
        self.z_w = 0.0

        # average reward and critic parameter
        self.Rave = 0
        self.w_valuefunc = np.random.uniform(low=-1, high=1.0, size=None)

        # counter used together with policy_update_freq in learn()
        self.policy_update_ind = 0

        return self.Rave, self.theta_mu, self.theta_sigma, self.w_valuefunc

    def choose_action(self, state):
        """
        samples a normalized action from the Gaussian policy

        the action is drawn from a Gaussian with mean theta_mu and
        standard deviation exp(theta_sigma). Samples are redrawn until
        one falls strictly between the normalized lower and upper
        bounds. ``state`` is not used.

        NOTE: there is no limit on the number of redraws, so this can
        spin for a very long time if the policy puts almost no
        probability mass between the bounds.
        """

        mu = self.theta_mu
        sigma = np.exp(self.theta_sigma)

        # The bounds do not change while sampling, so normalize them once.
        lower = self.normalize_action_func(self.l1low)
        upper = self.normalize_action_func(self.l1up)

        while True:
            action = np.random.normal(mu, sigma)
            print('l1 is:',action)
            if lower < action < upper:
                return action

    def learn(self, S, A, Sp, R, delta_time):
        """
        performs one actor-critic update of all the learned parameters

        Parameters
        ----------
        S:
            state in which the action was taken
        A: float
            the (normalized) action taken at S
        Sp:
            next state
        R: float
            reward received for the transition
        delta_time:
            transition time; accepted but not used by the current
            update rule

        Returns
        -------
        tuple
            (Rave, theta_mu, theta_sigma, w_valuefunc) after the update
        """

        self.policy_update_ind = self.policy_update_ind + 1

        # Average-reward TD error; Rave is updated from the same error.
        delt = R - self.Rave + self.value_func(Sp) - self.value_func(S)
        self.Rave = self.Rave + self.eta * delt

        dV_dw = self.value_func_grad(S)
        # Evaluate the policy gradient once and take both components.
        dlnPi_dtheta_mu, dlnPi_dtheta_sigma = self.policy_func_grad(S, A)

        # The traces are accumulated on every call, including the calls
        # on which the policy parameters themselves are not updated.
        self.z_w = self.lambda_w * self.z_w + dV_dw
        self.z_theta_mu = (self.lambda_theta * self.z_theta_mu
                           + dlnPi_dtheta_mu)
        self.z_theta_sigma = (self.lambda_theta * self.z_theta_sigma
                              + dlnPi_dtheta_sigma)

        self.w_valuefunc = self.w_valuefunc + self.alpha_w * delt * self.z_w

        if self.policy_update_ind % self.policy_update_freq == 0:
            self.theta_mu = (self.theta_mu
                             + self.alpha_theta_mu * delt * self.z_theta_mu)
            self.theta_sigma = (
                self.theta_sigma
                + self.alpha_theta_sigma * delt * self.z_theta_sigma)

        return self.Rave, self.theta_mu, self.theta_sigma, self.w_valuefunc

    def value_func(self, state):
        """
        this function evaluates the state-value function at a given
        state

        the value is the scalar w_valuefunc for every state; ``state``
        is not used.
        """

        return self.w_valuefunc

    def value_func_grad(self, state):
        """
        this function evaluates the state-value function gradient wrt
        its parameter at a given state

        since the value function equals w_valuefunc itself, the
        gradient is the constant 1.0; ``state`` is not used.
        """

        return 1.0

    def policy_func_grad(self, state, A):
        """
        this function evaluates the policy (its log) gradient wrt
        its parameters at a given action

        for a Gaussian policy with mean mu = theta_mu and standard
        deviation sigma = exp(theta_sigma):

            d lnPi / d theta_mu    = (A - mu) / sigma**2
            d lnPi / d theta_sigma = ((A - mu) / sigma)**2 - 1

        the second line is d lnPi / d sigma multiplied by
        d sigma / d theta_sigma = sigma. ``state`` is not used.

        Returns
        -------
        numpy.ndarray
            [d lnPi / d theta_mu, d lnPi / d theta_sigma]
        """

        mu = self.theta_mu
        sigma = np.exp(self.theta_sigma)

        # Closed form of the log-density derivatives; this avoids
        # rebuilding and differentiating a symbolic expression per call.
        scaled_dev = (A - mu) / sigma
        return np.array([scaled_dev / sigma,
                         scaled_dev * scaled_dev - 1.0])

    def normalize_action_func(self, x):
        """
        this normalizes the action

        maps x linearly so that l1low becomes -1 and l1up becomes +1.
        """

        normalized_x = 2 * (x - self.l1low) / (self.l1up - self.l1low) - 1
        return normalized_x