In [3]:
import numpy as np
import sympy as sp
import ipdb
import import_ipynb
from sklearn.preprocessing import StandardScaler
from Environment import Simple1Room
from Environment import offline_sim



class Stochastic_AC_PG:
    """
    a class for actor-critic stochastic policy-gradient RL

    a Stochastic_AC_PG object is used to implement an actor-critic
    stochastic policy-gradient RL with eligibility traces that can
    handle both types of average rewards: average reward per time step &
    average reward per step

    The policy is a Gaussian over temperature thresholds. Its mean is
    ``dot(theta_mu, feature_vec(state))`` and its standard deviation is
    ``exp(theta_sigma)``. The state-value function is
    ``dot(w_valuefunc, feature_vec(state))``.

    Every method that takes a ``state`` expects a 4-element sequence
    ``(T, hs, aT, zT)``. Only ``T`` (compared against the sampled
    threshold) and ``hs`` (a 0/1 flag selecting which threshold applies;
    presumably the heater OFF/ON status) are used by this class.

    ...
    Attributes
    ----------
    Td: float
        the user-defined desired temperature
    learning_rate_policy_mu: float
        learning rate for policy mean. It should take a value in [0,1]
        (default 0.01)
    learning_rate_policy_sigma: float
        learning rate for policy variance. It should take a value in
        [0,1] (default 0.01)
    learning_rate_valuefunc: float
        learning rate for state value function. It should take a value
        in [0,1] (default 0.01)
    learning_rate_avereward: float
        learning rate for average reward. It should take a value in
        [0,1] (default 0.01)
    elig_decay_policy: float
        eligibility decay parameter for policy. It should take a value
        in [0,1] (default 0.2)
    elig_decay_valuefunc: float
        eligibility decay parameter for state-value function. It should
        take a value in [0,1] (default 0.2)
    T_upperbound: float
        upper limit of switch-OFF temperature. (default Td+5)
    T_lowerbound: float
        lower limit of switch-ON temperature. (default Td-5)
    PerTimeStep: bool
        if set to True average reward per time step is used, otherwise
        if set to False average reward per step is used (default True)
    reward_decay: float
        reward decay parameter. It should take a value in [0,1]
        (default 1). It is stored as ``gamma`` but is not read by any
        method of this class.
    n_features: int
        number of features for the feature vector (default 2). It is
        stored but not read by any method of this class; feature_vec
        always returns two features.

    Methods
    -------
    initialize():
        initializes all the system parameters that we want to learn
    choose_action(state):
        chooses an action (temp. threshold) from a Gaussian distribution
    learn(S, A, Sp, R, delta_time):
        updates all the system parameters we want to learn
    value_func(state):
        evaluates the state-value function at a given state
    value_func_grad(state):
        evaluates the state-value function gradient wrt its parameters
        at a given state
    policy_func_grad(state, A):
        evaluates the policy (its log) gradient wrt its parameters at a
        given state and action
    feature_vec(cls, state):
        this outputs a feature vector

    """

    def __init__(self,
                 Td,
                 learning_rate_policy_mu=0.01,
                 learning_rate_policy_sigma=0.01,
                 learning_rate_valuefunc=0.01,
                 learning_rate_avereward=0.01,
                 elig_decay_policy=0.2,
                 elig_decay_valuefunc=0.2,
                 T_upperbound=None,
                 T_lowerbound=None,
                 PerTimeStep=True,
                 reward_decay=1,
                 n_features=2):

        self.Td = Td
        self.alpha_theta_mu = learning_rate_policy_mu
        self.alpha_theta_sigma = learning_rate_policy_sigma
        self.alpha_w = learning_rate_valuefunc
        self.eta = learning_rate_avereward
        self.gamma = reward_decay
        self.lambda_theta = elig_decay_policy
        self.lambda_w = elig_decay_valuefunc
        self.PerTimeStep = PerTimeStep
        self.n_features = n_features

        # Threshold bounds default to a +/-5 band around the set point.
        self.Tup = self.Td + 5 if T_upperbound is None else T_upperbound
        self.Tlow = self.Td - 5 if T_lowerbound is None else T_lowerbound

    def initialize(self):
        """
        initializes all the parameters.

        initialized values:
        theta_sigma = 0.0
        theta_mu = [Td-4, Td+4]   (i.e. [theta_ON, theta_OFF])
        w_valuefunc = [w_OFF, w_ON], each drawn from
                      np.random.uniform(low=-1, high=1.0)
        Rave = 0.0

        all the initial values of eligibility traces are set to 0.0

        Returns
        -------
        tuple
            (Rave, theta_mu, theta_sigma, w_valuefunc)
        """

        # Policy parameters: mean weights and log-standard-deviation.
        self.theta_sigma = 0.0
        self.theta_mu = np.array([self.Td - 4.0, self.Td + 4.0])

        # Eligibility traces for the policy and the value function.
        self.z_theta_mu = np.zeros(2)
        self.z_theta_sigma = 0.0
        self.z_w = np.zeros(2)

        self.Rave = 0.0

        # w_ON is drawn before w_OFF so that the random stream is consumed
        # in the same order as before; the stored layout is [w_OFF, w_ON],
        # matching feature_vec = [1 - hs, hs].
        w_ON = np.random.uniform(low=-1, high=1.0, size=None)
        w_OFF = np.random.uniform(low=-1, high=1.0, size=None)
        self.w_valuefunc = np.array([w_OFF, w_ON])

        return self.Rave, self.theta_mu, self.theta_sigma, self.w_valuefunc

    def choose_action(self, state):
        """
        chooses an action from a Gaussian distribution

        an action, a temperature threshold, is sampled from a Gaussian
        distribution whose mean is dot(theta_mu, feature_vec(state)) and
        whose standard deviation is exp(theta_sigma). Samples are redrawn
        until one lies strictly inside the admissible interval:
        (T, Tup) when hs == 1 and (Tlow, T) when hs == 0.

        Raises
        ------
        ValueError
            if hs is neither 0 nor 1, or if the admissible interval is
            empty. In both cases no sample could ever be accepted, and
            the rejection loop would otherwise never terminate.
        """

        T, hs, aT, zT = state
        mu = np.dot(self.theta_mu, self.feature_vec(state))
        sigma = np.exp(self.theta_sigma)

        if hs == 1:
            lower, upper = T, self.Tup
        elif hs == 0:
            lower, upper = self.Tlow, T
        else:
            raise ValueError(
                "hs must be 0 or 1, got {!r}".format(hs))

        if not lower < upper:
            raise ValueError(
                "no admissible threshold: the interval ({}, {}) is empty "
                "for hs={}".format(lower, upper, hs))

        # Rejection sampling from the Gaussian restricted to (lower, upper).
        while True:
            action = np.random.normal(mu, sigma)
            if lower < action < upper:
                return action

    def learn(self, S, A, Sp, R, delta_time):
        """
        this function updates all the system parameters we want to learn

        this function takes the initial state (S) and the action taken
        at S (A) as well as the next state (Sp) and the reward (R) and
        the transition time (delta_time), and use them all to update all
        the system parameters we want to learn (those initialized under
        the initialization function)

        delta_time is only used when PerTimeStep is True, in which case
        it must be non-zero because the average-reward update divides
        by it.

        Returns
        -------
        tuple
            (Rave, theta_mu, theta_sigma, w_valuefunc) after the update
        """

        # TD error of the average-reward formulation. It is computed with
        # the value-function weights as they were before this update.
        value_gap = self.value_func(Sp) - self.value_func(S)
        if self.PerTimeStep:
            delt = R - self.Rave * delta_time + value_gap
            self.Rave = self.Rave + self.eta * delt / delta_time
        else:
            delt = R - self.Rave + value_gap
            self.Rave = self.Rave + self.eta * delt

        dV_dw = self.value_func_grad(S)

        # The policy gradient is evaluated once and then split into its
        # mean part (first two entries) and sigma part (last entry).
        dlnPi_dtheta = self.policy_func_grad(S, A)
        dlnPi_dtheta_mu = dlnPi_dtheta[0:2]
        dlnPi_dtheta_sigma = dlnPi_dtheta[-1]

        # Decay the eligibility traces and add the fresh gradients.
        self.z_w = self.lambda_w * self.z_w + dV_dw
        self.z_theta_mu = self.lambda_theta * self.z_theta_mu + dlnPi_dtheta_mu
        self.z_theta_sigma = (self.lambda_theta * self.z_theta_sigma
                              + dlnPi_dtheta_sigma)

        # Move the parameters along their traces, scaled by the TD error.
        self.w_valuefunc = self.w_valuefunc + self.alpha_w * delt * self.z_w
        self.theta_mu = (self.theta_mu
                         + self.alpha_theta_mu * delt * self.z_theta_mu)
        self.theta_sigma = (self.theta_sigma
                            + self.alpha_theta_sigma * delt
                            * self.z_theta_sigma)

        return self.Rave, self.theta_mu, self.theta_sigma, self.w_valuefunc

    def value_func(self, state):
        """
        this function evaluates the state-value function at a given
        state, i.e. dot(w_valuefunc, feature_vec(state))
        """

        return np.dot(self.w_valuefunc, self.feature_vec(state))

    def value_func_grad(self, state):
        """
        this function evaluates the state-value function gradient wrt
        its parameters at a given state

        the value function is linear in w_valuefunc, so the gradient is
        the feature vector itself
        """

        return self.feature_vec(state)

    def policy_func_grad(self, state, A):
        """
        this function evaluates the policy (its log) gradient wrt
        its parameters at a given state and action

        With mu = dot(theta_mu, feature_vec(state)) and
        sigma = exp(theta_sigma), the Gaussian log-density has

            d lnPi / d mu    = (A - mu) / sigma**2
            d lnPi / d sigma = (A - mu)**2 / sigma**3 - 1 / sigma

        Returns
        -------
        numpy.ndarray
            [d lnPi/d theta_ON, d lnPi/d theta_OFF, d lnPi/d theta_sigma]
        """

        T, hs, aT, zT = state
        mu = np.dot(self.theta_mu, self.feature_vec(state))
        sigma = np.exp(self.theta_sigma)

        # Closed-form derivatives; this avoids rebuilding, differentiating
        # and lambdifying a symbolic expression on every call.
        deviation = A - mu
        dlnPi_dmu = deviation / sigma**2
        dlnPi_dsigma = deviation**2 / sigma**3 - 1.0 / sigma

        # Chain rule: d mu/d theta_mu = [1 - hs, hs] and
        # d sigma/d theta_sigma = sigma.
        return np.array([dlnPi_dmu * (1 - hs),
                         dlnPi_dmu * hs,
                         dlnPi_dsigma * sigma])

    @classmethod
    def feature_vec(cls, state):
        """this outputs the feature vector [1 - hs, hs] of a state"""

        T, hs, aT, zT = state
        return np.array([1 - hs, hs])

importing Jupyter notebook from Environment.ipynb


In [None]:
class COPDAC_Q:
    """
    a class for compatible off-policy deterministic actor-critic RL

    COPDAC_Q class is similar to COPDAC_Q0 with the added capability of
    handling fixed-transition times.

    a COPDAC_Q object is used to implement compatible off-policy
    deterministic actor-critic RL with simple Q-learning that can handle
    both types of average rewards: average reward per time step &
    average reward per step

    This is for a simple single room with heating only, and supports
    both variable and fixed transition time periods/intervals. The
    policy, baseline and action-value functions are defined symbolically
    with sympy and then lambdified for numerical evaluation.

    Every method that takes a ``state`` expects a 4-element sequence
    ``(T, hs, aT, zT)``.

    ...
    Attributes
    ----------
    Td: float
        the user-defined desired temperature
    learning_rate_policy_mu: float
        learning rate for policy. It should take a value in [0,1]
        (default 0.01)
    learning_rate_baselinefunc: float
        learning rate for baseline function. It should take a value in
        [0,1] (default 0.01)
    learning_rate_actionvaluefunc: float
        learning rate for action-value function. It should take a value
        in [0,1] (default 0.01)
    learning_rate_avereward: float
        learning rate for average reward. It should take a value in
        [0,1] (default 0.01)
    policy_update_freq: int
        defines the policy update freq. The policy is updated every
        policy_update_freq times (default 1)
    T_upperbound: float
        upper limit of switch-OFF temperature. (default Td+5)
    T_lowerbound: float
        lower limit of switch-ON temperature. (default Td-5)
    theta_init: array-like of float
        initial values of [theta_ON, theta_OFF]. (default [Td-2, Td+2])
    PerTimeStep: bool
        if set to True, average reward per time step is used, otherwise
        if set to False, average reward per step is used (default True)
    reward_decay: float
        reward decay parameter. It should take a value in [0,1]
        (default 1). It is stored as ``gamma`` but is not read by any
        method of this class.


    Methods
    -------
    initialize():
        initializes all the system parameters that we want to learn
    choose_action(state):
        chooses a deterministic action (temp. threshold) with some added
        noise for exploration
    learn(S, A, Sp, R, delta_time):
        updates all the system parameters we want to learn
    symbols_list():
        defines symbols for the parameters of interest
    mu_func_sym():
        defines policy function symbolically
    mu_grad_sym():
        defines symbolically gradient of the policy function wrt its
        parameters
    V_func_sym():
        defines symbolically the baseline function
    V_grad_sym():
        defines symbolically gradient of the baseline function
    Q_func_sym():
        defines symbolically the action-value function
    Q_grad_a_sym():
        defines symbolically gradient (wrt action) of the action-value
        function
    Q_grad_w_sym():
        defines symbolically gradient (wrt its parameters) of the
        action-value function
    lambdify_funcs():
        lambdifies all of the symbolic functions above


    """

    # Half-width of the uniform exploration noise that choose_action adds
    # to the deterministic policy output.
    _EXPLORATION_HALF_WIDTH = 1.5

    def __init__(self,
                 Td,
                 learning_rate_policy_mu=0.01,
                 learning_rate_baselinefunc=0.01,
                 learning_rate_actionvaluefunc=0.01,
                 learning_rate_avereward=0.01,
                 policy_update_freq=1,
                 T_upperbound=None,
                 T_lowerbound=None,
                 theta_init=None,
                 PerTimeStep=True,
                 reward_decay=1):

        self.Td = Td
        self.alpha_theta = learning_rate_policy_mu
        self.alpha_v = learning_rate_baselinefunc
        self.alpha_w = learning_rate_actionvaluefunc
        self.eta = learning_rate_avereward
        self.gamma = reward_decay
        self.PerTimeStep = PerTimeStep
        self.policy_update_freq = policy_update_freq

        # Threshold bounds default to a +/-5 band around the set point.
        self.Tup = self.Td + 5 if T_upperbound is None else T_upperbound
        self.Tlow = self.Td - 5 if T_lowerbound is None else T_lowerbound

        if theta_init is None:
            self.theta_init = np.array([self.Td - 2.0, self.Td + 2.0])
        else:
            self.theta_init = np.asarray(theta_init)

    def initialize(self):
        """
        initializes parameters of interest

        Lambdifies the symbolic functions, resets theta to theta_init,
        draws w and v uniformly from [-1, 1), and resets the average
        reward and the policy-update counter.

        Returns
        -------
        tuple
            (Rave, theta, w, v)
        """

        # first we lambdify some of the useful functions
        (self.mu, self.V, self.Q, self.dmu_dtheta, self.dV_dv, self.dQ_da,
         self.dQ_dw) = self.lambdify_funcs()

        # A float copy, so that theta never aliases theta_init and a later
        # re-initialization starts from the same values.
        self.theta = np.array(self.theta_init, dtype=float)

        # w has one entry per policy parameter (compatible function
        # approximation); v has one entry per baseline feature.
        self.w = np.random.uniform(low=-1,
                                   high=1.0,
                                   size=self.mu_grad_sym().shape[0])
        self.v = np.random.uniform(low=-1,
                                   high=1.0,
                                   size=self.V_grad_sym().shape[0])
        self.Rave = 0.0
        self.policy_update_ind = 0

        return self.Rave, self.theta, self.w, self.v

    def choose_action(self, state):
        """
        samples an acceptable action (temp. threshold)

        action is chosen based on the deterministic policy with some
        added noise for exploration. Here noise is chosen uniformly
        randomly from the range [-1.5, 1.5). Samples are redrawn until
        T < action <= Tup (when hs == 1) or Tlow <= action < T
        (when hs == 0).

        Raises
        ------
        ValueError
            if hs is neither 0 nor 1, or if the exploration window
            around the policy output does not overlap the admissible
            interval. In both cases no sample could be accepted, and the
            rejection loop would otherwise never terminate.
        """

        T, hs, aT, zT = state
        center = self.mu(state, self.theta)[0, 0]
        half_width = self._EXPLORATION_HALF_WIDTH

        if hs == 1:
            lower, upper = T, self.Tup
        elif hs == 0:
            lower, upper = self.Tlow, T
        else:
            raise ValueError(
                "hs must be 0 or 1, got {!r}".format(hs))

        # The candidate thresholds live in [center - hw, center + hw); if
        # that window misses the admissible interval nothing can be drawn.
        if max(lower, center - half_width) >= min(upper, center + half_width):
            raise ValueError(
                "no admissible threshold: policy output {} with noise "
                "+/-{} cannot fall between {} and {} (hs={})".format(
                    center, half_width, lower, upper, hs))

        while True:
            Tth = center + np.random.uniform(
                low=-half_width, high=half_width, size=None)
            if hs == 1:
                accepted = T < Tth <= self.Tup
            else:
                accepted = self.Tlow <= Tth < T
            if accepted:
                return Tth

    def learn(self, S, A, Sp, R, delta_time):
        """
        this function updates all the system parameters we want to learn

        this function takes the initial state (S) and the action taken
        at S (A) as well as the next state (Sp), the reward (R), and
        the transition time (delta_time), and use them all to update all
        the system parameters we want to learn (those initialized under
        the initialization function)

        delta_time is only used when PerTimeStep is True, in which case
        it must be non-zero because the average-reward update divides
        by it.

        Returns
        -------
        tuple
            (Rave, theta, w, v) after the update
        """

        self.policy_update_ind = self.policy_update_ind + 1

        # Everything below is evaluated with the parameters as they were
        # before this update.
        action = (A, )
        Q_S_A = self.Q(S, action, self.theta, self.v, self.w).item()
        # Q-learning target: the next state is evaluated at the action the
        # current deterministic policy would take there.
        next_action = (self.mu(Sp, self.theta).item(), )
        Q_Sp_muSp = self.Q(Sp, next_action, self.theta, self.v,
                           self.w).item()
        dmu_dtheta = self.dmu_dtheta(S, self.theta)
        dQ_da = self.dQ_da(S, action, self.theta, self.v, self.w)
        dQ_dw = self.dQ_dw(S, action, self.theta, self.v, self.w)
        dV_dv = self.dV_dv(S, self.v)

        if self.PerTimeStep:
            delt = R - self.Rave * delta_time + Q_Sp_muSp - Q_S_A
            self.Rave = self.Rave + self.eta * delt / delta_time
        else:
            delt = R - self.Rave + Q_Sp_muSp - Q_S_A
            self.Rave = self.Rave + self.eta * delt

        # The actor moves along dmu/dtheta * dQ/da, but only on every
        # policy_update_freq-th call.
        if self.policy_update_ind % self.policy_update_freq == 0:
            self.theta = self.theta + self.alpha_theta * np.matmul(
                dmu_dtheta, dQ_da).ravel()

        self.w = self.w + self.alpha_w * delt * dQ_dw.ravel()
        self.v = self.v + self.alpha_v * delt * dV_dv.ravel()

        return self.Rave, self.theta, self.w, self.v

    def symbols_list(self):
        """
        defines symbols for the parameters of interest

        Returns the tuples (s, theta, v, a, w): four state symbols, two
        policy parameters, three baseline weights, one action symbol and
        one w per policy parameter.
        """

        s = sp.symbols('T, hs, aT, zT')
        theta = sp.symbols('theta1:3')
        v = sp.symbols('v1:4')
        a = sp.symbols('a1:2')
        w = sp.symbols('w1:{}'.format(len(theta) + 1))
        return s, theta, v, a, w

    def mu_func_sym(self):
        """
        defines policy function symbolically

        mu = (theta1*(1 - hs) + theta2*hs)*zT + aT*(1 - zT), returned as
        a 1x1 sympy Matrix
        """

        s, theta, v, a, w = self.symbols_list()
        T, hs, aT, zT = s
        learned_threshold = theta[0] * (1 - hs) + theta[1] * hs
        mu = learned_threshold * zT + (aT) * (1 - zT)
        return sp.Matrix([mu])

    def mu_grad_sym(self):
        """
        defines symbolically gradient of the policy function wrt its
        parameters (a column with one row per theta)
        """

        s, theta, v, a, w = self.symbols_list()
        theta_vec = sp.Matrix(list(theta))
        return self.mu_func_sym().jacobian(theta_vec).T

    def V_func_sym(self):
        """
        defines symbolically the baseline function

        V is linear in v over the features [hs, 1 - hs, T - Td]
        """

        s, theta, v, a, w = self.symbols_list()
        T, hs, aT, zT = s
        v_vec = sp.Matrix(list(v))
        feature_vec = sp.Matrix([hs, 1 - hs, T - self.Td])
        return v_vec.T * feature_vec

    def V_grad_sym(self):
        """
        defines symbolically gradient of the baseline function wrt v
        (a column with one row per v)
        """

        s, theta, v, a, w = self.symbols_list()
        v_vec = sp.Matrix(list(v))
        return self.V_func_sym().jacobian(v_vec).T

    def Q_func_sym(self):
        """
        defines symbolically the action-value function

        Q = (a - mu)^T * (dmu/dtheta)^T * w + V
        """

        s, theta, v, a, w = self.symbols_list()
        a_vec = sp.Matrix(list(a))
        w_vec = sp.Matrix(list(w))
        V = self.V_func_sym()
        mu = self.mu_func_sym()
        dmu_dtheta = self.mu_grad_sym()
        return (a_vec - mu).T * dmu_dtheta.T * w_vec + V

    def Q_grad_a_sym(self):
        """
        defines symbolically gradient (wrt action) of the action-value
        function
        """

        s, theta, v, a, w = self.symbols_list()
        a_vec = sp.Matrix(list(a))
        return self.Q_func_sym().jacobian(a_vec).T

    def Q_grad_w_sym(self):
        """
        defines symbolically gradient (wrt its parameters) of the
        action-value function
        """

        s, theta, v, a, w = self.symbols_list()
        w_vec = sp.Matrix(list(w))
        return self.Q_func_sym().jacobian(w_vec).T

    def lambdify_funcs(self):
        """
        Lambdifies the functions of interest with numpy as the target
        library

        Returns
        -------
        tuple
            (mu, V, Q, dmu_dtheta, dV_dv, dQ_da, dQ_dw). mu and
            dmu_dtheta are called as f(state, theta); V and dV_dv as
            f(state, v); the Q functions as f(state, a, theta, v, w).
        """

        s, theta, v, a, w = self.symbols_list()
        policy_args = [s, theta]
        baseline_args = [s, v]
        critic_args = [s, a, theta, v, w]

        mu_lambdified = sp.lambdify(policy_args, self.mu_func_sym(), 'numpy')
        V_lambdified = sp.lambdify(baseline_args, self.V_func_sym(), 'numpy')
        Q_lambdified = sp.lambdify(critic_args, self.Q_func_sym(), 'numpy')
        mu_grad_lambdified = sp.lambdify(policy_args, self.mu_grad_sym(),
                                         'numpy')
        V_grad_lambdified = sp.lambdify(baseline_args, self.V_grad_sym(),
                                        'numpy')
        Q_grad_a_lambdified = sp.lambdify(critic_args, self.Q_grad_a_sym(),
                                          'numpy')
        Q_grad_w_lambdified = sp.lambdify(critic_args, self.Q_grad_w_sym(),
                                          'numpy')
        return (mu_lambdified, V_lambdified, Q_lambdified,
                mu_grad_lambdified, V_grad_lambdified, Q_grad_a_lambdified,
                Q_grad_w_lambdified)

In [1]:
import import_ipynb
import numpy as np
import sympy as sp
from Environment import Simple1Room
class COPDAC_Q_Ton:
    """
    a class for compatible off-policy deterministic actor-critic RL

    COPDAC_Q_Ton class is similar to COPDAC_Q with only optimizing for
    Ton (Toff is constant and given).

    a COPDAC_Q_Ton object is used to implement compatible off-policy
    deterministic actor-critic RL with simple Q-learning that can handle
    both types of average rewards: average reward per time step &
    average reward per step

    This is for a simple single room with heating only, and supports
    both variable and fixed transition time periods/intervals. The
    policy, baseline and action-value functions are defined symbolically
    with sympy and then lambdified for numerical evaluation.

    Every method that takes a ``state`` expects a 4-element sequence
    ``(T, hs, aT, zT)``.

    ...
    Attributes
    ----------
    Td: float
        the user-defined desired temperature
    Toff: float
        the fixed switch-OFF threshold used by the policy when hs == 1
        (default 16.0)
    learning_rate_policy_mu: float
        learning rate for policy. It should take a value in [0,1]
        (default 0.01)
    learning_rate_baselinefunc: float
        learning rate for baseline function. It should take a value in
        [0,1] (default 0.01)
    learning_rate_actionvaluefunc: float
        learning rate for action-value function. It should take a value
        in [0,1] (default 0.01)
    learning_rate_avereward: float
        learning rate for average reward. It should take a value in
        [0,1] (default 0.01)
    policy_update_freq: int
        defines the policy update freq. The policy is updated every
        policy_update_freq times (default 1)
    T_upperbound: float
        upper limit of switch-OFF temperature. (default Td+5)
    T_lowerbound: float
        lower limit of switch-ON temperature. (default Td-7)
    theta_init: array-like of float
        initial value of [theta_ON]. (default [Td-5])
    PerTimeStep: bool
        if set to True, average reward per time step is used, otherwise
        if set to False, average reward per step is used (default True)
    reward_decay: float
        reward decay parameter. It should take a value in [0,1]
        (default 1). It is stored as ``gamma`` but is not read by any
        method of this class.


    Methods
    -------
    initialize():
        initializes all the system parameters that we want to learn
    choose_action(state):
        chooses a deterministic action (temp. threshold) with some added
        noise for exploration
    learn(S, A, Sp, R, delta_time):
        updates all the system parameters we want to learn
    symbols_list():
        defines symbols for the parameters of interest
    mu_func_sym():
        defines policy function symbolically
    mu_grad_sym():
        defines symbolically gradient of the policy function wrt its
        parameters
    V_func_sym():
        defines symbolically the baseline function
    V_grad_sym():
        defines symbolically gradient of the baseline function
    Q_func_sym():
        defines symbolically the action-value function
    Q_grad_a_sym():
        defines symbolically gradient (wrt action) of the action-value
        function
    Q_grad_w_sym():
        defines symbolically gradient (wrt its parameters) of the
        action-value function
    lambdify_funcs():
        lambdifies all of the symbolic functions above


    """

    # Half-width of the uniform exploration noise that choose_action adds
    # to the deterministic policy output.
    _EXPLORATION_HALF_WIDTH = 1.5

    def __init__(self,
                 Td,
                 Toff = 16.0,
                 learning_rate_policy_mu=0.01,
                 learning_rate_baselinefunc=0.01,
                 learning_rate_actionvaluefunc=0.01,
                 learning_rate_avereward=0.01,
                 policy_update_freq=1,
                 T_upperbound=None,
                 T_lowerbound=None,
                 theta_init=None,
                 PerTimeStep=True,
                 reward_decay=1):

        self.Td = Td
        self.alpha_theta = learning_rate_policy_mu
        self.alpha_v = learning_rate_baselinefunc
        self.alpha_w = learning_rate_actionvaluefunc
        self.eta = learning_rate_avereward
        self.gamma = reward_decay
        self.PerTimeStep = PerTimeStep
        self.policy_update_freq = policy_update_freq

        self.Toff = Toff

        # Note the asymmetric defaults: Td + 5 above, Td - 7 below.
        self.Tup = self.Td + 5 if T_upperbound is None else T_upperbound
        self.Tlow = self.Td - 7 if T_lowerbound is None else T_lowerbound

        if theta_init is None:
            self.theta_init = np.array([self.Td - 5.0])
        else:
            self.theta_init = np.asarray(theta_init)

    def initialize(self):
        """
        initializes parameters of interest

        Lambdifies the symbolic functions, resets theta to theta_init,
        draws w and v uniformly from [-1, 1), and resets the average
        reward and the policy-update counter.

        Returns
        -------
        tuple
            (Rave, theta, w, v)
        """

        # first we lambdify some of the useful functions
        (self.mu, self.V, self.Q, self.dmu_dtheta, self.dV_dv, self.dQ_da,
         self.dQ_dw) = self.lambdify_funcs()

        # A float copy, so that theta never aliases theta_init and a later
        # re-initialization starts from the same values.
        self.theta = np.array(self.theta_init, dtype=float)

        # w has one entry per policy parameter (compatible function
        # approximation); v has one entry per baseline feature.
        self.w = np.random.uniform(low=-1,
                                   high=1.0,
                                   size=self.mu_grad_sym().shape[0])
        self.v = np.random.uniform(low=-1,
                                   high=1.0,
                                   size=self.V_grad_sym().shape[0])
        self.Rave = 0.0
        self.policy_update_ind = 0

        return self.Rave, self.theta, self.w, self.v

    def choose_action(self, state):
        """
        samples an acceptable action (temp. threshold)

        action is chosen based on the deterministic policy with some
        added noise for exploration. Here noise is chosen uniformly
        randomly from the range [-1.5, 1.5). Samples are redrawn until
        T < action <= Tup (when hs == 1) or Tlow <= action < T
        (when hs == 0).

        Raises
        ------
        ValueError
            if hs is neither 0 nor 1, or if the exploration window
            around the policy output does not overlap the admissible
            interval. In both cases no sample could be accepted, and the
            rejection loop would otherwise never terminate.
        """

        T, hs, aT, zT = state
        center = self.mu(state, self.theta)[0, 0]
        half_width = self._EXPLORATION_HALF_WIDTH

        if hs == 1:
            lower, upper = T, self.Tup
        elif hs == 0:
            lower, upper = self.Tlow, T
        else:
            raise ValueError(
                "hs must be 0 or 1, got {!r}".format(hs))

        # The candidate thresholds live in [center - hw, center + hw); if
        # that window misses the admissible interval nothing can be drawn.
        if max(lower, center - half_width) >= min(upper, center + half_width):
            raise ValueError(
                "no admissible threshold: policy output {} with noise "
                "+/-{} cannot fall between {} and {} (hs={})".format(
                    center, half_width, lower, upper, hs))

        while True:
            Tth = center + np.random.uniform(
                low=-half_width, high=half_width, size=None)
            if hs == 1:
                accepted = T < Tth <= self.Tup
            else:
                accepted = self.Tlow <= Tth < T
            if accepted:
                return Tth

    def learn(self, S, A, Sp, R, delta_time):
        """
        this function updates all the system parameters we want to learn

        this function takes the initial state (S) and the action taken
        at S (A) as well as the next state (Sp), the reward (R), and
        the transition time (delta_time), and use them all to update all
        the system parameters we want to learn (those initialized under
        the initialization function)

        delta_time is only used when PerTimeStep is True, in which case
        it must be non-zero because the average-reward update divides
        by it.

        Returns
        -------
        tuple
            (Rave, theta, w, v) after the update
        """

        self.policy_update_ind = self.policy_update_ind + 1

        # Everything below is evaluated with the parameters as they were
        # before this update.
        action = (A, )
        Q_S_A = self.Q(S, action, self.theta, self.v, self.w).item()
        # Q-learning target: the next state is evaluated at the action the
        # current deterministic policy would take there.
        next_action = (self.mu(Sp, self.theta).item(), )
        Q_Sp_muSp = self.Q(Sp, next_action, self.theta, self.v,
                           self.w).item()
        dmu_dtheta = self.dmu_dtheta(S, self.theta)
        dQ_da = self.dQ_da(S, action, self.theta, self.v, self.w)
        dQ_dw = self.dQ_dw(S, action, self.theta, self.v, self.w)
        dV_dv = self.dV_dv(S, self.v)

        if self.PerTimeStep:
            delt = R - self.Rave * delta_time + Q_Sp_muSp - Q_S_A
            self.Rave = self.Rave + self.eta * delt / delta_time
        else:
            delt = R - self.Rave + Q_Sp_muSp - Q_S_A
            self.Rave = self.Rave + self.eta * delt

        # The actor moves along dmu/dtheta * dQ/da, but only on every
        # policy_update_freq-th call.
        if self.policy_update_ind % self.policy_update_freq == 0:
            self.theta = self.theta + self.alpha_theta * np.matmul(
                dmu_dtheta, dQ_da).ravel()

        self.w = self.w + self.alpha_w * delt * dQ_dw.ravel()
        self.v = self.v + self.alpha_v * delt * dV_dv.ravel()

        return self.Rave, self.theta, self.w, self.v

    def symbols_list(self):
        """
        defines symbols for the parameters of interest

        Returns the tuples (s, theta, v, a, w): four state symbols, one
        policy parameter, three baseline weights, one action symbol and
        one w per policy parameter.
        """

        s = sp.symbols('T, hs, aT, zT')
        theta = sp.symbols('theta1:2')
        v = sp.symbols('v1:4')
        a = sp.symbols('a1:2')
        w = sp.symbols('w1:{}'.format(len(theta) + 1))
        return s, theta, v, a, w

    def mu_func_sym(self):
        """
        defines policy function symbolically

        mu = (theta1*(1 - hs) + Toff*hs)*zT + aT*(1 - zT), returned as a
        1x1 sympy Matrix; only the hs == 0 threshold is learnable
        """

        s, theta, v, a, w = self.symbols_list()
        T, hs, aT, zT = s
        threshold = theta[0] * (1 - hs) + self.Toff * hs
        mu = threshold * zT + (aT) * (1 - zT)
        return sp.Matrix([mu])

    def mu_grad_sym(self):
        """
        defines symbolically gradient of the policy function wrt its
        parameters (a column with one row per theta)
        """

        s, theta, v, a, w = self.symbols_list()
        theta_vec = sp.Matrix(list(theta))
        return self.mu_func_sym().jacobian(theta_vec).T

    def V_func_sym(self):
        """
        defines symbolically the baseline function

        V is linear in v over the features [hs, 1 - hs, T - Td]
        """

        s, theta, v, a, w = self.symbols_list()
        T, hs, aT, zT = s
        v_vec = sp.Matrix(list(v))
        feature_vec = sp.Matrix([hs, 1 - hs, T - self.Td])
        return v_vec.T * feature_vec

    def V_grad_sym(self):
        """
        defines symbolically gradient of the baseline function wrt v
        (a column with one row per v)
        """

        s, theta, v, a, w = self.symbols_list()
        v_vec = sp.Matrix(list(v))
        return self.V_func_sym().jacobian(v_vec).T

    def Q_func_sym(self):
        """
        defines symbolically the action-value function

        Q = (a - mu)^T * (dmu/dtheta)^T * w + V
        """

        s, theta, v, a, w = self.symbols_list()
        a_vec = sp.Matrix(list(a))
        w_vec = sp.Matrix(list(w))
        V = self.V_func_sym()
        mu = self.mu_func_sym()
        dmu_dtheta = self.mu_grad_sym()
        return (a_vec - mu).T * dmu_dtheta.T * w_vec + V

    def Q_grad_a_sym(self):
        """
        defines symbolically gradient (wrt action) of the action-value
        function
        """

        s, theta, v, a, w = self.symbols_list()
        a_vec = sp.Matrix(list(a))
        return self.Q_func_sym().jacobian(a_vec).T

    def Q_grad_w_sym(self):
        """
        defines symbolically gradient (wrt its parameters) of the
        action-value function
        """

        s, theta, v, a, w = self.symbols_list()
        w_vec = sp.Matrix(list(w))
        return self.Q_func_sym().jacobian(w_vec).T

    def lambdify_funcs(self):
        """
        Lambdifies the functions of interest with numpy as the target
        library

        Returns
        -------
        tuple
            (mu, V, Q, dmu_dtheta, dV_dv, dQ_da, dQ_dw). mu and
            dmu_dtheta are called as f(state, theta); V and dV_dv as
            f(state, v); the Q functions as f(state, a, theta, v, w).
        """

        s, theta, v, a, w = self.symbols_list()
        policy_args = [s, theta]
        baseline_args = [s, v]
        critic_args = [s, a, theta, v, w]

        mu_lambdified = sp.lambdify(policy_args, self.mu_func_sym(), 'numpy')
        V_lambdified = sp.lambdify(baseline_args, self.V_func_sym(), 'numpy')
        Q_lambdified = sp.lambdify(critic_args, self.Q_func_sym(), 'numpy')
        mu_grad_lambdified = sp.lambdify(policy_args, self.mu_grad_sym(),
                                         'numpy')
        V_grad_lambdified = sp.lambdify(baseline_args, self.V_grad_sym(),
                                        'numpy')
        Q_grad_a_lambdified = sp.lambdify(critic_args, self.Q_grad_a_sym(),
                                          'numpy')
        Q_grad_w_lambdified = sp.lambdify(critic_args, self.Q_grad_w_sym(),
                                          'numpy')
        return (mu_lambdified, V_lambdified, Q_lambdified,
                mu_grad_lambdified, V_grad_lambdified, Q_grad_a_lambdified,
                Q_grad_w_lambdified)

importing Jupyter notebook from Environment.ipynb
