In [None]:
def train(env, RL, max_simulation_days = 20, max_steps = 500):
    """Run the agent/environment interaction loop and record its history.

    Starting from ``env.reset()`` and ``RL.initialize()``, each iteration
    asks ``RL`` for an action, advances ``env`` with it and passes the
    resulting transition to ``RL.learn``.  The loop ends after ``max_steps``
    iterations, as soon as ``env.step`` reports ``done``, or once the
    accumulated time exceeds ``max_simulation_days``.

    Args:
        env: Environment providing ``reset()`` (returns the initial state),
            ``step(action)`` (returns the 7-tuple ``next_state, reward,
            T_series, reward_series, dt_step, t_series, done``) and a
            ``time_out`` attribute, which is only read for the message
            printed when ``done`` is true.
        RL: Agent providing ``initialize()``, ``choose_action(state=...)``
            and ``learn(S, A, Sp, R, dt_step)``.  ``initialize`` and
            ``learn`` each return four values, recorded here as
            ``Rave, theta_mu, theta_sigma, w_valuefunc``.
        max_simulation_days: Upper bound on elapsed time.  Times are divided
            by ``24*3600`` before the comparison, i.e. treated as seconds.
        max_steps: Maximum number of loop iterations.

    Returns:
        A tuple of 11 lists, in this order: ``Rave_stepseries,
        theta_mu_stepseries, theta_sigma_stepseries, w_valuefunc_stepseries,
        t_stepseries, t_timeseries, T_stepseries, T_timeseries,
        hs_stepseries, G_stepseries, G_timeseries``.  ``*_stepseries`` hold
        the initial value plus one entry per completed step;
        ``*_timeseries`` hold the initial value followed by the within-step
        series returned by ``env.step``.
    """
    state = env.reset()
    avg_reward, mu_params, sigma_params, value_weights = RL.initialize()

    # Agent-side histories, seeded with the values from RL.initialize().
    # NOTE(review): the objects are stored as returned, not copied; if RL
    # updates its parameters in place every entry would alias the same
    # object -- confirm against the RL implementation.
    Rave_stepseries = [avg_reward]
    theta_mu_stepseries = [mu_params]
    theta_sigma_stepseries = [sigma_params]
    w_valuefunc_stepseries = [value_weights]

    # Environment-side histories.  state[0] is logged as "T" and state[1]
    # as "hs" (presumably a temperature and a setpoint/status flag --
    # confirm against env).  G is the cumulative reward, t the elapsed time.
    G_stepseries = [0]
    T_stepseries = [state[0]]
    t_stepseries = [0]
    hs_stepseries = [state[1]]
    G_timeseries = [0]
    T_timeseries = [state[0]]
    t_timeseries = [0]

    seconds_per_day = 24*3600

    for _ in range(max_steps):
        action = RL.choose_action(state = state)
        (next_state, reward, T_within_step, reward_within_step,
         dt_step, t_within_step, done) = env.step(action)
        avg_reward, mu_params, sigma_params, value_weights = RL.learn(
            state, action, next_state, reward, dt_step)
        state = next_state

        Rave_stepseries.append(avg_reward)
        theta_mu_stepseries.append(mu_params)
        theta_sigma_stepseries.append(sigma_params)
        w_valuefunc_stepseries.append(value_weights)

        T_stepseries.append(state[0])
        hs_stepseries.append(state[1])

        # Totals at the start of this step; the within-step reward and time
        # series are shifted by them before being appended.
        G_at_step_start = G_stepseries[-1]
        t_at_step_start = t_stepseries[-1]
        G_stepseries.append(G_at_step_start + reward)
        t_stepseries.append(t_at_step_start + dt_step)

        T_timeseries.extend(T_within_step)
        G_timeseries.extend([g + G_at_step_start for g in reward_within_step])
        t_timeseries.extend([t + t_at_step_start for t in t_within_step])

        if done:
            print ('It took more than time_out ({} hr) to transition from the last state to a new state'.format(env.time_out/3600))
            break

        # The time limit is checked against the last recorded time sample.
        if t_timeseries[-1]/seconds_per_day > max_simulation_days:
            print ('maximum simulation period ({} days) is reached'.format(max_simulation_days))
            break

    return (Rave_stepseries, theta_mu_stepseries, theta_sigma_stepseries,
            w_valuefunc_stepseries, t_stepseries, t_timeseries,
            T_stepseries, T_timeseries, hs_stepseries, G_stepseries,
            G_timeseries)


In [None]:
def train_deterministic(env, RL, max_simulation_days = 20, max_steps = 500):
    """Step ``RL`` through ``env`` and collect per-step and per-sample logs.

    The function resets the environment, initializes the agent and then
    loops: choose an action, step the environment, let the agent learn from
    the transition, log everything.  It stops after ``max_steps``
    iterations, when ``env.step`` returns a true ``done`` flag, or when the
    logged time exceeds ``max_simulation_days``.

    Args:
        env: Provides ``reset()``, ``step(action)`` -- which returns
            ``(next_state, reward, T_series, reward_series, dt_step,
            t_series, done)`` -- and ``time_out`` (read only when printing
            the ``done`` message).
        RL: Provides ``initialize()``, ``choose_action(state=...)`` and
            ``learn(S, A, Sp, R, dt_step)``; ``initialize`` and ``learn``
            return the four values logged as ``Rave, theta, w, v``.
        max_simulation_days: Time limit; logged times are divided by
            ``24*3600`` for the comparison, i.e. treated as seconds.
        max_steps: Maximum number of iterations of the loop.

    Returns:
        Tuple of 11 lists: ``Rave_stepseries, theta_stepseries,
        w_stepseries, v_stepseries, t_stepseries, t_timeseries,
        T_stepseries, T_timeseries, hs_stepseries, G_stepseries,
        G_timeseries``.  Step series contain the initial value and one value
        per completed step; time series contain the initial value followed
        by every within-step sample reported by ``env.step``.
    """
    current = env.reset()
    rave_now, theta_now, w_now, v_now = RL.initialize()

    # Agent parameter logs (objects are appended as returned by RL).
    Rave_stepseries, theta_stepseries = [rave_now], [theta_now]
    w_stepseries, v_stepseries = [w_now], [v_now]

    # Step-resolution logs: cumulative reward G, state[0] as T, elapsed
    # time t and state[1] as hs.
    G_stepseries, t_stepseries = [0], [0]
    T_stepseries, hs_stepseries = [current[0]], [current[1]]

    # Sample-resolution logs built from the series env.step returns.
    G_timeseries, T_timeseries, t_timeseries = [0], [current[0]], [0]

    day_length = 24*3600

    for _ in range(max_steps):
        chosen = RL.choose_action(state = current)
        step_result = env.step(chosen)
        (following, reward, T_samples, reward_samples,
         dt_step, t_samples, done) = step_result

        rave_now, theta_now, w_now, v_now = RL.learn(
            current, chosen, following, reward, dt_step)
        current = following

        for log, value in ((Rave_stepseries, rave_now),
                           (theta_stepseries, theta_now),
                           (w_stepseries, w_now),
                           (v_stepseries, v_now),
                           (T_stepseries, current[0]),
                           (hs_stepseries, current[1])):
            log.append(value)

        # Remember the totals before this step so the within-step samples
        # can be offset onto the global reward/time axes.
        G_before = G_stepseries[-1]
        t_before = t_stepseries[-1]
        G_stepseries.append(G_before + reward)
        t_stepseries.append(t_before + dt_step)

        T_timeseries.extend(T_samples)
        G_timeseries.extend([sample + G_before for sample in reward_samples])
        t_timeseries.extend([sample + t_before for sample in t_samples])

        if done:
            print ('It took more than time_out ({} hr) to transition from the last state to a new state in the training'.format(env.time_out/3600))
            break

        elapsed_days = t_timeseries[-1]/day_length
        if elapsed_days > max_simulation_days:
            print ('maximum simulation period for training ({} days) is reached'.format(max_simulation_days))
            break

    return (Rave_stepseries, theta_stepseries, w_stepseries, v_stepseries,
            t_stepseries, t_timeseries, T_stepseries, T_timeseries,
            hs_stepseries, G_stepseries, G_timeseries)

In [1]:
def train_deterministic_Vent(env, RL, max_simulation_days = 20, max_steps = 500):
    """Run the agent/environment loop for the four-component-state variant.

    Resets ``env``, initializes ``RL`` and then repeatedly chooses an
    action, steps the environment, lets the agent learn from the transition
    and logs the results.  The loop ends after ``max_steps`` iterations,
    when ``env.step`` reports ``done``, or once the logged time exceeds
    ``max_simulation_days``.  Nothing is printed except the message that
    explains an early stop.

    Args:
        env: Provides ``reset()``, ``step(action)`` -- returning the 8-tuple
            ``(next_state, reward, T_series, reward_series, ro_series,
            dt_step, t_series, done)`` -- and ``time_out`` (read only for
            the ``done`` message).  State components 0..3 are logged as
            ``T``, ``ro``, ``hs`` and ``vs``.
            NOTE(review): an earlier comment in this function listed the
            state layout as ``T, ro, hs, vs, zT, zro, aT, aro`` -- confirm
            against the environment.
        RL: Provides ``initialize()``, ``choose_action(state=...)`` and
            ``learn(S, A, Sp, R, dt_step)``; ``initialize`` and ``learn``
            return the four values logged as ``Rave, theta, w, v``.
        max_simulation_days: Time limit; logged times are divided by
            ``24*3600`` for the comparison, i.e. treated as seconds.
        max_steps: Maximum number of loop iterations.

    Returns:
        Tuple of 13 lists, in this order: ``Rave_stepseries,
        theta_stepseries, w_stepseries, v_stepseries, t_stepseries,
        t_timeseries, ro_timeseries, vs_stepseries, T_stepseries,
        T_timeseries, hs_stepseries, G_stepseries, G_timeseries``.
        ``*_stepseries`` hold the initial value plus one entry per completed
        step; ``*_timeseries`` hold the initial value followed by the
        within-step samples returned by ``env.step``.  A per-step ``ro``
        series is not part of the returned tuple.
    """
    S = env.reset()
    Rave, theta, w, v = RL.initialize()

    # Agent parameter histories (objects are stored as returned by RL).
    Rave_stepseries = [Rave]
    theta_stepseries = [theta]
    w_stepseries = [w]
    v_stepseries = [v]

    # Per-step histories: cumulative reward G, elapsed time t and the state
    # components logged as T, hs and vs.
    G_stepseries = [0]
    T_stepseries = [S[0]]
    t_stepseries = [0]
    hs_stepseries = [S[2]]
    vs_stepseries = [S[3]]

    # Per-sample histories assembled from the series env.step returns.
    G_timeseries = [0]
    T_timeseries = [S[0]]
    ro_timeseries = [S[1]]
    t_timeseries = [0]

    for _ in range(max_steps):
        A = RL.choose_action(state = S)
        (Sp, R, T_StepTimeSeries, reward_StepTimeSeries, ro_StepTimeSeries,
         dt_step, t_StepTimeSeries, done) = env.step(A)
        Rave, theta, w, v = RL.learn(S, A, Sp, R, dt_step)
        S = Sp

        Rave_stepseries.append(Rave)
        theta_stepseries.append(theta)
        w_stepseries.append(w)
        v_stepseries.append(v)

        T_stepseries.append(S[0])
        hs_stepseries.append(S[2])
        vs_stepseries.append(S[3])

        # Totals at the start of this step; the within-step reward and time
        # samples are relative to it and are shifted onto the global axes.
        G_start = G_stepseries[-1]
        t_start = t_stepseries[-1]
        G_stepseries.append(G_start + R)
        t_stepseries.append(t_start + dt_step)

        T_timeseries.extend(T_StepTimeSeries)
        ro_timeseries.extend(ro_StepTimeSeries)
        G_timeseries.extend([x + G_start for x in reward_StepTimeSeries])
        t_timeseries.extend([x + t_start for x in t_StepTimeSeries])

        if done:
            print ('It took more than time_out ({} hr) to transition from the last state to a new state'.format(env.time_out/3600))
            break

        # The time limit is checked against the last recorded time sample.
        if t_timeseries[-1]/(24*3600) > max_simulation_days:
            print ('maximum simulation period ({} days) is reached'.format(max_simulation_days))
            break

    return Rave_stepseries, theta_stepseries, w_stepseries, v_stepseries,\
           t_stepseries, t_timeseries, ro_timeseries, vs_stepseries,\
           T_stepseries, T_timeseries, hs_stepseries, G_stepseries, G_timeseries
    