In [1]:
'''
@Author:
@Email: 
@Date: 2020-04-01 01:26:48
@LastEditTime: 2020-04-16 00:15:58
@Description: 
'''

import time, datetime
import copy
import os
import sys
import warnings
#warnings.filterwarnings("ignore", category=UserWarning)
#warnings.filterwarnings("ignore", category=RuntimeWarning)

import numpy as np
from loguru import logger
import yaml

import gym
sys.path.append('./envs/cartpole-envs')
sys.path.append('./envs/highway-env')
import cartpole_envs
#import highway_env

from utils import plot_reward, plot_index, dumb_reward_plot
from mpc.mpc_cp import MPC

# all models
# from dpgpmm.DPGPMM import DPGPMM
from baselines.SingleGP import SingleGP
# from baselines.SingleSparseGP import SingleSparseGP
# from baselines.NN import NN


def prepare_dynamics(gym_config):
    """Build the ordered list of environments that make up the task.

    Args:
        gym_config: mapping that provides
            'dynamics_name': sequence of gym environment ids, one per
                distinct dynamics;
            'task_dynamics_list': sequence of indices into
                'dynamics_name' giving the order in which the dynamics
                are visited.

    Returns:
        A list with one entry per element of 'task_dynamics_list'. An index
        that appears more than once maps to the same environment instance.

    Raises:
        KeyError: if a required key is missing from gym_config.
        IndexError: if 'task_dynamics_list' refers to a dynamics that does
            not exist.

    Note:
        gym_config['seed'] is not applied to the environments here.
    """
    # Instantiate every environment exactly once; the previous version called
    # gym.make twice per id and discarded the first instance.
    dynamics_set = [gym.make(name) for name in gym_config['dynamics_name']]

    # use pre-defined env sequence
    return [dynamics_set[i] for i in gym_config['task_dynamics_list']]


def load_config(config_path="config.yml"):
    """Load a YAML configuration file and return its parsed content.

    Args:
        config_path: path of the YAML file to read.

    Returns:
        Whatever ``yaml.load`` produces for the file content.

    Raises:
        FileNotFoundError: if ``config_path`` is not an existing regular file.
            This is a subclass of ``Exception``, so callers that catch
            ``Exception`` keep working.
    """
    if not os.path.isfile(config_path):
        raise FileNotFoundError("Configuration file is not found in the path: "+config_path)

    # The context manager guarantees the handle is closed even if parsing
    # raises; the previous version never closed the file.
    with open(config_path) as f:
        # NOTE(review): FullLoader is more permissive than yaml.safe_load;
        # only use this on trusted configuration files.
        return yaml.load(f, Loader=yaml.FullLoader)



In [2]:

# ---- configuration ---------------------------------------------------------
# Alternative config file: load_config('config_DPGP_MBRL.yml')
config = load_config('./config/config_swingup.yml')

# One section of the YAML file per component.
dpgp_config = config['DPGP_config']
gp_config = config['SingleGP_config']
sparse_gp_config = config['SingleSparseGP_config']
nn_config = config['NN_config']
mpc_config = config['mpc_config']
gym_config = config['gym_config']
render = gym_config['render']

# ---- dynamics model --------------------------------------------------------
# Only SingleGP is imported in this notebook; the alternatives are:
#   DPGPMM(dpgp_config=dpgp_config)
#   SingleSparseGP(sparse_gp_config=sparse_gp_config)
#   NN(NN_config=nn_config)
model = SingleGP(gp_config=gp_config)
logger.info('Using model: {}', model.name)

# ---- controller ------------------------------------------------------------
mpc_controller = MPC(mpc_config=mpc_config)

# ---- task ------------------------------------------------------------------
# the task is solved, if each dynamic is solved
task = prepare_dynamics(gym_config)

"""start DPGP-MBRL"""
# ---- shared experiment state (read and mutated by later cells) ---------------
data_buffer, label_list, subtask_list, subtask_reward = [], [], [], []
subtask_succ_count, comp_trainable = [0], [1]
task_reward = []
trainable, task_solved = True, False
subtask_solved = [False] * 4
total_count = task_epi = 0
log = []
log_name = None

# if model.name == 'NN':
#     pretrain_episodes = 10
#     print('pretrain~~~~~~~~~~~~~~~~~~~~~~~')
#     for task_idx in range(len(task)):
#         env = task[task_idx]
#         # data collection
#         for epi in range(pretrain_episodes):
#             obs = env.reset()
#             done = False
#             mpc_controller.reset()
#             while not done:
#                 action = env.action_space.sample()
#                 obs_next, reward, done, state_next = env.step(action)
#                 model.data_process([0, obs, action, obs_next - obs])
#                 obs = copy.deepcopy(obs_next)


2020-07-02 16:00:23.563 | INFO     | __main__:<module>:17 - Using model: SingleGP


In [6]:
# Robustness experiment: control each environment in `task` with the MPC
# controller while fitting the dynamics model online, re-sampling the pole
# parameters before every episode.
#
# Relies on names created by earlier cells: `task`, `model`, `mpc_controller`,
# `render`, `label_list`, `log`, `log_name`, `total_count`.
# `log`, `label_list`, `log_name` and `total_count` are NOT reset here, so
# re-running this cell continues from (and appends to) the previous run.
m_p_list = [0.3, 0.7]      # candidate values assigned to env.unwrapped.m_p
l_list = [0.3, 0.7]        # candidate values assigned to env.unwrapped.l
NUM_TASK_EPISODES = 200    # passes over the whole task list
EPISODES_PER_SUBTASK = 1   # episodes per environment within one pass
LOG_DIR = './misc/log'     # where the episode log (.npy) is written

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(LOG_DIR, exist_ok=True)

if total_count == 0:
    # Very first step overall: fit the model on a single transition produced
    # by a random action so that it has data before the controller queries it.
    state = task[0].reset()
    action = task[0].action_space.sample()
    state_next, reward, done, info = task[0].step(action)
    model.fit(data=[0, state, action, state_next-state])
    label_list.append(0)

for task_epi in range(1, NUM_TASK_EPISODES + 1):
    # Different sub-tasks share the same action space.
    # Note that subtask_index is unknown to the model, it's for debugging.
    for subtask_index, env in enumerate(task):
        # Draw the parameters from the candidate lists; indexing by the list
        # length keeps the draw valid if the lists are edited.
        m_p = m_p_list[np.random.randint(len(m_p_list))]
        l = l_list[np.random.randint(len(l_list))]
        env.unwrapped.m_p = m_p
        env.unwrapped.l = l

        for epi in range(EPISODES_PER_SUBTASK):
            # Per-episode trajectory: observations, actions, rewards and the
            # fourth value returned by env.step (collected as "violation").
            O, A, R, V = [], [], [], []
            acc_reward, done = 0, False

            print('subtask: ', subtask_index, ', epi: ', epi)

            state = env.reset()
            O.append(state)
            # reset the controller at the beginning of each new dynamic
            mpc_controller.reset()
            i = 0  # number of environment steps taken in this episode
            while not done:
                i += 1
                if render:
                    env.render()

                total_count += 1
                label_list.append(subtask_index)

                # MPC policy (random alternative: env.action_space.sample())
                action = np.array([mpc_controller.act(task=env, model=model, state=state)])

                # interact with env
                state_next, reward, done, violation = env.step(action)
                acc_reward += reward

                A.append(action)
                O.append(state_next)
                R.append(reward)
                V.append(violation)

                # online model update with the observed state difference
                model.fit(data=[subtask_index, state, action, state_next - state])

                state = copy.deepcopy(state_next)

            # The while loop always runs at least once, so V is never empty.
            print('pole_mass: ', m_p, 'pole_length: ', l,'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', sum(V)/len(V))
            env.close()

            # The loop above only exits when `done` is true, so every episode
            # is logged unconditionally.
            samples = {
                "obs": np.array(O),
                "actions": np.array(A),
                "rewards": np.array(R),
                "reward_sum": acc_reward,
                "violations": np.array(V)
            }
            log.append(samples)
            if log_name is None:
                # fixed once so that all episodes of a run share one file
                log_name = datetime.datetime.now()
            path = LOG_DIR + '/gp_robust_' + log_name.strftime("%d-%H-%M") + '.npy'
            # the whole log is rewritten after every episode (checkpointing)
            np.save(path, log, allow_pickle=True)
            dumb_reward_plot(path)




subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  62.26636827446562 violation_rate:  0.105
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  72.43893868624312 violation_rate:  0.13
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  89.32001174075126 violation_rate:  0.13
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  69.36796178562388 violation_rate:  0.135
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  50.63889968672497 violation_rate:  0.03
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  66.18647839857552 violation_rate:  0.185
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  73.51658579494577 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  68.45619255891003 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  94.6011993194966 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  74.16306601941062 violation_rate:  0.045
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  91.92438739704546 violation_rate:  0.11
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  86.32355067632407 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  168 acc_reward:  47.64822166079517 violation_rate:  0.08333333333333333
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  91.9533245551284 violation_rate:  0.025
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  78.14014560004516 violation_rate:  0.055
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  52.343707242813856 violation_rate:  0.165
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  61.85695950058952 violation_rate:  0.195
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  72.59584282989331 violation_rate:  0.015
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  56.99216411163172 violation_rate:  0.045
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  47.850847317001765 violation_rate:  0.215
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  61.90824068257336 violation_rate:  0.1
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  62.92355314461723 violation_rate:  0.125
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  46.47669039119082 violation_rate:  0.1
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  63.6587053930593 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  41.18425609301605 violation_rate:  0.15
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  85.10054747974502 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  62.780976500366 violation_rate:  0.1
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  68.66475507768348 violation_rate:  0.025
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  65.16254920924759 violation_rate:  0.1
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  75.82195708786121 violation_rate:  0.04
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  85.99823496264722 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  47.51629461549416 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  87.67513948039952 violation_rate:  0.03
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  48.8883164238663 violation_rate:  0.17
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  57.953838841309825 violation_rate:  0.135
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  62.37165794057451 violation_rate:  0.165
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  73.85644287738391 violation_rate:  0.065
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  77.39903670904545 violation_rate:  0.205
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  57.00557344296831 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  55.64597673909398 violation_rate:  0.155
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  65.6224263471416 violation_rate:  0.135
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  54.53695927483135 violation_rate:  0.13
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  56.18356677239605 violation_rate:  0.05
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  53.244395153475836 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  52.80046965674932 violation_rate:  0.075




subtask:  0 , epi:  0
pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  58.08596468887019 violation_rate:  0.015
subtask:  0 , epi:  0
pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  66.70371688590542 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  54.920668299320035 violation_rate:  0.015
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  58.80345570446737 violation_rate:  0.07
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  64.2966386705459 violation_rate:  0.18
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  58.11230162258101 violation_rate:  0.085
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  68.20617689721816 violation_rate:  0.115
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  84 acc_reward:  29.61116691765225 violation_rate:  0.05952380952380952
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  67.4312778742567 violation_rate:  0.13
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  63.84444678140231 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  86.67732017145148 violation_rate:  0.04
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  102.37012464504575 violation_rate:  0.115
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  100.31969717080136 violation_rate:  0.09
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  102.01101584095672 violation_rate:  0.045
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  95.88198723885807 violation_rate:  0.045
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  103.80272418918067 violation_rate:  0.015
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  91.20939088744474 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  51.35615013792778 violation_rate:  0.095
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  57.8096475289906 violation_rate:  0.195
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  91.38425758192643 violation_rate:  0.055
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  74.44918783483482 violation_rate:  0.105
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  57.931323544001835 violation_rate:  0.155
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  96.28957845270045 violation_rate:  0.035
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  81.74841374411803 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  84.49233182416202 violation_rate:  0.065
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  100.98118470245093 violation_rate:  0.035
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  72.60021983889845 violation_rate:  0.1
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  84.33694674870755 violation_rate:  0.055
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  93.21365850726686 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  89.82081636462318 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  60.478880719829334 violation_rate:  0.11
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  67.30675447263712 violation_rate:  0.17
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  97.10290715531463 violation_rate:  0.055
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  88.95415709162151 violation_rate:  0.04
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  53.44135062098316 violation_rate:  0.17
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  63.90719899446579 violation_rate:  0.155
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  57.45102843704823 violation_rate:  0.07
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  74.86198264876413 violation_rate:  0.05
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  80.4820513482383 violation_rate:  0.035
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  57.09098584898504 violation_rate:  0.11
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  75.83914417624983 violation_rate:  0.02
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  50.52254784850315 violation_rate:  0.13
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  119.18628549044698 violation_rate:  0.02
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  95.36042520268376 violation_rate:  0.045
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  71.6773415458478 violation_rate:  0.17
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  74.78747498291281 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  83.51127515160458 violation_rate:  0.14
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  78.36604456533544 violation_rate:  0.055
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  50.41166887829741 violation_rate:  0.105
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  66.60351845183958 violation_rate:  0.175
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  61.4796414684131 violation_rate:  0.045
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  47.72676100108575 violation_rate:  0.11
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  50.8574662835781 violation_rate:  0.09
subtask:  0 , epi:  0
pole_mass:  0.7 pole_length:  0.7 step:  82 acc_reward:  26.72571877069893 violation_rate:  0.06097560975609756
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  59.701307626768966 violation_rate:  0.045
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  66.49994378209568 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  93.26643381274756 violation_rate:  0.065
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  49.39064614128713 violation_rate:  0.145
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  126.6157371509136 violation_rate:  0.025
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  96.50239973065987 violation_rate:  0.025
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  73.0460929083975 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  60.84737911306079 violation_rate:  0.165
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  84.627997031878 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  56.21145768593741 violation_rate:  0.085
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  115.92829265842464 violation_rate:  0.02
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  54.54306476760888 violation_rate:  0.065
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  111.66455024174104 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  68.80349281979862 violation_rate:  0.075
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  62.807970177351095 violation_rate:  0.17
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  58.83218342987578 violation_rate:  0.02
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  118 acc_reward:  29.997035897755318 violation_rate:  0.05084745762711865
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  50.88740131396316 violation_rate:  0.15
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  33.38187757747589 violation_rate:  0.125
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  40.26388523969947 violation_rate:  0.06
subtask:  0 , epi:  0
pole_mass:  0.7 pole_length:  0.7 step:  82 acc_reward:  13.247798602735902 violation_rate:  0.08536585365853659
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  31.209877211282688 violation_rate:  0.085
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  88.93781038498133 violation_rate:  0.025
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  56.17119027691876 violation_rate:  0.155
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  52.557842636335664 violation_rate:  0.035
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  77.6827195750999 violation_rate:  0.04
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  70.35160859295443 violation_rate:  0.065
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  78.48922462196707 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  88.99273261517908 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  67.02096621634078 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  74.32719767294213 violation_rate:  0.105
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  65.34679446493416 violation_rate:  0.16
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  62.8075199216669 violation_rate:  0.03
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  61.749897089982966 violation_rate:  0.15
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  79.69497180932498 violation_rate:  0.11
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  122.69799925134437 violation_rate:  0.035
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  98.34385990790312 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  93.72181499404276 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  100.80619259138209 violation_rate:  0.05
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  71.14000240742284 violation_rate:  0.24
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  76.70084616999387 violation_rate:  0.08
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  59.540786941859686 violation_rate:  0.095
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  90.89332450740162 violation_rate:  0.085
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  58.1414792064132 violation_rate:  0.125
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  75.38463279990857 violation_rate:  0.13
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  45.01383501591486 violation_rate:  0.06
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  67.7224155642713 violation_rate:  0.215
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  38.712429424145064 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  40.349247795924704 violation_rate:  0.075
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  54.42750972867739 violation_rate:  0.14
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  71.18521385870531 violation_rate:  0.18
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  63.712803429643714 violation_rate:  0.09
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  73.19842136405403 violation_rate:  0.03
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  59.04626577277112 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  100.74021914737922 violation_rate:  0.09
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  67.65875960586851 violation_rate:  0.025
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  49.07190167128632 violation_rate:  0.035
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  90.12827171305001 violation_rate:  0.05
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  88.5716351519669 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  101.89759157549949 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  60.33348101759021 violation_rate:  0.125
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  74.27851068103719 violation_rate:  0.18
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  70.2060923589025 violation_rate:  0.015
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  92.36321272514685 violation_rate:  0.055
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  66.31090231818781 violation_rate:  0.07
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  32.56844000975364 violation_rate:  0.1
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  56.08250880278703 violation_rate:  0.175
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  84.8177400443745 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  55.490398428640155 violation_rate:  0.195
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  71.8924018714505 violation_rate:  0.025
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  53.265139374813664 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  85.82505000801149 violation_rate:  0.015
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  71.29515497419573 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  48.5405450685683 violation_rate:  0.07
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  69.36252345587366 violation_rate:  0.03
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  126.95340918155227 violation_rate:  0.005
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  56.25201220114325 violation_rate:  0.085
subtask:  0 , epi:  0
pole_mass:  0.3 pole_length:  0.3 step:  57 acc_reward:  16.352362468364664 violation_rate:  0.017543859649122806
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  61.623414902348564 violation_rate:  0.235
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  71.88995224623378 violation_rate:  0.04
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  66.36372182232695 violation_rate:  0.17
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  68.31301082713588 violation_rate:  0.12
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  142 acc_reward:  48.12275786584308 violation_rate:  0.056338028169014086
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  94.25054291620316 violation_rate:  0.01
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  69.50807648721683 violation_rate:  0.015
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  50.77270228937626 violation_rate:  0.15
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  58.34254543308214 violation_rate:  0.195
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  63.3874626926718 violation_rate:  0.195
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  38.18460925689041 violation_rate:  0.115
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  47.621795125963075 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  68.35583126774051 violation_rate:  0.145
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  68.33900427993905 violation_rate:  0.11
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  53.37838827991164 violation_rate:  0.11
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  66.49951571526063 violation_rate:  0.18
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  70.23762169083541 violation_rate:  0.24
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  58.928423998897664 violation_rate:  0.0
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  93.04784088248806 violation_rate:  0.105
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  59.45330935031076 violation_rate:  0.06
subtask:  0 , epi:  0




pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  82.90913925048736 violation_rate:  0.065
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  84.5089707494448 violation_rate:  0.03
subtask:  0 , epi:  0




pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  34.69204822018689 violation_rate:  0.055


In [32]:
# Scratch check: draw one integer from the half-open range [0, 4).
np.random.randint(0, 4)

0