In [1]:
import time, datetime
import copy
import os
import sys
import warnings
import torch
import numpy as np
from loguru import logger
import yaml
from utils import dumb_reward_plot
import gym
os.environ["CUDA_VISIBLE_DEVICES"]="1"
sys.path.append('./envs/cartpole-envs')
sys.path.append('./')
import cartpole_envs

from utils import plot_reward, plot_index
from mpc.mpc_cp import MPC
from baselines.NP_epi import NP

def prepare_dynamics(gym_config):
    """Instantiate the configured gym environments and pick the task subset.

    Args:
        gym_config: mapping providing
            'dynamics_name'      - sequence of gym environment ids,
            'seed'               - looked up but not applied in this function,
            'task_dynamics_list' - indices into the created environments.

    Returns:
        A list holding, for every index in 'task_dynamics_list' (in order),
        the environment created from the matching 'dynamics_name' entry.
        A repeated index yields the same environment object again.
    """
    env_ids = gym_config['dynamics_name']
    # Kept only so that a config without 'seed' still fails here.
    _seed = gym_config['seed']
    created_envs = [gym.make(env_id) for env_id in env_ids]
    return [created_envs[idx] for idx in gym_config['task_dynamics_list']]

def load_config(config_path="config.yml"):
    """Load a YAML configuration file and return its parsed content.

    Args:
        config_path: path of the YAML file to read (default "config.yml").

    Returns:
        Whatever ``yaml.load`` produces for the file content.

    Raises:
        FileNotFoundError: if ``config_path`` is not an existing regular file.
            This is a subclass of ``Exception``, so callers that catch
            ``Exception`` keep working; the message is unchanged.
    """
    if not os.path.isfile(config_path):
        raise FileNotFoundError(
            "Configuration file is not found in the path: " + str(config_path))
    # The context manager closes the handle even if parsing raises.
    with open(config_path) as f:
        # NOTE: FullLoader is not meant for untrusted input; only feed it
        # configuration files you control.
        return yaml.load(f, Loader=yaml.FullLoader)

In [3]:
# Setup cell: load the experiment configuration, build the NP dynamics model
# and the MPC controller, create the task environments, and initialise the
# notebook-level bookkeeping variables.
# config = load_config('config/config_cpstable_np.yml')
config = load_config('config/config_swingup_robust.yml')
# Split the configuration into its per-component sections.
mpc_config = config['mpc_config']
gym_config = config['gym_config']
render = gym_config['render']
np_config = config['NP_config']

# Build the model and restore previously saved weights into `model.model`.
# NOTE(review): this file name has no '-' before the timestamp, whereas the
# torch.save cell of this notebook writes 'robust_model_latent-<timestamp>.pth';
# confirm which checkpoint is actually intended.
model = NP(NP_config=np_config)
model.model.load_state_dict(torch.load( './misc/log/robust_model_latent07-14-54.pth'))

logger.info('Using model: {}', model.name)

mpc_controller = MPC(mpc_config=mpc_config)

# prepare task
# `task` is the list of environments selected by gym_config['task_dynamics_list'].
task = prepare_dynamics(gym_config)
# print(gym_config)

"""start DPGP-MBRL"""
# Bookkeeping state. Within the visible cells only `log_name` is used: the
# training cell sets it to a timestamp on the first save and the checkpoint
# cell reads it. The remaining names are not referenced by the visible cells.
data_buffer = []
label_list = []
subtask_list = []
subtask_reward = []
subtask_succ_count = [0]
comp_trainable = [1]
task_reward = []
trainable = True
task_solved = False
subtask_solved = [False, False, False, False]
total_count = 0
task_epi = 0
log_name = None

total_tasks = 1

2020-07-07 20:32:10.835 | INFO     | __main__:<module>:11 - Using model: NP


In [5]:
# Training cell: for each epoch, draw a random pole mass / length, run one
# MPC-controlled episode on task[0], feed every transition to the model, save
# the episode log, then call model.reset() and model.train().
# log_name == None
"""testing the model with MPC while training """
test_episode = 1
test_epoch = 300
log = []
# These two lists are only referenced by the commented-out sampling lines below.
m_p_list = [0.3, 0.7]
l_list = [0.3, 0.7]

for ep in range(test_epoch):
    for task_idx in range(1):
        task_steps = 0
        # NOTE(review): the same env object (task[0]) is reused every epoch,
        # although env.close() is called on it after each episode below.
        env = task[0]
#         m_p = m_p_list[np.random.randint(2)]
#         l = l_list[np.random.randint(2)]
        # Sample the physical parameters uniformly from [0.3, 0.7) and write
        # them onto the unwrapped environment.
        m_p = np.random.uniform(0.3, 0.7)
        l = np.random.uniform(0.3, 0.7)
        env.unwrapped.m_p = m_p
        env.unwrapped.l = l
        for epi in range(test_episode):
            # NOTE(review): acc_reward is initialised again two lines below.
            acc_reward = 0
            obs = env.reset()
            # Per-episode buffers: observations, actions, rewards, violations.
            O, A, R, acc_reward, done, V = [], [], [], 0, False, []
            mpc_controller.reset()
            i = 0
            while not done:
                i+= 1
                # NOTE(review): prepare_dynamics() creates every environment in
                # gym_config['dynamics_name'] on each control step and the copy
                # is never closed; consider whether it can be built once.
                env_copy = prepare_dynamics(gym_config)[0]
                env_copy.unwrapped.m_p = m_p
                env_copy.unwrapped.l = l
                env_copy.reset()
                # task_steps is 0 only on the very first step of the task, so
                # that step uses a zero action instead of the controller.
                if task_steps > 0:
                    action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                else:
                    action = np.array([0.0])
                # The fourth value returned by env.step is treated as a
                # per-step violation value (summed and averaged below).
                obs_next, reward, done, violation = env.step(action)
                task_steps += 1
                A.append(action)
                O.append(obs_next)
                R.append(reward)
                V.append(violation)

                # Hand the transition to the model as [0, state, action, state delta].
                # presumably the leading 0 is a task/label index - TODO confirm
                model.data_process([0, obs, action, obs_next - obs])
#                 if task_steps > 2:
#                     model.train()
                obs = obs_next
                acc_reward += reward
#             print('task: ', task_idx,'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', sum(V)/len(V))
            print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', sum(V)/len(V))
            env.close()

            # The while loop above has no break, so `done` is always truthy here.
            if done:
                samples = {
                    "obs": np.array(O),
                    "actions": np.array(A),
                    "rewards": np.array(R),
                    "reward_sum": acc_reward,
                    "violation_rate": sum(V)/len(V)
                }
                log.append(samples)
                # Fix the timestamp on the first save so all later saves
                # overwrite the same file.
                if log_name is None:
                    log_name = datetime.datetime.now()
                path = './misc/log/np_adaptation' + log_name.strftime("%d-%H-%M") + '.npy'
#                 print(path)
                # The whole accumulated log is rewritten after every episode.
                np.save(path, log, allow_pickle=True)
                dumb_reward_plot(path)
        # After each task: reset the model, then train it.
        model.reset()
        model.train()

pole_mass:  0.42159424496317416 pole_length:  0.5946032987013766 step:  50 acc_reward:  6.015337474486625 violation_rate:  0.24
pole_mass:  0.5070668966193925 pole_length:  0.6549498711821964 step:  200 acc_reward:  45.84626589665886 violation_rate:  0.185
pole_mass:  0.6714989408928683 pole_length:  0.5653388407917275 step:  200 acc_reward:  56.19540336212693 violation_rate:  0.2
pole_mass:  0.30979875951820507 pole_length:  0.4053991382810651 step:  200 acc_reward:  65.14718118139945 violation_rate:  0.25
pole_mass:  0.4991135141863573 pole_length:  0.5119395988514462 step:  200 acc_reward:  47.80208526817609 violation_rate:  0.195
pole_mass:  0.4404587433501503 pole_length:  0.41914865379328686 step:  200 acc_reward:  60.236490626513216 violation_rate:  0.275
pole_mass:  0.4404396633469533 pole_length:  0.3927983076683769 step:  200 acc_reward:  64.35555473600199 violation_rate:  0.25
pole_mass:  0.408743774284645 pole_length:  0.6611581617611527 step:  200 acc_reward:  61.134061526

pole_mass:  0.39006514741123355 pole_length:  0.3230351007369654 step:  200 acc_reward:  85.92534159243972 violation_rate:  0.05
pole_mass:  0.6652708604853528 pole_length:  0.6466359740787486 step:  200 acc_reward:  76.8782744401633 violation_rate:  0.1
pole_mass:  0.6846197113165325 pole_length:  0.48316012801587155 step:  200 acc_reward:  99.08523298915772 violation_rate:  0.0
pole_mass:  0.31136762362291825 pole_length:  0.42343109063829715 step:  200 acc_reward:  170.88986460079414 violation_rate:  0.095
pole_mass:  0.4827108685152066 pole_length:  0.651452066583874 step:  200 acc_reward:  59.23038203637748 violation_rate:  0.065
pole_mass:  0.682698762698696 pole_length:  0.511920617263578 step:  200 acc_reward:  119.6573758577895 violation_rate:  0.295
pole_mass:  0.5035567362619073 pole_length:  0.6287366844027942 step:  200 acc_reward:  43.48205614459425 violation_rate:  0.13
pole_mass:  0.32747675141374744 pole_length:  0.6410652853216325 step:  200 acc_reward:  92.0160595542

pole_mass:  0.6984775093129898 pole_length:  0.6358191398163585 step:  200 acc_reward:  168.87240984198863 violation_rate:  0.0
pole_mass:  0.4201493655767429 pole_length:  0.5298519640041127 step:  200 acc_reward:  128.94913000622856 violation_rate:  0.015
pole_mass:  0.659423542341693 pole_length:  0.32939664128440305 step:  200 acc_reward:  86.71126547279044 violation_rate:  0.095
pole_mass:  0.41042162712316155 pole_length:  0.6006645445144381 step:  200 acc_reward:  165.81754604700512 violation_rate:  0.09
pole_mass:  0.32821303060007617 pole_length:  0.3014477103437496 step:  117 acc_reward:  24.973855147128347 violation_rate:  0.07692307692307693
pole_mass:  0.6566323232250688 pole_length:  0.5483228655181719 step:  200 acc_reward:  143.08053849134507 violation_rate:  0.005
pole_mass:  0.531233776251215 pole_length:  0.4906877430014865 step:  200 acc_reward:  163.52667762619456 violation_rate:  0.08
pole_mass:  0.36240228403040775 pole_length:  0.6668267317414122 step:  200 acc_

pole_mass:  0.6983167230676762 pole_length:  0.4388356321319871 step:  200 acc_reward:  169.6298491156775 violation_rate:  0.005
pole_mass:  0.47867556646849796 pole_length:  0.6831162365159429 step:  200 acc_reward:  83.74932782660723 violation_rate:  0.02
pole_mass:  0.3717547221292343 pole_length:  0.601418186149985 step:  200 acc_reward:  171.81436994016485 violation_rate:  0.09
pole_mass:  0.5011931686236235 pole_length:  0.48805877913933493 step:  200 acc_reward:  121.31334464291186 violation_rate:  0.015
pole_mass:  0.4196231527935905 pole_length:  0.5648784612166053 step:  200 acc_reward:  47.13010672505579 violation_rate:  0.225
pole_mass:  0.508487541176959 pole_length:  0.6012997727398697 step:  200 acc_reward:  121.89233746793172 violation_rate:  0.025
pole_mass:  0.5171713953584751 pole_length:  0.43030916682993703 step:  200 acc_reward:  105.43593782138095 violation_rate:  0.0
pole_mass:  0.4063616497495348 pole_length:  0.4275366975256679 step:  200 acc_reward:  101.6749

KeyboardInterrupt: 

In [6]:
# Save the current model weights under a name stamped with `log_name`.
# `log_name` is still None until the training cell has saved at least one
# episode, in which case .strftime raises AttributeError.
# NOTE(review): the prefix here ends in '-', unlike the checkpoint path passed
# to torch.load in the setup cell ('robust_model_latent07-14-54.pth').
torch.save(model.model.state_dict(), './misc/log/robust_model_latent-' + log_name.strftime("%d-%H-%M") + '.pth')

In [7]:
# Show where the episode log was written; `path` is only defined once the
# training cell has completed at least one episode.
print(path)

./misc/log/np_adaptation07-14-54.npy


In [4]:
# Evaluation cell: run one MPC-controlled episode for every (pole mass,
# pole length) combination of the two lists below. Nothing is saved and
# model.train() is not called here.
# NOTE(review): this cell relies on `task`, `model` and `gym_config` created
# by earlier cells, and its execution count (In [4]) is out of order.
config = load_config('config/config_swingup_robust.yml')
mpc_config = config['mpc_config']
mpc_controller = MPC(mpc_config=mpc_config)
"""testing the model with MPC while training """
test_episode = 1
# NOTE(review): test_epoch is assigned but not used in this cell.
test_epoch = 10
log = []
m_p_list = [0.4, 0.5, 0.6]
l_list = [0.4, 0.6]

# NOTE(review): the recorded output under this cell reports pole_mass 0.7 and
# pole_length 0.7, which this grid cannot produce; the output looks stale.
for m_p in m_p_list:
    for l in l_list:
        task_steps = 0
        # NOTE(review): the same env object (task[0]) is reused for every
        # combination, although env.close() is called on it below.
        env = task[0]
#         m_p = m_p_list[np.random.randint(2)]
#         l = l_list[np.random.randint(2)]
        env.unwrapped.m_p = m_p
        env.unwrapped.l = l
        for epi in range(test_episode):
            # NOTE(review): acc_reward is initialised again two lines below.
            acc_reward = 0
            obs = env.reset()
            # Per-episode buffers: observations, actions, rewards, violations.
            O, A, R, acc_reward, done, V = [], [], [], 0, False, []
            mpc_controller.reset()
            i = 0
            while not done:
                i+= 1
                # NOTE(review): prepare_dynamics() creates every environment in
                # gym_config['dynamics_name'] on each control step and the copy
                # is never closed; consider whether it can be built once.
                env_copy = prepare_dynamics(gym_config)[0]
                env_copy.unwrapped.m_p = m_p
                env_copy.unwrapped.l = l
                env_copy.reset()
                # The first step of each combination uses a zero action
                # instead of the controller.
                if task_steps > 0:
                    action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                else:
                    action = np.array([0.0])
                # The fourth value returned by env.step is treated as a
                # per-step violation value (summed and averaged below).
                obs_next, reward, done, violation = env.step(action)
                task_steps += 1
                A.append(action)
                O.append(obs_next)
                R.append(reward)
                V.append(violation)

                # Transitions are still handed to the model during evaluation.
                model.data_process([0, obs, action, obs_next - obs])
                obs = obs_next
                acc_reward += reward
#             print('task: ', task_idx,'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', sum(V)/len(V))
            print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', sum(V)/len(V))
            env.close()

            # The while loop above has no break, so `done` is always truthy here.
            if done:
                samples = {
                    "obs": np.array(O),
                    "actions": np.array(A),
                    "rewards": np.array(R),
                    "reward_sum": acc_reward,
                    "violation_rate": sum(V)/len(V)
                }
                # Results are kept in memory only; saving is commented out below.
                log.append(samples)
#                 if log_name is None:
#                     log_name = datetime.datetime.now()
#                 path = './misc/log/np_adaptation' + log_name.strftime("%d-%H-%M") + '.npy'
#                 np.save(path, log, allow_pickle=True)
#                 dumb_reward_plot(path)
            
        # Reset the model after each (m_p, l) combination.
        model.reset()

pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  95.48802804479807 violation_rate:  0.045


KeyboardInterrupt: 