In [1]:
import time, datetime
import copy
import os
import sys
import warnings
import torch
import numpy as np
from loguru import logger
import yaml
from utils import dumb_reward_plot
import gym
os.environ["CUDA_VISIBLE_DEVICES"]="0"
sys.path.append('./envs/cartpole-envs')
sys.path.append('./')
import cartpole_envs

from utils import plot_reward, plot_index
from mpc.mpc_cp import MPC
from baselines.NP_epi import NP

def prepare_dynamics(gym_config):
    """Instantiate the configured gym environments and pick the task subset.

    Args:
        gym_config: mapping that provides ``'dynamics_name'`` (environment ids
            passed one by one to ``gym.make``), ``'seed'`` and
            ``'task_dynamics_list'`` (indices into the instantiated
            environments).

    Returns:
        list: the environments selected by ``'task_dynamics_list'``, in the
        order the indices are listed.
    """
    env_ids = gym_config['dynamics_name']
    # NOTE(review): the seed is looked up but never applied to any environment;
    # the lookup is kept so a config without 'seed' still fails here.
    _seed = gym_config['seed']
    # Every listed environment is created, including ones no task index selects.
    all_envs = [gym.make(env_id) for env_id in env_ids]
    return [all_envs[idx] for idx in gym_config['task_dynamics_list']]

def load_config(config_path="config.yml"):
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_path: path of the YAML file to load.

    Returns:
        Whatever ``yaml.load`` produces for the file content.

    Raises:
        FileNotFoundError: if ``config_path`` is not an existing file. This is
            a subclass of ``Exception``, so callers catching ``Exception``
            keep working; the message text is unchanged.
    """
    if not os.path.isfile(config_path):
        raise FileNotFoundError("Configuration file is not found in the path: "+config_path)
    # The context manager closes the handle once parsing is done; previously
    # the file object was opened and never closed.
    with open(config_path) as f:
        # NOTE(review): FullLoader is more permissive than yaml.safe_load;
        # only load configuration files from a trusted source.
        return yaml.load(f, Loader=yaml.FullLoader)

In [2]:
# Experiment setup: load the YAML config, build the NP model and the MPC
# controller, and create the task environments used by the cells below.
# The commented-out line is the alternative config file.
# config = load_config('config/config_cpstable_np.yml')
config = load_config('config/config_swingup_robust.yml')
mpc_config = config['mpc_config']
gym_config = config['gym_config']
# Read from the config; not referenced again in the cells shown in this notebook.
render = gym_config['render']
np_config = config['NP_config']

# Model imported from baselines.NP_epi; later cells feed it transitions via
# data_process() and call reset()/train() on it.
model = NP(NP_config=np_config)
logger.info('Using model: {}', model.name)

mpc_controller = MPC(mpc_config=mpc_config)

# prepare task
# `task` is the list of environments selected by gym_config['task_dynamics_list'];
# the cells below only ever use task[0].
task = prepare_dynamics(gym_config)
# print(gym_config)

"""start DPGP-MBRL"""
# Bookkeeping state. Of these names only `log_name` is referenced by the later
# cells visible here (it is stamped with a datetime when the first log file is
# written); the others look like leftovers — TODO confirm before removing.
data_buffer = []
label_list = []
subtask_list = []
subtask_reward = []
subtask_succ_count = [0]
comp_trainable = [1]
task_reward = []
trainable = True
task_solved = False
subtask_solved = [False, False, False, False]
total_count = 0
task_epi = 0
log_name = None

total_tasks = 1

2020-07-07 17:12:19.689 | INFO     | __main__:<module>:9 - Using model: NP


In [3]:
"""NP pretrain"""
# Discrete parameter values; unused below because the continuous sampling is active.
m_p_list = [0.3, 0.7]
l_list = [0.3, 0.7]
pretrain_episodes = 1
for task_idx in range(10):
    env = task[0]
    # Draw a fresh pole mass and pole length for every pretraining task.
    m_p = np.random.uniform(0.3, 0.7)
    l = np.random.uniform(0.3, 0.7)
    env.unwrapped.m_p, env.unwrapped.l = m_p, l
    for epi in range(pretrain_episodes):
        obs, done = env.reset(), False
        mpc_controller.reset()
        i = 0
        while not done:
            i += 1
            # Pretraining data comes from uniformly sampled actions, not from MPC.
            action = env.action_space.sample()
            obs_next, reward, done, _ = env.step(action)
            # Each record is [0, state, action, state difference].
            state_delta = obs_next - obs
            model.data_process([0, obs, action, state_delta])
            obs = obs_next
    # After the episodes of a task: reset the model, then run a training pass.
    model.reset()
    model.train()
# torch.save(model.model.state_dict(), './misc/log/model_test.pth')

In [4]:
# model2 = NP(NP_config=np_config)
# model2.model.load_state_dict(torch.load( './misc/log/model_test.pth'))

In [None]:
# `log_name` starts as None in the setup cell; the first finished episode
# stamps it with the current time so that every save below reuses one file.
# Evaluate the model with MPC and keep training it after every epoch.
test_episode = 1
test_epoch = 300
log = []
# Discrete parameter values; unused below because the continuous sampling is active.
m_p_list = [0.3, 0.7]
l_list = [0.3, 0.7]

for ep in range(test_epoch):
    for task_idx in range(1):
        task_steps = 0
        env = task[0]
        # Draw a fresh pole mass and pole length for this epoch.
        m_p = np.random.uniform(0.3, 0.7)
        l = np.random.uniform(0.3, 0.7)
        env.unwrapped.m_p = m_p
        env.unwrapped.l = l
        for epi in range(test_episode):
            obs = env.reset()
            O, A, R, acc_reward, done, V = [], [], [], 0, False, []
            mpc_controller.reset()
            i = 0
            # Build the planning copy once per episode instead of once per
            # control step: its parameters are fixed for the whole episode and
            # it is reset before every planning call anyway. Creating a new
            # environment on each step was loop-invariant work and left every
            # copy unclosed.
            env_copy = prepare_dynamics(gym_config)[0]
            env_copy.unwrapped.m_p = m_p
            env_copy.unwrapped.l = l
            while not done:
                i += 1
                env_copy.reset()
                if task_steps > 0:
                    action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                else:
                    # Very first step of the epoch: apply a zero action without planning.
                    action = np.array([0.0])
                # The fourth value returned by step() is collected as the
                # per-step violation indicator.
                obs_next, reward, done, violation = env.step(action)
                task_steps += 1
                A.append(action)
                O.append(obs_next)
                R.append(reward)
                V.append(violation)

                # Each record is [0, state, action, state difference].
                model.data_process([0, obs, action, obs_next - obs])
                obs = obs_next
                acc_reward += reward
            env_copy.close()

            # The loop above runs at least once, so V is never empty here.
            violation_rate = sum(V)/len(V)
            print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', violation_rate)
            env.close()

            # The while-loop only exits once `done` is set, so the episode
            # summary is always recorded.
            samples = {
                "obs": np.array(O),
                "actions": np.array(A),
                "rewards": np.array(R),
                "reward_sum": acc_reward,
                "violation_rate": violation_rate
            }
            log.append(samples)
            if log_name is None:
                log_name = datetime.datetime.now()
            path = './misc/log/np_adaptation' + log_name.strftime("%d-%H-%M") + '.npy'
            # The whole log is rewritten after every episode, so an interrupted
            # run still leaves the episodes finished so far on disk.
            np.save(path, log, allow_pickle=True)
            dumb_reward_plot(path)
        # After the episodes of an epoch: reset the model, then run a training pass.
        model.reset()
        model.train()

pole_mass:  0.6444561713555839 pole_length:  0.40269517328413584 step:  200 acc_reward:  8.058455484034104 violation_rate:  0.0
pole_mass:  0.31800205678250876 pole_length:  0.6598690930009012 step:  200 acc_reward:  11.359121118294404 violation_rate:  0.045
pole_mass:  0.47425849080156735 pole_length:  0.6000070508708291 step:  200 acc_reward:  47.161480789723186 violation_rate:  0.1
pole_mass:  0.337123378654993 pole_length:  0.5750115351113498 step:  200 acc_reward:  23.563826472750723 violation_rate:  0.02
pole_mass:  0.5154667488169822 pole_length:  0.692585671241827 step:  200 acc_reward:  54.51951743824596 violation_rate:  0.15
pole_mass:  0.4358735061307504 pole_length:  0.3503362747532297 step:  200 acc_reward:  40.193319971282484 violation_rate:  0.055
pole_mass:  0.5807264512588632 pole_length:  0.6416040780394315 step:  200 acc_reward:  40.328024240202275 violation_rate:  0.03
pole_mass:  0.5552989261499768 pole_length:  0.4565785847959813 step:  200 acc_reward:  51.5488319

pole_mass:  0.5800335595764814 pole_length:  0.5020465471764977 step:  200 acc_reward:  65.41007667772541 violation_rate:  0.0
pole_mass:  0.6041062787239677 pole_length:  0.6868801746074387 step:  200 acc_reward:  59.389238257135055 violation_rate:  0.03
pole_mass:  0.6151888158356036 pole_length:  0.6564979160235828 step:  200 acc_reward:  92.97368769140677 violation_rate:  0.0
pole_mass:  0.5272346600437731 pole_length:  0.5893082526104159 step:  200 acc_reward:  120.61682577297138 violation_rate:  0.0
pole_mass:  0.42324229070632813 pole_length:  0.5632904137917669 step:  200 acc_reward:  95.36425761809949 violation_rate:  0.0
pole_mass:  0.6549338359100891 pole_length:  0.4527207399563733 step:  200 acc_reward:  144.5238505975788 violation_rate:  0.005
pole_mass:  0.4887750503850905 pole_length:  0.4154072596469818 step:  200 acc_reward:  96.23959688611477 violation_rate:  0.0
pole_mass:  0.42528402562880485 pole_length:  0.3598450164004036 step:  200 acc_reward:  54.9321619676758

pole_mass:  0.312015104407229 pole_length:  0.6374390839099946 step:  200 acc_reward:  72.19585973887355 violation_rate:  0.0
pole_mass:  0.4909175121010897 pole_length:  0.32761268960583545 step:  200 acc_reward:  70.4102803603366 violation_rate:  0.06
pole_mass:  0.3918629794725008 pole_length:  0.5799266661756931 step:  200 acc_reward:  100.27358239454567 violation_rate:  0.01
pole_mass:  0.471729866084706 pole_length:  0.3470245236243288 step:  200 acc_reward:  64.06582107531341 violation_rate:  0.02
pole_mass:  0.6907457526358756 pole_length:  0.6178923808969148 step:  200 acc_reward:  55.60452347259415 violation_rate:  0.0
pole_mass:  0.3673137639417971 pole_length:  0.574326584135527 step:  200 acc_reward:  62.49439875166003 violation_rate:  0.015
pole_mass:  0.48069695315656924 pole_length:  0.48056890257479484 step:  200 acc_reward:  176.23399591105368 violation_rate:  0.0
pole_mass:  0.3570197677734288 pole_length:  0.6937693379205644 step:  200 acc_reward:  74.2125851698579 

pole_mass:  0.6230482371171967 pole_length:  0.4414464027537346 step:  200 acc_reward:  104.41917735339509 violation_rate:  0.0
pole_mass:  0.6117215847095708 pole_length:  0.5933934456927377 step:  200 acc_reward:  151.3854980929227 violation_rate:  0.0


In [1]:
# Save the trained weights; the file name reuses the timestamp stored in `log_name`.
# NOTE(review): the recorded NameError suggests this cell was run in a fresh
# kernel; it needs `torch`, `model` and `log_name` from the cells above.
model_state = model.model.state_dict()
weights_path = './misc/log/robust_model_' + log_name.strftime("%d-%H-%M") + '.pth'
torch.save(model_state, weights_path)

NameError: name 'torch' is not defined

In [7]:
# Show the log file path; `path` is assigned by the training loop each time it saves.
print(path)

./misc/log/np_adaptation06-16-13.npy


In [8]:
# Rebuild the MPC controller from the robust swing-up config before evaluating.
config = load_config('config/config_swingup_robust.yml')
mpc_config = config['mpc_config']
mpc_controller = MPC(mpc_config=mpc_config)
# Evaluate the model with MPC. Unlike the training cell, model.train() is never
# called here and nothing is written to disk; results are kept in `log` only.
test_episode = 1
test_epoch = 10
log = []
# Pole mass / pole length are drawn from these two-element sets.
m_p_list = [0.3, 0.7]
l_list = [0.3, 0.7]

for ep in range(test_epoch):
    for task_idx in range(1):
        task_steps = 0
        env = task[0]
        m_p = m_p_list[np.random.randint(2)]
        l = l_list[np.random.randint(2)]
        env.unwrapped.m_p = m_p
        env.unwrapped.l = l
        for epi in range(test_episode):
            obs = env.reset()
            O, A, R, acc_reward, done, V = [], [], [], 0, False, []
            mpc_controller.reset()
            i = 0
            # Build the planning copy once per episode instead of once per
            # control step: its parameters are fixed for the whole episode and
            # it is reset before every planning call anyway. Creating a new
            # environment on each step was loop-invariant work and left every
            # copy unclosed.
            env_copy = prepare_dynamics(gym_config)[0]
            env_copy.unwrapped.m_p = m_p
            env_copy.unwrapped.l = l
            while not done:
                i += 1
                env_copy.reset()
                if task_steps > 0:
                    action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                else:
                    # Very first step of the epoch: apply a zero action without planning.
                    action = np.array([0.0])
                # The fourth value returned by step() is collected as the
                # per-step violation indicator.
                obs_next, reward, done, violation = env.step(action)
                task_steps += 1
                A.append(action)
                O.append(obs_next)
                R.append(reward)
                V.append(violation)

                # Each record is [0, state, action, state difference].
                model.data_process([0, obs, action, obs_next - obs])
                obs = obs_next
                acc_reward += reward
            env_copy.close()

            # The loop above runs at least once, so V is never empty here.
            violation_rate = sum(V)/len(V)
            print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', violation_rate)
            env.close()

            # The while-loop only exits once `done` is set, so the episode
            # summary is always recorded.
            samples = {
                "obs": np.array(O),
                "actions": np.array(A),
                "rewards": np.array(R),
                "reward_sum": acc_reward,
                "violation_rate": violation_rate
            }
            log.append(samples)

        model.reset()

pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  155.37174407349997 violation_rate:  0.035
pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  152.98799105235113 violation_rate:  0.09
pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  111.36387528293744 violation_rate:  0.0
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  96.15357172916583 violation_rate:  0.285
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  155.69317472118897 violation_rate:  0.095
pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  158.11743615910416 violation_rate:  0.035
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  137.99629515002746 violation_rate:  0.235
pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  97.34392621617955 violation_rate:  0.0
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  123.25752336064429 violation_rate:  0.22
pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  173.97169360055324 violation_rate:  0.0


In [4]:
# Evaluation labelled "only_prior_model" by the author: model.train() is never
# called in this cell and nothing is written to disk; results stay in `log`.
# NOTE(review): model.data_process() is still fed every transition — confirm
# that this does not adapt the model if a prior-only evaluation is intended.
test_episode = 1
test_epoch = 10
log = []
# Pole mass / pole length are drawn from these two-element sets.
m_p_list = [0.3, 0.7]
l_list = [0.3, 0.7]

for ep in range(test_epoch):
    for task_idx in range(1):
        task_steps = 0
        env = task[0]
        m_p = m_p_list[np.random.randint(2)]
        l = l_list[np.random.randint(2)]
        env.unwrapped.m_p = m_p
        env.unwrapped.l = l
        for epi in range(test_episode):
            obs = env.reset()
            O, A, R, acc_reward, done, V = [], [], [], 0, False, []
            mpc_controller.reset()
            i = 0
            # Build the planning copy once per episode instead of once per
            # control step: its parameters are fixed for the whole episode and
            # it is reset before every planning call anyway. Creating a new
            # environment on each step was loop-invariant work and left every
            # copy unclosed.
            env_copy = prepare_dynamics(gym_config)[0]
            env_copy.unwrapped.m_p = m_p
            env_copy.unwrapped.l = l
            while not done:
                i += 1
                env_copy.reset()
                if task_steps > 0:
                    action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                else:
                    # Very first step of the epoch: apply a zero action without planning.
                    action = np.array([0.0])
                # The fourth value returned by step() is collected as the
                # per-step violation indicator.
                obs_next, reward, done, violation = env.step(action)
                task_steps += 1
                A.append(action)
                O.append(obs_next)
                R.append(reward)
                V.append(violation)

                # Each record is [0, state, action, state difference].
                model.data_process([0, obs, action, obs_next - obs])
                obs = obs_next
                acc_reward += reward
            env_copy.close()

            # The loop above runs at least once, so V is never empty here.
            violation_rate = sum(V)/len(V)
            print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', violation_rate)
            env.close()

            # The while-loop only exits once `done` is set, so the episode
            # summary is always recorded.
            samples = {
                "obs": np.array(O),
                "actions": np.array(A),
                "rewards": np.array(R),
                "reward_sum": acc_reward,
                "violation_rate": violation_rate
            }
            log.append(samples)

        model.reset()

pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  63.502598919684274 violation_rate:  0.085
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  91.83178738314267 violation_rate:  0.075
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  76.59254432443666 violation_rate:  0.045
pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  51.18483559010456 violation_rate:  0.095
pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  41.23595679727575 violation_rate:  0.0
pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  62.29221715500605 violation_rate:  0.06
pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  68.62416004089802 violation_rate:  0.07
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  112.86228850228765 violation_rate:  0.005
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  59.913114562337505 violation_rate:  0.05
pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  35.878112404024925 violation_rate:  0.04


In [14]:
import random

# Scratch check: draw two distinct elements from a three-element list.
a = list(range(3))
s = random.sample(a, k=2)
s

[0, 1]

In [10]:
from collections import deque
import random

# Scratch check: a bounded deque of (list, list) pairs can be sampled directly.
xx, yy = [1, 2, 3], [4, 5, 6]
a = deque([(xx, yy), (yy, xx)], maxlen=100)
# k=1 still returns a list, holding the single sampled pair.
s = random.sample(a, k=1)
s

[([4, 5, 6], [1, 2, 3])]

In [12]:
# Split the single sampled pair into its two elements.
x, y = s[0]

In [13]:
# Display the first element of the sampled pair.
x

[4, 5, 6]