In [1]:
import time, datetime
import copy
import os
import sys
import warnings
import torch
import numpy as np
from loguru import logger
import yaml
from utils import dumb_reward_plot
import gym
os.environ["CUDA_VISIBLE_DEVICES"]="1"
sys.path.append('./envs/cartpole-envs')
sys.path.append('./')
import cartpole_envs

from utils import plot_reward, plot_index
from mpc.mpc_cp import MPC
from baselines.NP_epi import NP

def prepare_dynamics(gym_config):
    """Instantiate the configured gym environments and pick the task ones.

    Args:
        gym_config: Mapping that provides
            - 'dynamics_name': sequence of gym environment ids; each one is
              passed to ``gym.make``.
            - 'task_dynamics_list': sequence of indices into
              'dynamics_name' selecting the environments of the task.

    Returns:
        list: The selected environment objects, ordered as in
        'task_dynamics_list'. An index listed twice yields the same
        environment object twice.

    Note:
        A 'seed' entry in ``gym_config`` is not consumed here and no seeding
        is applied by this function, so the key is no longer required.
    """
    # Every configured environment is created, even those the task skips.
    dynamics_set = [gym.make(name) for name in gym_config['dynamics_name']]
    return [dynamics_set[i] for i in gym_config['task_dynamics_list']]

def load_config(config_path="config.yml"):
    """Load a YAML configuration file.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The parsed document as produced by ``yaml.load`` with
        ``yaml.FullLoader``.

    Raises:
        FileNotFoundError: If ``config_path`` is not an existing file. This
            is a subclass of ``Exception``, so callers catching ``Exception``
            keep working.
    """
    if not os.path.isfile(config_path):
        raise FileNotFoundError("Configuration file is not found in the path: " + config_path)
    # The context manager closes the handle even if parsing fails.
    # NOTE(review): FullLoader is meant for trusted files; prefer
    # yaml.safe_load if configs can ever come from an untrusted source.
    with open(config_path) as f:
        return yaml.load(f, Loader=yaml.FullLoader)

In [2]:
# --- Experiment configuration ------------------------------------------------
# Alternative setup: load_config('config/config_cpstable_np.yml')
config = load_config('config/config_swingup_robust.yml')
mpc_config, gym_config = config['mpc_config'], config['gym_config']
render = gym_config['render']
np_config = config['NP_config']

# --- Dynamics model and MPC controller ---------------------------------------
model = NP(NP_config=np_config)
logger.info('Using model: {}', model.name)
mpc_controller = MPC(mpc_config=mpc_config)

# --- Task environments -------------------------------------------------------
task = prepare_dynamics(gym_config)

"""start DPGP-MBRL"""
# Bookkeeping containers, all starting empty.
data_buffer, label_list = [], []
subtask_list, subtask_reward = [], []
task_reward = []

# Per-subtask counters and flags.
subtask_succ_count, comp_trainable = [0], [1]
subtask_solved = [False] * 4

# Global run state.
trainable, task_solved = True, False
total_count, task_epi = 0, 0
total_tasks = 1

# Timestamp used in log file names; assigned on the first save.
log_name = None

2020-07-10 12:16:13.946 | INFO     | __main__:<module>:9 - Using model: NP


In [3]:
"""NP pretrain"""
# Discrete candidate values, only used by the disabled sampling variant:
#     m_p = m_p_list[np.random.randint(2)]
#     l = l_list[np.random.randint(2)]
m_p_list, l_list = [0.2, 0.8], [0.2, 0.8]
pretrain_episodes = 1

for task_idx in range(10):
    env = task[0]
    # Draw mass first, then length, so the RNG is consumed in the same order.
    m_p, l = np.random.uniform(0.2, 0.8), np.random.uniform(0.2, 0.8)
    env.unwrapped.m_p, env.unwrapped.l = m_p, l

    for epi in range(pretrain_episodes):
        obs, done = env.reset(), False
        mpc_controller.reset()
        i = 0
        while not done:
            i += 1
            # Pretraining data comes from uniformly sampled random actions.
            action = env.action_space.sample()
            obs_next, reward, done, _ = env.step(action)
            # The model is fed the state delta rather than the next state.
            # NOTE(review): the leading 0 is presumably a task/label index - confirm.
            delta = obs_next - obs
            model.data_process([0, obs, action, delta])
            # Per-step training is disabled:
            #     if i > 3:
            #         model.train()
            obs = obs_next

    # One reset + train call per sampled (m_p, l) configuration.
    model.reset()
    model.train()
# torch.save(model.model.state_dict(), './misc/log/model_test.pth')

In [4]:
# model2 = NP(NP_config=np_config)
# model2.model.load_state_dict(torch.load( './misc/log/model_test.pth'))

In [5]:
# log_name == None
"""testing the model with MPC while training """
test_episode = 1
test_epoch = 300
log = []
m_p_list = [0.3, 0.7]
l_list = [0.3, 0.7]

# Build the planner's environment once. Previously prepare_dynamics() ran on
# every control step, which gym.make()s every configured environment each
# time and never closes any of them. The single instance is re-parameterised
# and reset before each planning call instead.
env_copy = prepare_dynamics(gym_config)[0]
try:
    for ep in range(test_epoch):
        for task_idx in range(1):
            task_steps = 0
            env = task[0]
            # Discrete alternative: m_p_list[np.random.randint(2)] / l_list[np.random.randint(2)]
            m_p = np.random.uniform(0.2, 0.8)
            l = np.random.uniform(0.2, 0.8)
            env.unwrapped.m_p = m_p
            env.unwrapped.l = l
            for epi in range(test_episode):
                obs = env.reset()
                O, A, R, acc_reward, done, V = [], [], [], 0, False, []
                mpc_controller.reset()
                i = 0
                while not done:
                    i += 1
                    # Give the planner a freshly reset env with the same
                    # physical parameters as the real one.
                    env_copy.unwrapped.m_p = m_p
                    env_copy.unwrapped.l = l
                    env_copy.reset()
                    if task_steps > 0:
                        action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                    else:
                        # Very first step of the task: apply a zero action.
                        action = np.array([0.0])
                    # The 4th value returned by step() is accumulated as a
                    # numeric violation indicator (averaged below).
                    obs_next, reward, done, violation = env.step(action)
                    task_steps += 1
                    A.append(action)
                    O.append(obs_next)
                    R.append(reward)
                    V.append(violation)

                    model.data_process([0, obs, action, obs_next - obs])
                    obs = obs_next
                    acc_reward += reward

                violation_rate = sum(V) / len(V)
                print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', violation_rate)
                env.close()

                # The loop above only exits once the episode is done, so the
                # episode summary is always recorded.
                samples = {
                    "obs": np.array(O),
                    "actions": np.array(A),
                    "rewards": np.array(R),
                    "reward_sum": acc_reward,
                    "violation_rate": violation_rate
                }
                log.append(samples)
                if log_name is None:
                    # Fix the timestamp on first save so all epochs share one file.
                    log_name = datetime.datetime.now()
                path = './misc/log/np_adaptation' + log_name.strftime("%d-%H-%M") + '.npy'
                # np.save does not create missing directories.
                os.makedirs(os.path.dirname(path), exist_ok=True)
                np.save(path, log, allow_pickle=True)
                dumb_reward_plot(path)
            # Retrain on the data gathered for this (m_p, l) configuration.
            model.reset()
            model.train()
finally:
    # Release the planner's environment even if the run is interrupted.
    env_copy.close()

pole_mass:  0.2682254192904986 pole_length:  0.7641647079072098 step:  23 acc_reward:  1.6610537930618485 violation_rate:  0.043478260869565216
pole_mass:  0.6537160875425138 pole_length:  0.5236700044899385 step:  200 acc_reward:  42.63532544004265 violation_rate:  0.355
pole_mass:  0.2284311950140213 pole_length:  0.7679035138193029 step:  200 acc_reward:  57.58432079876446 violation_rate:  0.21
pole_mass:  0.6782989919885076 pole_length:  0.7149772960808691 step:  200 acc_reward:  52.82021983051892 violation_rate:  0.255
pole_mass:  0.6490533078236105 pole_length:  0.34687072195328705 step:  29 acc_reward:  0.9288738579676027 violation_rate:  0.2413793103448276
pole_mass:  0.37413233296570403 pole_length:  0.73065513468654 step:  200 acc_reward:  38.367752268599205 violation_rate:  0.185
pole_mass:  0.4750200443195147 pole_length:  0.4529553489139125 step:  200 acc_reward:  24.53680210745485 violation_rate:  0.32
pole_mass:  0.538948216843983 pole_length:  0.660247194025112 step:  2

pole_mass:  0.6802054784903289 pole_length:  0.22140637972527985 step:  200 acc_reward:  33.942604181483176 violation_rate:  0.05
pole_mass:  0.24558387193196612 pole_length:  0.46184738700388994 step:  200 acc_reward:  150.5160968953595 violation_rate:  0.215
pole_mass:  0.4211325333635755 pole_length:  0.4071982996891522 step:  200 acc_reward:  46.19562494926389 violation_rate:  0.0
pole_mass:  0.4829184118754236 pole_length:  0.7320819562758691 step:  200 acc_reward:  107.51539568971091 violation_rate:  0.015
pole_mass:  0.6594079217953484 pole_length:  0.41919501169798035 step:  200 acc_reward:  78.1709756690576 violation_rate:  0.0
pole_mass:  0.2804910887489263 pole_length:  0.4898683741842365 step:  200 acc_reward:  159.96963773096252 violation_rate:  0.11
pole_mass:  0.6417247727399011 pole_length:  0.5988153935733902 step:  200 acc_reward:  84.07872984475182 violation_rate:  0.0
pole_mass:  0.5848481535843549 pole_length:  0.7061656926652515 step:  200 acc_reward:  41.67433505

pole_mass:  0.7474302260735208 pole_length:  0.44473655245733523 step:  200 acc_reward:  148.4300366759846 violation_rate:  0.125
pole_mass:  0.7000939640436392 pole_length:  0.6989913868207087 step:  200 acc_reward:  40.777164547205004 violation_rate:  0.19
pole_mass:  0.5413670496472931 pole_length:  0.35325914618395704 step:  200 acc_reward:  88.2517403452875 violation_rate:  0.04
pole_mass:  0.4798207189394212 pole_length:  0.47824296421907747 step:  200 acc_reward:  87.6000483398073 violation_rate:  0.01
pole_mass:  0.2508457105397381 pole_length:  0.278113960503952 step:  200 acc_reward:  88.86590994822316 violation_rate:  0.015
pole_mass:  0.20385666304830402 pole_length:  0.21682103244647932 step:  63 acc_reward:  7.064291551693475 violation_rate:  0.015873015873015872
pole_mass:  0.282279657548258 pole_length:  0.28012198999836724 step:  200 acc_reward:  73.35170228623063 violation_rate:  0.04
pole_mass:  0.38619408378191167 pole_length:  0.7317927658249497 step:  200 acc_rewa

pole_mass:  0.30508389898524424 pole_length:  0.7045367833208667 step:  200 acc_reward:  128.90567153934458 violation_rate:  0.025
pole_mass:  0.43692434996799173 pole_length:  0.36135925030521193 step:  200 acc_reward:  134.14939201393707 violation_rate:  0.255
pole_mass:  0.5738523527603065 pole_length:  0.3089002851580625 step:  200 acc_reward:  105.83847697570978 violation_rate:  0.07
pole_mass:  0.45311449269489473 pole_length:  0.78418445133733 step:  200 acc_reward:  142.7674911588145 violation_rate:  0.13
pole_mass:  0.46908550985939573 pole_length:  0.4641925277000066 step:  200 acc_reward:  47.279441960280515 violation_rate:  0.0
pole_mass:  0.20494751062682887 pole_length:  0.7877513487398655 step:  200 acc_reward:  164.0435163928172 violation_rate:  0.045
pole_mass:  0.20321292168086663 pole_length:  0.5348023492533192 step:  200 acc_reward:  161.59921853946474 violation_rate:  0.37
pole_mass:  0.4520940350389533 pole_length:  0.37269967789390546 step:  200 acc_reward:  133

pole_mass:  0.6029155283238954 pole_length:  0.7722967875266495 step:  200 acc_reward:  61.967077600045926 violation_rate:  0.0
pole_mass:  0.34221429174467993 pole_length:  0.5613684071325417 step:  200 acc_reward:  162.36126590099474 violation_rate:  0.02
pole_mass:  0.4973523131281714 pole_length:  0.45662349100219674 step:  200 acc_reward:  151.63553773190273 violation_rate:  0.22
pole_mass:  0.21967017431971225 pole_length:  0.2782494907811556 step:  200 acc_reward:  95.12253065125175 violation_rate:  0.0
pole_mass:  0.24671144556804753 pole_length:  0.5004253108568328 step:  200 acc_reward:  128.48445567339058 violation_rate:  0.23
pole_mass:  0.47487682648679225 pole_length:  0.43973887570311443 step:  200 acc_reward:  74.71456233674343 violation_rate:  0.01
pole_mass:  0.5371754940848892 pole_length:  0.6639859089110451 step:  200 acc_reward:  160.31458324347358 violation_rate:  0.115
pole_mass:  0.43744110781960993 pole_length:  0.7269899831785911 step:  200 acc_reward:  117.2

In [7]:
# Persist the trained weights under the same timestamp as the adaptation log.
model_path = './misc/log/robust_model_latent-' + log_name.strftime("%d-%H-%M") + '.pth'
torch.save(model.model.state_dict(), model_path)

In [8]:
# Show where the adaptation log was written; `path` is assigned by the
# training cell, just before its np.save call.
print(path)

./misc/log/np_adaptation10-09-22.npy


In [8]:
config = load_config('config/config_swingup_robust.yml')
mpc_config = config['mpc_config']
mpc_controller = MPC(mpc_config=mpc_config)
"""testing the model with MPC while training """
test_episode = 1
test_epoch = 10
log = []
m_p_list = [0.3, 0.7]
l_list = [0.3, 0.7]

# Build the planner's environment once. Previously prepare_dynamics() ran on
# every control step, which gym.make()s every configured environment each
# time and never closes any of them. The single instance is re-parameterised
# and reset before each planning call instead.
env_copy = prepare_dynamics(gym_config)[0]
try:
    for ep in range(test_epoch):
        for task_idx in range(1):
            task_steps = 0
            env = task[0]
            # Pick mass and length independently from the two candidate values.
            m_p = m_p_list[np.random.randint(2)]
            l = l_list[np.random.randint(2)]
            env.unwrapped.m_p = m_p
            env.unwrapped.l = l
            for epi in range(test_episode):
                obs = env.reset()
                O, A, R, acc_reward, done, V = [], [], [], 0, False, []
                mpc_controller.reset()
                i = 0
                while not done:
                    i += 1
                    # Give the planner a freshly reset env with the same
                    # physical parameters as the real one.
                    env_copy.unwrapped.m_p = m_p
                    env_copy.unwrapped.l = l
                    env_copy.reset()
                    if task_steps > 0:
                        action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                    else:
                        # Very first step of the task: apply a zero action.
                        action = np.array([0.0])
                    # The 4th value returned by step() is accumulated as a
                    # numeric violation indicator (averaged below).
                    obs_next, reward, done, violation = env.step(action)
                    task_steps += 1
                    A.append(action)
                    O.append(obs_next)
                    R.append(reward)
                    V.append(violation)

                    model.data_process([0, obs, action, obs_next - obs])
                    obs = obs_next
                    acc_reward += reward

                violation_rate = sum(V) / len(V)
                print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', violation_rate)
                env.close()

                # The loop above only exits once the episode is done, so the
                # summary is always recorded. It is kept in `log` only;
                # this cell writes nothing to disk.
                samples = {
                    "obs": np.array(O),
                    "actions": np.array(A),
                    "rewards": np.array(R),
                    "reward_sum": acc_reward,
                    "violation_rate": violation_rate
                }
                log.append(samples)

            # model.reset() only; model.train() is not called in this cell.
            model.reset()
finally:
    # Release the planner's environment even if the run is interrupted.
    env_copy.close()

pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  155.37174407349997 violation_rate:  0.035
pole_mass:  0.3 pole_length:  0.3 step:  200 acc_reward:  152.98799105235113 violation_rate:  0.09
pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  111.36387528293744 violation_rate:  0.0
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  96.15357172916583 violation_rate:  0.285
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  155.69317472118897 violation_rate:  0.095
pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  158.11743615910416 violation_rate:  0.035
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  137.99629515002746 violation_rate:  0.235
pole_mass:  0.7 pole_length:  0.3 step:  200 acc_reward:  97.34392621617955 violation_rate:  0.0
pole_mass:  0.3 pole_length:  0.7 step:  200 acc_reward:  123.25752336064429 violation_rate:  0.22
pole_mass:  0.7 pole_length:  0.7 step:  200 acc_reward:  173.97169360055324 violation_rate:  0.0


In [9]:
# only_prior_model
test_episode = 1
test_epoch = 10
log = []
m_p_list = [0.25, 0.4, 0.6, 0.75]
l_list = [0.25, 0.4, 0.6, 0.75]

# Build the planner's environment once. Previously prepare_dynamics() ran on
# every control step, which gym.make()s every configured environment each
# time and never closes any of them. The single instance is re-parameterised
# and reset before each planning call instead.
env_copy = prepare_dynamics(gym_config)[0]
try:
    # Full sweep: every pole length combined with every pole mass.
    for l in l_list:
        for m_p in m_p_list:
            task_steps = 0
            env = task[0]
            env.unwrapped.m_p = m_p
            env.unwrapped.l = l
            for epi in range(test_episode):
                obs = env.reset()
                O, A, R, acc_reward, done, V = [], [], [], 0, False, []
                mpc_controller.reset()
                i = 0
                while not done:
                    i += 1
                    # Give the planner a freshly reset env with the same
                    # physical parameters as the real one.
                    env_copy.unwrapped.m_p = m_p
                    env_copy.unwrapped.l = l
                    env_copy.reset()
                    if task_steps > 0:
                        action = np.array([mpc_controller.act(task=env_copy, model=model, state=obs, ground_truth=True)])
                    else:
                        # Very first step of the configuration: zero action.
                        action = np.array([0.0])
                    # The 4th value returned by step() is accumulated as a
                    # numeric violation indicator (averaged below).
                    obs_next, reward, done, violation = env.step(action)
                    task_steps += 1
                    A.append(action)
                    O.append(obs_next)
                    R.append(reward)
                    V.append(violation)

                    model.data_process([0, obs, action, obs_next - obs])
                    obs = obs_next
                    acc_reward += reward

                violation_rate = sum(V) / len(V)
                print('pole_mass: ', m_p, 'pole_length: ', l, 'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', violation_rate)
                env.close()

                # The loop above only exits once the episode is done, so the
                # summary is always recorded. It is kept in `log` only;
                # this cell writes nothing to disk.
                samples = {
                    "obs": np.array(O),
                    "actions": np.array(A),
                    "rewards": np.array(R),
                    "reward_sum": acc_reward,
                    "violation_rate": violation_rate
                }
                log.append(samples)

            # model.reset() only; model.train() is not called in this cell.
            model.reset()
finally:
    # Release the planner's environment even if the run is interrupted.
    env_copy.close()

pole_mass:  0.25 pole_length:  0.25 step:  200 acc_reward:  21.051632777875387 violation_rate:  0.09
pole_mass:  0.4 pole_length:  0.25 step:  200 acc_reward:  36.31640757246856 violation_rate:  0.08
pole_mass:  0.6 pole_length:  0.25 step:  200 acc_reward:  35.74430377746248 violation_rate:  0.13
pole_mass:  0.75 pole_length:  0.25 step:  200 acc_reward:  51.05605037326064 violation_rate:  0.065
pole_mass:  0.25 pole_length:  0.4 step:  200 acc_reward:  155.9412494983333 violation_rate:  0.015
pole_mass:  0.4 pole_length:  0.4 step:  200 acc_reward:  87.03557698739318 violation_rate:  0.015
pole_mass:  0.6 pole_length:  0.4 step:  200 acc_reward:  129.75732806755954 violation_rate:  0.075
pole_mass:  0.75 pole_length:  0.4 step:  200 acc_reward:  163.10355475050773 violation_rate:  0.025
pole_mass:  0.25 pole_length:  0.6 step:  200 acc_reward:  173.775165547779 violation_rate:  0.01
pole_mass:  0.4 pole_length:  0.6 step:  200 acc_reward:  173.2729320840893 violation_rate:  0.005
pol

In [14]:
import random

# Scratch check: draw two distinct elements from a three-item list.
a = list(range(3))
s = random.sample(a, k=2)
s

[0, 1]

In [10]:
from collections import deque
import random

# Scratch check: random.sample works on a bounded deque of tuples.
xx, yy = [1, 2, 3], [4, 5, 6]
a = deque([(xx, yy), (yy, xx)], maxlen=100)
s = random.sample(a, k=1)
s

[([4, 5, 6], [1, 2, 3])]