In [1]:
'''
@Author:
@Email: 
@Date: 2020-04-01 01:26:48
@LastEditTime: 2020-04-16 00:15:58
@Description: 
'''

import time, datetime
import copy
import os
import sys
import warnings
#warnings.filterwarnings("ignore", category=UserWarning)
#warnings.filterwarnings("ignore", category=RuntimeWarning)

import numpy as np
from loguru import logger
import yaml

import gym
sys.path.append('./envs/cartpole-envs')
sys.path.append('./envs/highway-env')
import cartpole_envs
#import highway_env

from utils import plot_reward, plot_index, dumb_reward_plot
from mpc.mpc_cp import MPC

# all models
# from dpgpmm.DPGPMM import DPGPMM
from baselines.SingleGP import SingleGP
# from baselines.SingleSparseGP import SingleSparseGP
# from baselines.NN import NN


def prepare_dynamics(gym_config):
    dynamics_name = gym_config['dynamics_name']
    seed = gym_config['seed']
    dynamics_set = []
    for i in range(len(dynamics_name)):
        env = gym.make(dynamics_name[i])
        # env.seed(seed)
        dynamics_set.append(gym.make(dynamics_name[i]))
    
    # use pre-defined env sequence
    task = [dynamics_set[i] for i in gym_config['task_dynamics_list']]
    return task


def load_config(config_path="config.yml"):
    if os.path.isfile(config_path):
        f = open(config_path)
        return yaml.load(f, Loader=yaml.FullLoader)
    else:
        raise Exception("Configuration file is not found in the path: "+config_path)



In [2]:

# dynamic model configuration
# config = load_config('config_DPGP_MBRL.yml')
config = load_config('./config/config_swingup.yml')
dpgp_config = config['DPGP_config']
gp_config = config['SingleGP_config']
sparse_gp_config = config['SingleSparseGP_config']
nn_config = config['NN_config']
mpc_config = config['mpc_config']
gym_config = config['gym_config']
render = gym_config['render']

# initialize the mixture model
# model = DPGPMM(dpgp_config=dpgp_config)
# model = SingleSparseGP(sparse_gp_config=sparse_gp_config)
model = SingleGP(gp_config=gp_config)
# model = NN(NN_config=nn_config)
logger.info('Using model: {}', model.name)

# initial MPC controller
mpc_controller = MPC(mpc_config=mpc_config)

# prepare task
# the task is solved, if each dynamic is solved
task = prepare_dynamics(gym_config)

"""start DPGP-MBRL"""
data_buffer = []
label_list = []
subtask_list = []
subtask_reward = []
subtask_succ_count = [0]
comp_trainable = [1]
task_reward = []
trainable = True
task_solved = False
subtask_solved = [False, False, False, False]
total_count = 0
task_epi = 0
log = []
log_name = None

# if model.name == 'NN':
#     pretrain_episodes = 10
#     print('pretrain~~~~~~~~~~~~~~~~~~~~~~~')
#     for task_idx in range(len(task)):
#         env = task[task_idx]
#         # data collection
#         for epi in range(pretrain_episodes):
#             obs = env.reset()
#             done = False
#             mpc_controller.reset()
#             while not done:
#                 action = env.action_space.sample()
#                 obs_next, reward, done, state_next = env.step(action)
#                 model.data_process([0, obs, action, obs_next - obs])
#                 obs = copy.deepcopy(obs_next)


2020-07-02 15:14:09.848 | INFO     | __main__:<module>:17 - Using model: SingleGP


In [4]:
while task_epi < 30:
    task_epi += 1
    time_task_0 = time.time()
    if total_count == 0:
        # for the first step, add one data pair with random policy as initialization
        state = task[0].reset()
        action = task[0].action_space.sample()
        state_next, reward, done, info = task[0].step(action)
        model.fit(data=[0, state, action, state_next-state])
        label_list.append(0)

    # for other steps, run DPGP MBRL
    # Different sub-tasks share the same action space
    # Note that the subtask_index is unknown to the model, it's for debugging
    task_r = 0
    for subtask_index in range(len(task)):
        m_p = np.random.uniform(0.3,0.7)
        l = np.random.uniform(0.3,0.7)
#         l = np.random.uniform(0.2,1.0)
        task[subtask_index].unwrapped.m_p = m_p
        task[subtask_index].unwrapped.l = l
        
        for epi in range(1): # each subtask contains a fixed number of episode
            O, A, R, acc_reward, done, V = [], [], [], 0, False, []

            print('subtask: ', subtask_index, ', epi: ', epi)
            time_subtask_0 = time.time()

            state = task[subtask_index].reset()
            O.append(state)
            # reset the controller at the beginning of each new dynamic
            mpc_controller.reset()

            while not done:
                if render:
                    task[subtask_index].render()

                total_count += 1
                label_list.append(subtask_index)

                # MPC policy
                start_1 = time.time()
                action = np.array([mpc_controller.act(task=task[subtask_index], model=model, state=state)])
                start_2 = time.time()

                # Random Policy
                # action = task[subtask_index].action_space.sample()

                # interact with env
                state_next, reward, done, violation = task[subtask_index].step(action)
                acc_reward += reward

#                 print('action ', action)
#                 print('reward: %.4f' % reward)

                A.append(action)
                O.append(state_next)
                R.append(reward)

                # logger.info('acc_reward : {}', acc_reward)
                start_3 = time.time()

                # train the model
                # when reach some kind of metric, stop training, only inference

                model.fit(data=[subtask_index, state, action, state_next - state])

                state = copy.deepcopy(state_next)
                start_4 = time.time()

                # print('mpc: {}, env: {}, model: {}'.format(start_2-start_1, start_3-start_2, start_4-start_3))
            print('pole_mass: ', m_p, 'pole_length: ', l,'step: ', i, 'acc_reward: ', acc_reward, 'violation_rate: ', sum(V)/len(V))
            env.close()

            if done:
                samples = {
                    "obs": np.array(O),
                    "actions": np.array(A),
                    "rewards": np.array(R),
                    "reward_sum": acc_reward,
                    "violations": np.array(V)
                }
                log.append(samples)
                if log_name is None:
                    log_name = datetime.datetime.now()
                path = './misc/log/gp_robust_' + log_name.strftime("%d-%H-%M") + '.npy'
                np.save(path, log, allow_pickle=True)
                dumb_reward_plot(path)
#                 if done:
#                     samples = {
#                         "obs": np.array(O),
#                         "actions": np.array(A),
#                         "rewards": np.array(R),
#                         "reward_sum": acc_reward,
#                     }
#                     log.append(samples)
#                     if log_name is None:
#                         log_name = datetime.datetime.now()
#                     path = './misc/log/CartPole-GP-'+ log_name.strftime("%d-%H-%M") + '.npy'
#                     np.save(path, log, allow_pickle=True)
#                     dumb_reward_plot(path)

#                     print('-------------------------------------------------')
#                     print('pole_mass', m_p, 'pole_length', l, 'Episode finished, time: ', time.time()-time_subtask_0, ' with acc_reward: ', acc_reward,
#                           ' with final reward: ', reward)
#                     print('-------------------------------------------------')
#                     subtask_list.append(subtask_index)
#                     subtask_reward.append(acc_reward)
#                     task_r += acc_reward
#                     if not model.name == 'DPGPMM':
#                         if len(subtask_succ_count) < subtask_index + 1:
#                             subtask_succ_count.append(0)
#                     if acc_reward >= 170:
#                         subtask_solved[subtask_index] = True
#                         print('-------------------------------------------------')
#                         print('Episode finished: Success!!!!, time: ', time.time()-time_subtask_0)
#                         print('-------------------------------------------------')
#                         subtask_list.append(subtask_index)
#                         subtask_reward.append(acc_reward)
#                         task_r += acc_reward
#                         # record succ rate
#                         if model.name == 'DPGPMM':
#                             subtask_succ_count[model.DP_mix.assigns[len(model.DP_mix.data) - 1]] += 1
#                         else:
#                             if len(subtask_succ_count) < subtask_index + 1:
#                                 subtask_succ_count.append(1)
#                             else:
#                                 subtask_succ_count[subtask_index] += 1

#             if model.name == 'DPGPMM':
#                 print('subtask_succ_count: ', subtask_succ_count)
#                 # todo: check the training termination criterion right or not
#                 for i in range(len(subtask_succ_count)):
#                     if subtask_succ_count[i] >= 10:
#                         comp_trainable[i] = 0
#             else:
#                 print('subtask_succ_count: ', subtask_succ_count)
#                 # todo: check the training termination criterion right or not
#                 all_solve = 0
#                 for i in range(len(subtask_succ_count)):
#                     if subtask_succ_count[i] >= 10:
#                         all_solve += 1
#                 if all_solve == 4:
#                     trainable = False
#             if render:
#                 task[subtask_index].close()




subtask:  0 , epi:  0


NameError: name 'i' is not defined

In [7]:
print(len(task))

1
