In [1]:
import os

# Choose which GPU this process may see. These variables have to be set
# before torch is imported, because CUDA reads them when it initialises.
# https://discuss.pytorch.org/t/how-to-change-the-default-device-of-gpu-device-ids-0/1041/24
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # enumerate GPUs by PCI bus id
os.environ["CUDA_VISIBLE_DEVICES"] = "1"         # expose only GPU index 1 to this process

import torch

# Guard the CUDA queries: torch.cuda.current_device() raises when no CUDA
# device is visible (CPU-only machine, or a host without a GPU at index 1).
if torch.cuda.is_available():
    print(f"current_device:{torch.cuda.current_device()}")
    print(f"device_count:{torch.cuda.device_count()}")
    device = torch.device("cuda")
else:
    print("CUDA is not available; falling back to CPU")
    device = torch.device("cpu")


import sys
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from ddpg import DDPGagent
from ppo.PPO_continuous import PPO,Memory
# from ddpg.utils import NormalizedEnv
from flexipod_env import FlexipodEnv

# ----------------------------- environment -----------------------------
env = FlexipodEnv(dof=12)
# env = NormalizedEnv(env)
# env = gym.make(env_name)

# --------------------------- hyperparameters ---------------------------
# Identification / display
env_name = "flexipod"        # used in checkpoint file names (alt: "BipedalWalker-v3")
render = True                # call env.render() on every step

# Stopping and logging
solved_reward = 1500         # stop training if avg_reward > solved_reward
log_interval = 80            # episodes between printed/logged averages (alt: 2)

# Training budget
max_episodes = 20000         # upper bound on training episodes
max_timesteps = 1500         # upper bound on steps within one episode
update_timestep = 3000       # policy update every n timesteps (alt: 4000)

# PPO settings
action_std = 0.5             # constant std of the Multivariate Normal action distribution (alt: 1.0)
K_epochs = 80                # optimisation epochs per policy update
eps_clip = 0.2               # PPO clip parameter
gamma = 0.99                 # discount factor

# Adam optimizer
lr = 2e-4
betas = (0.9, 0.999)

# None leaves all random number generators unseeded
random_seed = None

# ------------------------ derived dimensions ---------------------------
# First axis of the observation / action spaces, in that order.
state_dim, action_dim = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

current_device:0
device_count:1


In [2]:
from torch.utils.tensorboard import SummaryWriter

# Write TensorBoard event files to a run-specific directory instead of the
# default "runs" folder, so that experiments stay separated.
writer = SummaryWriter(log_dir='runs/soft12dof_experiment_7')

In [3]:
# env = FlexipodEnv(dof = 12)
# self = env
# msg_rec,_,_,_ = env.step()

In [2]:
# PPO training cell: resume from the best checkpoint (when one exists), train,
# log averages to TensorBoard every `log_interval` episodes and checkpoint.

# Seed every RNG when a seed is configured. The test is `is not None` rather
# than truthiness so that a seed of 0 is honoured as well.
if random_seed is not None:
    print("Random Seed: {}".format(random_seed))
    torch.manual_seed(random_seed)
    env.seed(random_seed)
    np.random.seed(random_seed)

memory = Memory()
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
print(lr,betas)

# Resume from the best checkpoint if there is one; otherwise start fresh
# instead of failing on the very first run.
best_path = f'./PPO_continuous_{env_name}_best.pth'
if os.path.isfile(best_path):
    checkpoint = ppo.load(best_path)
    max_avg_length = checkpoint["avg_length"]
else:
    print(f"no checkpoint found at {best_path}; training from scratch")
    max_avg_length = 0

# logging variables (sums over the current log window until averaged below)
running_reward = 0
avg_length = 0
time_step = 0

# training loop; the try/finally guarantees env.pause() also runs when the
# cell is interrupted (e.g. KeyboardInterrupt) or an exception escapes.
try:
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        for t in range(max_timesteps):
            time_step +=1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if its time
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            running_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t

        # save every 500 episodes
        if i_episode % 500 == 0:
            # avg_length is still a running sum here, so divide by the number
            # of episodes accumulated since the last log reset to store a real
            # average in the checkpoint.
            episodes_in_window = (i_episode - 1) % log_interval + 1
            ppo.save(f'./PPO_continuous_{env_name}.pth',
                     avg_length=avg_length/episodes_in_window)

        # logging
        if i_episode % log_interval == 0:
            avg_length = avg_length/log_interval
            running_reward = running_reward/log_interval
            writer.add_scalar("avg_length/train", avg_length, i_episode)
            writer.add_scalar("running_reward/train", running_reward, i_episode)

            # stop training if avg_reward > solved_reward.
            # running_reward was already divided by log_interval above, so it
            # is compared with solved_reward directly.
            if running_reward > solved_reward:
                print("########## Solved! ##########")
                ppo.save(f'./PPO_continuous_solved_{env_name}.pth',avg_length=avg_length)
                break

            if avg_length>max_avg_length:
                max_avg_length = avg_length
                ppo.save(best_path,avg_length=avg_length)
            elif os.path.isfile(best_path) and np.random.random()<0.1:
                # 10% chance: discard the current weights and restart from
                # the best checkpoint saved so far.
                checkpoint = ppo.load(best_path)
                print(f"load old best,avg_length={checkpoint['avg_length']}")# restart

            print(f'Episode {i_episode} \t Avg length: {avg_length:.0f} \t Avg reward: {running_reward:.0f}')
            running_reward = 0
            avg_length = 0
finally:
    env.pause()

0.0002 (0.9, 0.999)


KeyboardInterrupt: 

In [13]:
# writer.add_scalar("baseline_length/train", 200, 0)
# writer.add_scalar("baseline_length/train", 200, log_interval)

In [5]:
# Evaluation cell: roll out the best saved policy for 20 episodes and report
# how long each episode lasted, in steps and in seconds.
memory_2 = Memory()  # separate buffer so the training `memory` is left untouched

# action_std=.5
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
ppo.load(f'./PPO_continuous_{env_name}_best.pth')

time_steps = []
durations = []
for k in range(20):
    state = env.reset()
    for t in range(max_timesteps):
        # Running policy_old:
        action = ppo.select_action(state, memory_2)
        state, reward, done, info = env.step(action)

        if done:
            time_steps.append(t)
            episode_duration= info['episode_duration']
            durations.append(episode_duration)
            print(f"time steps:{t} ,duration: {episode_duration:.2f}[s], fps:{t/episode_duration:.0f}")
            break
    else:
        # The step budget ran out without `done`; say so instead of silently
        # dropping the episode from the statistics.
        print(f"episode {k}: not terminated within {max_timesteps} steps; excluded from statistics")

# Guard the summary: np.mean([]) would emit a RuntimeWarning and print nan
# when no episode terminated.
if time_steps:
    print(f"mean time steps:{np.mean(time_steps)} ,duration: {np.mean(durations)}[s]")
else:
    print("no episode terminated; nothing to average")

time steps:296 ,duration: 3.52[s], fps:84
time steps:519 ,duration: 6.02[s], fps:86
time steps:279 ,duration: 3.27[s], fps:85
time steps:542 ,duration: 6.14[s], fps:88
time steps:311 ,duration: 3.55[s], fps:88
time steps:291 ,duration: 3.37[s], fps:86
time steps:456 ,duration: 5.12[s], fps:89
time steps:327 ,duration: 3.76[s], fps:87
time steps:180 ,duration: 2.14[s], fps:84
time steps:490 ,duration: 5.62[s], fps:87
time steps:329 ,duration: 3.82[s], fps:86
time steps:214 ,duration: 2.51[s], fps:85
time steps:250 ,duration: 2.95[s], fps:85
time steps:308 ,duration: 3.56[s], fps:87
time steps:371 ,duration: 4.16[s], fps:89
time steps:405 ,duration: 4.60[s], fps:88
time steps:213 ,duration: 2.37[s], fps:90
time steps:274 ,duration: 3.05[s], fps:90
time steps:412 ,duration: 4.80[s], fps:86
time steps:417 ,duration: 4.82[s], fps:87
mean time steps:344.2 ,duration: 3.9572517395019533[s]
