In [1]:
import os
# Fix GPU enumeration to PCI bus order and expose only GPU 0 to this process.
# Both variables are set before `import torch` so they are in place when
# torch initialises CUDA.
# https://discuss.pytorch.org/t/how-to-change-the-default-device-of-gpu-device-ids-0/1041/24
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]=f"{0}"

import torch
if torch.cuda.is_available():
    print(f"current_device:{torch.cuda.current_device()}")
    print(f"device_count:{torch.cuda.device_count()}")
    device = torch.device("cuda:0")
else:
    # torch.cuda.current_device() raises when no CUDA device is usable, which
    # would abort the whole setup cell. Fall back to the CPU instead.
    # NOTE(review): whether the imported PPO code can run on CPU is not
    # visible from this notebook - confirm before relying on the fallback.
    print("CUDA is not available; falling back to CPU")
    device = torch.device("cpu")


import sys
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from ddpg import DDPGagent
from ppo.PPO_continuous import PPO,Memory
# from ddpg.utils import NormalizedEnv
from flexipod_env import FlexipodEnv

# Build the Flexipod environment with dof=12. The NormalizedEnv wrapper from
# ddpg.utils is not applied.
env = FlexipodEnv(dof=12)

# ------------------------------ Hyperparameters ------------------------------
# Run identity and display
# (alternative kept for reference: env_name = "BipedalWalker-v3")
env_name = "flexipod"
render = True

# Stopping criterion and logging cadence
# training stops once the average reward exceeds solved_reward
solved_reward = 1500
# number of episodes between printed/logged averages (alternative: 2)
log_interval = 80

# Training budget
max_episodes, max_timesteps = 20000, 1500   # episodes in total, steps per episode
# the policy is updated every update_timestep steps (alternative: 4000)
update_timestep = 3000

# PPO settings
# constant std of the Multivariate Normal action distribution (alternative: 1.0)
action_std = 0.8
K_epochs = 80       # policy update epochs per update
eps_clip = 0.2      # PPO clip parameter
gamma = 0.99        # discount factor

# Adam optimizer settings
lr = 2e-4
betas = (0.9, 0.999)

# None leaves all random number generators unseeded
random_seed = None
# -----------------------------------------------------------------------------
# The environment is the FlexipodEnv instance created earlier, not
# gym.make(env_name). Read the leading dimension of its observation and
# action spaces; these size the PPO networks.
state_dim, action_dim = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

current_device:0
device_count:1


In [5]:
from torch.utils.tensorboard import SummaryWriter
# Write TensorBoard event files to a run-specific directory instead of the
# default "runs" location.
writer = SummaryWriter(log_dir='runs/soft12dof_experiment_16')

In [6]:
# env = FlexipodEnv(dof = 12)
# self = env
# msg_rec,_,_,_ = env.step()

In [None]:
# Seed torch, the environment and numpy when a seed is configured.
# The check is against None explicitly so that a seed of 0 is honoured
# rather than being treated as "no seed".
if random_seed is not None:
    print("Random Seed: {}".format(random_seed))
    torch.manual_seed(random_seed)
    env.seed(random_seed)
    np.random.seed(random_seed)

# Fresh transition buffer and PPO agent built from the hyperparameters.
memory = Memory()
ppo = PPO(
    state_dim, action_dim, action_std,
    lr, betas, gamma, K_epochs, eps_clip,
)
print(lr,betas)

# Logging accumulators, all starting at zero.
running_reward = avg_length = max_avg_length = time_step = 0

# Optional warm start (disabled): resume from a saved checkpoint and carry
# over its stored avg_length as the best-so-far value.
# checkpoint = ppo.load(f'./PPO_continuous_{env_name}_best.pth')
# checkpoint = ppo.load(f'./PPO_continuous_{env_name}.pth')
# max_avg_length = checkpoint["avg_length"]

# Training loop.
# Each episode steps the environment with actions from ppo.select_action for
# at most max_timesteps steps. Rewards and terminal flags accumulate in
# `memory` across episodes, and the policy is updated every update_timestep
# environment steps.
for i_episode in range(0, max_episodes+1):
    state = env.reset()
    for t in range(max_timesteps):
        time_step += 1
        # Running policy_old:
        action = ppo.select_action(state, memory)
        state, reward, done, _ = env.step(action)

        # Saving reward and is_terminals:
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

        # Update once update_timestep steps have been collected, then start a
        # fresh collection window.
        if time_step % update_timestep == 0:
            ppo.update(memory)
            memory.clear_memory()
            time_step = 0
        running_reward += reward
        if render:
            env.render()
        if done:
            break

    # t is the 0-based index of the last step taken in this episode.
    avg_length += t

    # Save every 500 episodes. At this point avg_length is still a running
    # sum over the episodes since the last log, so normalise it by that count.
    # This keeps the stored value a per-episode average, like the one written
    # to the "best" and "solved" checkpoints.
    if i_episode % 500 == 0:
        episodes_in_window = (i_episode % log_interval) or log_interval
        ppo.save(f'./PPO_continuous_{env_name}.pth',avg_length=avg_length/episodes_in_window)

    # logging
    if i_episode % log_interval == 0:
        # Turn the running sums into per-episode averages for this window.
        # At i_episode == 0 only one episode has been accumulated but it is
        # still divided by log_interval, so the first point is scaled down.
        avg_length = avg_length/log_interval
        running_reward = running_reward/log_interval
        writer.add_scalar("avg_length/train", avg_length, i_episode)
        writer.add_scalar("running_reward/train", running_reward, i_episode)

        # Stop training if avg_reward > solved_reward. running_reward is
        # already an average here, so it is compared with solved_reward
        # directly and not with log_interval*solved_reward.
        if running_reward > solved_reward:
            print("########## Solved! ##########")
            ppo.save(f'./PPO_continuous_solved_{env_name}.pth',avg_length=avg_length)
            break

        if avg_length>max_avg_length:
            # New best average episode length: remember it and checkpoint.
            max_avg_length = avg_length
            ppo.save(f'./PPO_continuous_{env_name}_best.pth',avg_length=avg_length)
        elif np.random.random()<0.1:  # 10% chance
            # No improvement in this window: occasionally restart from the
            # best checkpoint saved so far.
            checkpoint = ppo.load(f'./PPO_continuous_{env_name}_best.pth')
            print(f"load old best,avg_length={checkpoint['avg_length']}")# restart

        print(f'Episode {i_episode} \t Avg length: {avg_length:.0f} \t Avg reward: {running_reward:.0f}')
        # Reset the accumulators for the next logging window.
        running_reward = 0
        avg_length = 0

env.pause()

0.0002 (0.9, 0.999)
Episode 0 	 Avg length: 1 	 Avg reward: 1
Episode 80 	 Avg length: 86 	 Avg reward: 108
Episode 160 	 Avg length: 121 	 Avg reward: 152




Episode 240 	 Avg length: 133 	 Avg reward: 167
Episode 320 	 Avg length: 148 	 Avg reward: 185
Episode 400 	 Avg length: 151 	 Avg reward: 189
Episode 480 	 Avg length: 161 	 Avg reward: 202
Episode 560 	 Avg length: 167 	 Avg reward: 210
Episode 640 	 Avg length: 183 	 Avg reward: 231
Episode 720 	 Avg length: 177 	 Avg reward: 223
Episode 800 	 Avg length: 186 	 Avg reward: 235
Episode 880 	 Avg length: 194 	 Avg reward: 245
Episode 960 	 Avg length: 195 	 Avg reward: 247
Episode 1040 	 Avg length: 198 	 Avg reward: 251
Episode 1120 	 Avg length: 209 	 Avg reward: 265
Episode 1200 	 Avg length: 212 	 Avg reward: 269
Episode 1280 	 Avg length: 201 	 Avg reward: 255
Episode 1360 	 Avg length: 196 	 Avg reward: 249
Episode 1440 	 Avg length: 216 	 Avg reward: 275
Episode 1520 	 Avg length: 217 	 Avg reward: 276
Episode 1600 	 Avg length: 217 	 Avg reward: 276
Episode 1680 	 Avg length: 218 	 Avg reward: 278
Episode 1760 	 Avg length: 211 	 Avg reward: 268
Episode 1840 	 Avg length: 214

In [13]:
# writer.add_scalar("baseline_length/train", 200, 0)
# writer.add_scalar("baseline_length/train", 200, log_interval)

In [4]:
# Evaluation: rebuild the agent with a smaller action std and a longer step
# budget, load the best checkpoint, and roll it out for 8 episodes.
# Note that this rebinds the notebook-level max_timesteps, action_std and ppo.
memory_2 = Memory()

max_timesteps=3000
action_std=.1
ppo = PPO(
    state_dim, action_dim, action_std,
    lr, betas, gamma, K_epochs, eps_clip,
)
ppo.load(f'./PPO_continuous_{env_name}_best.pth')

time_steps, durations = [], []
for episode_idx in range(8):
    state = env.reset()
    for step in range(max_timesteps):
        # Running policy_old:
        action = ppo.select_action(state, memory_2)
        state, reward, done, info = env.step(action)
        # Keep stepping until the episode ends or the step budget runs out.
        if not done and step != max_timesteps-1:
            continue
        time_steps.append(step)
        # Elapsed time is the env-reported timestamp minus the episode start time.
        elapsed = info['t'] - env.episode_start_time
        durations.append(elapsed)
        print(f"time steps:{step} ,duration: {elapsed:.2f}[s], fps:{step/elapsed:.0f}")
        break

print(f"mean time steps:{np.mean(time_steps):.0f} ,duration: {np.mean(durations):.2f}[s]")

time steps:606 ,duration: 6.43[s], fps:94
time steps:756 ,duration: 7.22[s], fps:105
time steps:435 ,duration: 4.03[s], fps:108
time steps:637 ,duration: 5.82[s], fps:109
time steps:396 ,duration: 3.53[s], fps:112
time steps:384 ,duration: 3.39[s], fps:113
time steps:608 ,duration: 5.31[s], fps:115
time steps:518 ,duration: 4.53[s], fps:114
mean time steps:542 ,duration: 5.03[s]


In [3]:
# Line-profile action selection with the line_profiler extension.
# NOTE(review): this cell relies on `env`, `Memory` and `ppo` already existing
# in the kernel. The recorded output below shows it was run when `ppo` was not
# defined, so %lprun could not resolve 'ppo.policy_old.act'.
%load_ext line_profiler
state = env.reset()
memory_2 = Memory()
def test():
    """Call ppo.select_action 100 times with the fixed `state` and `memory_2`."""
    for k in range(100):
        ppo.select_action(state, memory_2)
    
# Profile ppo.policy_old.act line by line while running test().
%lprun -f ppo.policy_old.act test()
# %lprun -f ppo.select_action test()



UsageError: Could not find function 'ppo.policy_old.act'.
NameError: name 'ppo' is not defined


In [3]:
# Display the installed PyTorch version for reproducibility.
torch.__version__

'1.7.0'