In [2]:
import os
# Select which physical GPU this kernel may use. Both variables are set before
# torch is imported so that they are already in place when CUDA is initialised.
# https://discuss.pytorch.org/t/how-to-change-the-default-device-of-gpu-device-ids-0/1041/24
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import torch
if torch.cuda.is_available():
    print(f"current_device:{torch.cuda.current_device()}")
    print(f"device_count:{torch.cuda.device_count()}")
    device = torch.device("cuda")
else:
    # With CUDA_VISIBLE_DEVICES pinned to index 1, a host with fewer than two
    # GPUs exposes no device at all; torch.cuda.current_device() would raise
    # here, so report the situation and fall back to the CPU instead.
    print(f"device_count:{torch.cuda.device_count()}")
    print("CUDA is not available; falling back to CPU")
    device = torch.device("cpu")


import sys
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from ddpg import DDPGagent
from ppo.PPO_continuous import PPO,Memory
# from ddpg.utils import NormalizedEnv
from flexipod_env import FlexipodEnv

# ------------------------------ Environment ------------------------------
# The Flexipod environment is constructed directly with 12 degrees of freedom
# (gym.make is not used for it).
env = FlexipodEnv(dof=12)
# env = NormalizedEnv(env)   # optional wrapper, currently disabled
# env = gym.make(env_name)   # alternative for registered gym environments

# Sizes of the observation and action vectors, read from the env's spaces.
state_dim, action_dim = env.observation_space.shape[0], env.action_space.shape[0]

# ---------------------------- Run configuration --------------------------
env_name = "flexipod"        # tag embedded in checkpoint file names (was "BipedalWalker-v3")
render = True                # call env.render() after every step
random_seed = None           # None leaves all RNGs unseeded

# ------------------------------- Schedule --------------------------------
max_episodes = 20_000        # max training episodes
max_timesteps = 1_500        # max timesteps in one episode
update_timestep = 3_000      # update policy every n timesteps (4000 also tried)
log_interval = 80            # print avg reward in the interval (2 also tried)
solved_reward = 1_500        # stop training if avg_reward > solved_reward

# ---------------------------- PPO parameters -----------------------------
action_std = 0.5             # constant std for action distribution (Multivariate Normal); 1.0 also tried
K_epochs = 80                # update policy for K epochs
eps_clip = 0.2               # clip parameter for PPO
gamma = 0.99                 # discount factor

# ----------------------------- Adam optimizer ----------------------------
lr = 2e-4
betas = (0.9, 0.999)

current_device:0
device_count:1


In [2]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard event files for this run go to a dedicated directory rather
# than the generic default ("runs").
writer = SummaryWriter(log_dir='runs/soft12dof_experiment_9')

In [3]:
# env = FlexipodEnv(dof = 12)
# self = env
# msg_rec,_,_,_ = env.step()

In [4]:
# Seed every RNG the run touches. Compare against None so that a seed of 0
# is honoured instead of being treated as "no seed".
if random_seed is not None:
    print("Random Seed: {}".format(random_seed))
    torch.manual_seed(random_seed)
    env.seed(random_seed)
    np.random.seed(random_seed)

memory = Memory()
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
print(lr,betas)

best_checkpoint_path = f'./PPO_continuous_{env_name}_best.pth'

# Resume from the best checkpoint when one exists; on a fresh run there is
# nothing to load and the best average length simply starts at 0.
max_avg_length = 0
if os.path.exists(best_checkpoint_path):
    checkpoint = ppo.load(best_checkpoint_path)
    max_avg_length = checkpoint["avg_length"]

# logging variables: both accumulate over the current logging window and are
# turned into per-episode averages every `log_interval` episodes
running_reward = 0
avg_length = 0
# environment steps collected since the last policy update
time_step = 0

# training loop; the try/finally makes sure env.pause() also runs when the
# cell is interrupted or an error escapes the loop
try:
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        for t in range(max_timesteps):
            time_step +=1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if its time
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            running_reward += reward
            if render:
                env.render()
            if done:
                break

        # NOTE: t is the zero-based index of the last step, so this is one less
        # than the number of steps actually taken; kept as-is so the value stays
        # comparable with the avg_length stored in existing checkpoints.
        avg_length += t

        # save every 500 episodes
        if i_episode % 500 == 0:
            # avg_length is still a running sum at this point, so normalise it by
            # the number of episodes accumulated in the current logging window.
            episodes_in_window = (i_episode - 1) % log_interval + 1
            ppo.save(f'./PPO_continuous_{env_name}.pth',avg_length=avg_length/episodes_in_window)

        # logging
        if i_episode % log_interval == 0:
            avg_length = avg_length/log_interval
            running_reward = running_reward/log_interval
            writer.add_scalar("avg_length/train", avg_length, i_episode)
            writer.add_scalar("running_reward/train", running_reward, i_episode)

            # stop training if avg_reward > solved_reward
            # (running_reward is already a per-episode average here, so it is
            # compared with solved_reward directly)
            if running_reward > solved_reward:
                print("########## Solved! ##########")
                ppo.save(f'./PPO_continuous_solved_{env_name}.pth',avg_length=avg_length)
                break

            if avg_length>max_avg_length:
                max_avg_length = avg_length
                ppo.save(best_checkpoint_path,avg_length=avg_length)
            elif np.random.random()<0.1 and os.path.exists(best_checkpoint_path):
                # 10% chance: roll back to the best checkpoint seen so far
                checkpoint = ppo.load(best_checkpoint_path)
                print(f"load old best,avg_length={checkpoint['avg_length']}")# restart
                # Transitions still in the buffer were collected by the policy
                # that was just discarded; drop them so the next update only
                # uses data gathered after the rollback.
                memory.clear_memory()
                time_step = 0

            print(f'Episode {i_episode} \t Avg length: {avg_length:.0f} \t Avg reward: {running_reward:.0f}')
            running_reward = 0
            avg_length = 0
finally:
    env.pause()

0.0002 (0.9, 0.999)




Episode 80 	 Avg length: 119 	 Avg reward: 150
Episode 160 	 Avg length: 141 	 Avg reward: 178
Episode 240 	 Avg length: 150 	 Avg reward: 189
Episode 320 	 Avg length: 176 	 Avg reward: 224
Episode 400 	 Avg length: 209 	 Avg reward: 266
Episode 480 	 Avg length: 247 	 Avg reward: 316
Episode 560 	 Avg length: 304 	 Avg reward: 390
Episode 640 	 Avg length: 311 	 Avg reward: 399
Episode 720 	 Avg length: 337 	 Avg reward: 434
Episode 800 	 Avg length: 347 	 Avg reward: 447
Episode 880 	 Avg length: 387 	 Avg reward: 500
Episode 960 	 Avg length: 429 	 Avg reward: 554
Episode 1040 	 Avg length: 365 	 Avg reward: 471
Episode 1120 	 Avg length: 376 	 Avg reward: 484
Episode 1200 	 Avg length: 434 	 Avg reward: 560
Episode 1280 	 Avg length: 449 	 Avg reward: 580
load old best,avg_length=448.5875
Episode 1360 	 Avg length: 365 	 Avg reward: 472
load old best,avg_length=448.5875
Episode 1440 	 Avg length: 372 	 Avg reward: 481
load old best,avg_length=448.5875
Episode 1520 	 Avg length: 33



Episode 3920 	 Avg length: 455 	 Avg reward: 589
Episode 4000 	 Avg length: 481 	 Avg reward: 622
Episode 4080 	 Avg length: 460 	 Avg reward: 595
Episode 4160 	 Avg length: 438 	 Avg reward: 564
Episode 4240 	 Avg length: 447 	 Avg reward: 573
Episode 4320 	 Avg length: 437 	 Avg reward: 562
load old best,avg_length=535.7625
Episode 4400 	 Avg length: 437 	 Avg reward: 562
load old best,avg_length=535.7625
Episode 4480 	 Avg length: 454 	 Avg reward: 586
Episode 4560 	 Avg length: 484 	 Avg reward: 626
Episode 4640 	 Avg length: 532 	 Avg reward: 687
Episode 4720 	 Avg length: 559 	 Avg reward: 721
Episode 4800 	 Avg length: 616 	 Avg reward: 796
Episode 4880 	 Avg length: 662 	 Avg reward: 858
Episode 4960 	 Avg length: 716 	 Avg reward: 928
Episode 5040 	 Avg length: 593 	 Avg reward: 768
Episode 5120 	 Avg length: 564 	 Avg reward: 730
Episode 5200 	 Avg length: 491 	 Avg reward: 636
Episode 5280 	 Avg length: 448 	 Avg reward: 580
Episode 5360 	 Avg length: 462 	 Avg reward: 597
E

KeyboardInterrupt: 

In [13]:
# writer.add_scalar("baseline_length/train", 200, 0)
# writer.add_scalar("baseline_length/train", 200, log_interval)

In [3]:
# Evaluate the best saved checkpoint: run 20 episodes with a much smaller
# action std than in training and report the length of each episode.
memory_2 = Memory()

# NOTE: these assignments rebind the notebook-level training hyperparameters
# (and `ppo` below replaces the training agent); re-run the configuration
# cell before training again in the same kernel.
max_timesteps=3000
action_std=.01
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
ppo.load(f'./PPO_continuous_{env_name}_best.pth')

time_steps = []
durations = []
for k in range(20):
    state = env.reset()
    for t in range(max_timesteps):
        # Running policy_old:
        action = ppo.select_action(state, memory_2)
        state, reward, done, info = env.step(action)
#         state, reward, done, _ = env.step()
        if done or t==max_timesteps-1:
            # t is the zero-based index of the final step of the episode
            time_steps.append(t)
            episode_duration= info['t'] - env.episode_start_time
            durations.append(episode_duration)
            # guard the rate against a zero (or negative) elapsed time
            fps = t/episode_duration if episode_duration > 0 else float("nan")
            print(f"time steps:{t} ,duration: {episode_duration:.2f}[s], fps:{fps:.0f}")
            break
    # No policy update happens during evaluation, so whatever select_action
    # recorded in the buffer is never consumed; clear it after every episode
    # instead of letting it grow over all 20 runs.
    memory_2.clear_memory()

print(f"mean time steps:{np.mean(time_steps):.0f} ,duration: {np.mean(durations):.2f}[s]")

time steps:1192 ,duration: 16.46[s], fps:72
time steps:2999 ,duration: 39.21[s], fps:76
time steps:2999 ,duration: 39.64[s], fps:76
time steps:2999 ,duration: 40.30[s], fps:74
time steps:1282 ,duration: 17.68[s], fps:73
time steps:1695 ,duration: 22.17[s], fps:76
time steps:2189 ,duration: 28.23[s], fps:78
time steps:1148 ,duration: 14.67[s], fps:78
time steps:1183 ,duration: 15.24[s], fps:78
time steps:1767 ,duration: 22.91[s], fps:77
time steps:1449 ,duration: 18.85[s], fps:77
time steps:1689 ,duration: 22.20[s], fps:76
time steps:231 ,duration: 3.27[s], fps:71
time steps:977 ,duration: 12.64[s], fps:77
time steps:2014 ,duration: 26.07[s], fps:77
time steps:432 ,duration: 5.60[s], fps:77
time steps:1237 ,duration: 16.18[s], fps:76
time steps:1076 ,duration: 13.95[s], fps:77
time steps:492 ,duration: 6.59[s], fps:75




time steps:1781 ,duration: 23.10[s], fps:77
mean time steps:1542 ,duration: 20.25[s]
