In [24]:
import os
# https://discuss.pytorch.org/t/how-to-change-the-default-device-of-gpu-device-ids-0/1041/24
# Select which physical GPU this kernel may see. These variables are read when
# CUDA is initialised, so they are set before torch is imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch

# Querying the current device raises when no CUDA device is visible (for
# example on a machine that has no GPU at index 1), so check availability
# first and fall back to the CPU instead of aborting the notebook.
if torch.cuda.is_available():
    print(f"current_device:{torch.cuda.current_device()}")
    print(f"device_count:{torch.cuda.device_count()}")
    device = torch.device("cuda")
else:
    print("CUDA is not available; falling back to CPU")
    device = torch.device("cpu")


import sys
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from ddpg import DDPGagent
from ppo.PPO_continuous import PPO,Memory
# from ddpg.utils import NormalizedEnv
from flexipod_env import FlexipodEnv

# Build the Flexipod environment with 12 degrees of freedom; the training and
# evaluation cells both step this single instance.
env = FlexipodEnv(dof=12)
# env = NormalizedEnv(env)

# ============================ Hyperparameters ============================
# --- run / logging control ---
# env_name = "BipedalWalker-v3"
env_name = "flexipod"        # tag embedded in the checkpoint file names
render = True                # call env.render() after every step
solved_reward = 300          # stop training if avg_reward > solved_reward
log_interval = 20            # episodes between log lines / TensorBoard points
max_episodes = 20_000        # upper bound on training episodes
max_timesteps = 1_500        # upper bound on steps within one episode

# --- PPO ---
update_timestep = 4_000      # run a policy update every n collected timesteps
action_std = 0.5             # constant std for action distribution (Multivariate Normal)
# action_std = 1.0
K_epochs = 80                # optimisation epochs per policy update
eps_clip = 0.2               # PPO clip parameter
gamma = 0.99                 # discount factor

# --- Adam optimizer ---
lr = 3e-4
betas = (0.9, 0.999)

# None leaves every RNG unseeded (the seeding cell is then a no-op).
random_seed = None
# =========================================================================
# Network input/output sizes: the first entry of the observation-space shape
# and of the action-space shape of the environment created above.
# env = gym.make(env_name)
state_dim, action_dim = env.observation_space.shape[0], env.action_space.shape[0]

current_device:0
device_count:1
Destructor called, FlexipodEnv deleted.


In [25]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard sink for this experiment. Without an explicit `log_dir` the
# writer would fall back to the generic "runs" directory.
writer = SummaryWriter(log_dir='runs/soft12dof_experiment_4')

In [26]:
# env = FlexipodEnv(dof = 12)
# self = env
# msg_rec,_,_,_ = env.step()

In [27]:
# Seed torch, the environment and NumPy when a seed has been configured.
# Compare against None rather than relying on truthiness so that a seed of 0
# is honoured instead of being silently skipped.
if random_seed is not None:
    print("Random Seed: {}".format(random_seed))
    torch.manual_seed(random_seed)
    env.seed(random_seed)
    np.random.seed(random_seed)

# ----------------------------------------------------------------------------
# PPO training run, warm-started from the previously saved "best" checkpoint.
# ----------------------------------------------------------------------------

# Buffer passed to ppo.select_action on every step and handed to ppo.update;
# rewards and terminal flags are appended to it directly in the loop below.
memory = Memory()
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
print(lr,betas)

# Warm start: load the checkpoint into the acting network ...
ppo.policy_old.load_state_dict(torch.load(f'./PPO_continuous_{env_name}_best.pth'))
# ... and mirror it into `ppo.policy`. Every checkpoint written below saves
# `ppo.policy`, so without this copy the first "best" save would overwrite the
# loaded file with weights that never saw the checkpoint.
# NOTE(review): PPO.update is not visible here; presumably it optimises
# `policy` and then syncs `policy_old` from it, in which case the loaded
# weights would also be discarded at the first update without this line.
ppo.policy.load_state_dict(ppo.policy_old.state_dict())

# logging variables
running_reward = 0      # reward summed over the episodes of the current log window
avg_length = 0          # sum of final step indices over the current log window
max_avg_length = 0      # best windowed average length seen so far in this run
time_step = 0           # steps collected since the last policy update

# training loop
for i_episode in range(1, max_episodes+1):
    state = env.reset()
    for t in range(max_timesteps):
        time_step += 1
        # Running policy_old:
        action = ppo.select_action(state, memory)
        state, reward, done, _ = env.step(action)

        # Saving reward and is_terminals:
        # NOTE(review): when the step budget runs out without `done`, the last
        # stored flag stays False; check how PPO.update treats that boundary.
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

        # update if its time
        if time_step % update_timestep == 0:
            ppo.update(memory)
            memory.clear_memory()
            time_step = 0
        running_reward += reward
        if render:
            env.render()
        if done:
            break

    # `t` is the zero-based index of the last step, i.e. one less than the
    # number of steps actually taken in this episode.
    avg_length += t

    # stop training if avg_reward > solved_reward
    # (running_reward is still a sum here, hence the log_interval factor)
    if running_reward > (log_interval*solved_reward):
        print("########## Solved! ##########")
        torch.save(ppo.policy.state_dict(), './PPO_continuous_solved_{}.pth'.format(env_name))
        break

    # save every 500 episodes
    if i_episode % 500 == 0:
        torch.save(ppo.policy.state_dict(), './PPO_continuous_{}.pth'.format(env_name))

    # logging
    if i_episode % log_interval == 0:
        # Turn the window sums into per-episode averages.
        avg_length = avg_length/log_interval
        running_reward = running_reward/log_interval
        writer.add_scalar("avg_length/train", avg_length, i_episode)
        writer.add_scalar("running_reward/train", running_reward, i_episode)
        # "Best" is judged on average episode length, not on reward.
        if avg_length > max_avg_length:
            max_avg_length = avg_length
            torch.save(ppo.policy.state_dict(), f'./PPO_continuous_{env_name}_best.pth')

        print(f'Episode {i_episode} \t Avg length: {avg_length:.0f} \t Avg reward: {running_reward:.0f}')
        # Start a fresh log window.
        running_reward = 0
        avg_length = 0

env.pause()

0.0003 (0.9, 0.999)
Episode 20 	 Avg length: 184 	 Avg reward: 236
Episode 40 	 Avg length: 164 	 Avg reward: 209
Episode 60 	 Avg length: 177 	 Avg reward: 226
Episode 80 	 Avg length: 159 	 Avg reward: 203
Episode 100 	 Avg length: 174 	 Avg reward: 221
Episode 120 	 Avg length: 184 	 Avg reward: 235
Episode 140 	 Avg length: 171 	 Avg reward: 219
Episode 160 	 Avg length: 188 	 Avg reward: 240
Episode 180 	 Avg length: 184 	 Avg reward: 234
Episode 200 	 Avg length: 184 	 Avg reward: 234
Episode 220 	 Avg length: 180 	 Avg reward: 230
Episode 240 	 Avg length: 183 	 Avg reward: 233
Episode 260 	 Avg length: 182 	 Avg reward: 232
Episode 280 	 Avg length: 194 	 Avg reward: 247
Episode 300 	 Avg length: 186 	 Avg reward: 239
Episode 320 	 Avg length: 186 	 Avg reward: 238
Episode 340 	 Avg length: 191 	 Avg reward: 243
Episode 360 	 Avg length: 200 	 Avg reward: 255
Episode 380 	 Avg length: 193 	 Avg reward: 248
Episode 400 	 Avg length: 191 	 Avg reward: 243
Episode 420 	 Avg length

Episode 3380 	 Avg length: 194 	 Avg reward: 243
Episode 3400 	 Avg length: 204 	 Avg reward: 254
Episode 3420 	 Avg length: 199 	 Avg reward: 248
Episode 3440 	 Avg length: 199 	 Avg reward: 247
Episode 3460 	 Avg length: 198 	 Avg reward: 247
Episode 3480 	 Avg length: 200 	 Avg reward: 248
Episode 3500 	 Avg length: 199 	 Avg reward: 248
Episode 3520 	 Avg length: 199 	 Avg reward: 249
Episode 3540 	 Avg length: 210 	 Avg reward: 260
Episode 3560 	 Avg length: 190 	 Avg reward: 238
Episode 3580 	 Avg length: 197 	 Avg reward: 245
Episode 3600 	 Avg length: 194 	 Avg reward: 242
Episode 3620 	 Avg length: 193 	 Avg reward: 240
Episode 3640 	 Avg length: 185 	 Avg reward: 231
Episode 3660 	 Avg length: 195 	 Avg reward: 243
Episode 3680 	 Avg length: 194 	 Avg reward: 242
Episode 3700 	 Avg length: 217 	 Avg reward: 271
Episode 3720 	 Avg length: 201 	 Avg reward: 252
Episode 3740 	 Avg length: 193 	 Avg reward: 242
Episode 3760 	 Avg length: 199 	 Avg reward: 249
Episode 3780 	 Avg l

Episode 6740 	 Avg length: 198 	 Avg reward: 250
Episode 6760 	 Avg length: 194 	 Avg reward: 242
Episode 6780 	 Avg length: 213 	 Avg reward: 267
Episode 6800 	 Avg length: 214 	 Avg reward: 270
Episode 6820 	 Avg length: 196 	 Avg reward: 247
Episode 6840 	 Avg length: 204 	 Avg reward: 257
Episode 6860 	 Avg length: 206 	 Avg reward: 259
Episode 6880 	 Avg length: 194 	 Avg reward: 244
Episode 6900 	 Avg length: 206 	 Avg reward: 259
Episode 6920 	 Avg length: 204 	 Avg reward: 256
Episode 6940 	 Avg length: 201 	 Avg reward: 255
Episode 6960 	 Avg length: 190 	 Avg reward: 241
Episode 6980 	 Avg length: 193 	 Avg reward: 245
Episode 7000 	 Avg length: 180 	 Avg reward: 226
Episode 7020 	 Avg length: 194 	 Avg reward: 244
Episode 7040 	 Avg length: 198 	 Avg reward: 248
Episode 7060 	 Avg length: 194 	 Avg reward: 245
Episode 7080 	 Avg length: 203 	 Avg reward: 255
Episode 7100 	 Avg length: 204 	 Avg reward: 258
Episode 7120 	 Avg length: 198 	 Avg reward: 249
Episode 7140 	 Avg l

Episode 10100 	 Avg length: 204 	 Avg reward: 259
Episode 10120 	 Avg length: 204 	 Avg reward: 259
Episode 10140 	 Avg length: 218 	 Avg reward: 276
Episode 10160 	 Avg length: 195 	 Avg reward: 247
Episode 10180 	 Avg length: 200 	 Avg reward: 254
Episode 10200 	 Avg length: 205 	 Avg reward: 260
Episode 10220 	 Avg length: 199 	 Avg reward: 252
Episode 10240 	 Avg length: 211 	 Avg reward: 269
Episode 10260 	 Avg length: 199 	 Avg reward: 252
Episode 10280 	 Avg length: 194 	 Avg reward: 246
Episode 10300 	 Avg length: 190 	 Avg reward: 242
Episode 10320 	 Avg length: 186 	 Avg reward: 237
Episode 10340 	 Avg length: 198 	 Avg reward: 253
Episode 10360 	 Avg length: 175 	 Avg reward: 222
Episode 10380 	 Avg length: 179 	 Avg reward: 227
Episode 10400 	 Avg length: 194 	 Avg reward: 246
Episode 10420 	 Avg length: 207 	 Avg reward: 264
Episode 10440 	 Avg length: 194 	 Avg reward: 248
Episode 10460 	 Avg length: 195 	 Avg reward: 248
Episode 10480 	 Avg length: 210 	 Avg reward: 266


Episode 13380 	 Avg length: 187 	 Avg reward: 239
Episode 13400 	 Avg length: 190 	 Avg reward: 243
Episode 13420 	 Avg length: 210 	 Avg reward: 269
Episode 13440 	 Avg length: 196 	 Avg reward: 250
Episode 13460 	 Avg length: 200 	 Avg reward: 256
Episode 13480 	 Avg length: 207 	 Avg reward: 264
Episode 13500 	 Avg length: 213 	 Avg reward: 273
Episode 13520 	 Avg length: 218 	 Avg reward: 279
Episode 13540 	 Avg length: 192 	 Avg reward: 246
Episode 13560 	 Avg length: 213 	 Avg reward: 272
Episode 13580 	 Avg length: 205 	 Avg reward: 262
Episode 13600 	 Avg length: 199 	 Avg reward: 255
Episode 13620 	 Avg length: 209 	 Avg reward: 267
Episode 13640 	 Avg length: 205 	 Avg reward: 261
Episode 13660 	 Avg length: 201 	 Avg reward: 257
Episode 13680 	 Avg length: 206 	 Avg reward: 264
Episode 13700 	 Avg length: 196 	 Avg reward: 251
Episode 13720 	 Avg length: 198 	 Avg reward: 253
Episode 13740 	 Avg length: 208 	 Avg reward: 265
Episode 13760 	 Avg length: 211 	 Avg reward: 270


Episode 16660 	 Avg length: 217 	 Avg reward: 277
Episode 16680 	 Avg length: 199 	 Avg reward: 255
Episode 16700 	 Avg length: 208 	 Avg reward: 268
Episode 16720 	 Avg length: 206 	 Avg reward: 263
Episode 16740 	 Avg length: 211 	 Avg reward: 270
Episode 16760 	 Avg length: 204 	 Avg reward: 262
Episode 16780 	 Avg length: 210 	 Avg reward: 269
Episode 16800 	 Avg length: 209 	 Avg reward: 267
Episode 16820 	 Avg length: 205 	 Avg reward: 262
Episode 16840 	 Avg length: 214 	 Avg reward: 274
Episode 16860 	 Avg length: 204 	 Avg reward: 261
Episode 16880 	 Avg length: 210 	 Avg reward: 269
Episode 16900 	 Avg length: 191 	 Avg reward: 245
Episode 16920 	 Avg length: 212 	 Avg reward: 272
Episode 16940 	 Avg length: 225 	 Avg reward: 288
Episode 16960 	 Avg length: 207 	 Avg reward: 266
Episode 16980 	 Avg length: 210 	 Avg reward: 269
Episode 17000 	 Avg length: 200 	 Avg reward: 256
Episode 17020 	 Avg length: 226 	 Avg reward: 290
Episode 17040 	 Avg length: 209 	 Avg reward: 268


Episode 19940 	 Avg length: 201 	 Avg reward: 258
Episode 19960 	 Avg length: 213 	 Avg reward: 273
Episode 19980 	 Avg length: 194 	 Avg reward: 248
Episode 20000 	 Avg length: 204 	 Avg reward: 261


In [13]:
# writer.add_scalar("baseline_length/train", 200, 0)
# writer.add_scalar("baseline_length/train", 200, log_interval)

In [46]:
# ----------------------------------------------------------------------------
# Evaluation: roll out the saved "best" policy for 10 episodes with a smaller
# action std and report the final step index of each episode.
# ----------------------------------------------------------------------------

# select_action requires a buffer argument; nothing here ever trains on it.
memory_2 = Memory()

# NOTE: this rebinds the notebook-level `action_std` and `ppo`, so re-running
# the training cell afterwards would pick up std 0.1 unless the hyperparameter
# cell is executed again first.
action_std=.1
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
# Only `policy_old` is loaded: it is the network select_action runs.
ppo.policy_old.load_state_dict(torch.load(f'./PPO_continuous_{env_name}_best.pth'))
# ppo.policy_old.load_state_dict(torch.load(f'./PPO_continuous_{env_name}.pth'))

time_steps = []
for k in range(10):
    state = env.reset()
    for t in range(max_timesteps):
        # Running policy_old:
        action = ppo.select_action(state, memory_2)
        state, reward, done, _ = env.step(action)
#         state, reward, done, _ = env.step()

        if done:
            break
    # Record every episode, including ones that used the whole step budget
    # without terminating; dropping those would bias the mean downwards and
    # leave `time_steps` empty (mean = nan) if no episode ever terminated.
    time_steps.append(t)
    print(t)
    # Presumably select_action appends to the buffer on every step (TODO
    # confirm); empty it so it does not keep growing across episodes.
    memory_2.clear_memory()
print(f"mean time steps:{np.mean(time_steps)}")

166
193
154
200
276
222
198
190
161
177
mean time steps:193.7
