In [19]:
import os
# https://discuss.pytorch.org/t/how-to-change-the-default-device-of-gpu-device-ids-0/1041/24
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]=f"{1}"

import torch
print(f"current_device:{torch.cuda.current_device()}")
print(f"device_count:{torch.cuda.device_count()}")
device = torch.device("cuda")


import sys
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from ddpg import DDPGagent
from ppo.PPO_continuous import PPO,Memory
# from ddpg.utils import NormalizedEnv
from flexipod_env import FlexipodEnv

env = FlexipodEnv(dof = 12)
# env = NormalizedEnv(env)

############## Hyperparameters ##############
# env_name = "BipedalWalker-v3"
env_name = "flexipod"
render = True
solved_reward = 300         # stop training if avg_reward > solved_reward
log_interval = 20           # print avg reward in the interval
max_episodes = 10000        # max training episodes
max_timesteps = 1500        # max timesteps in one episode

update_timestep = 4000      # update policy every n timesteps
action_std = 0.5            # constant std for action distribution (Multivariate Normal)
# action_std = 1.0          # constant std for action distribution (Multivariate Normal)
K_epochs = 80               # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
gamma = 0.99                # discount factor

lr = 0.0003                 # parameters for Adam optimizer
betas = (0.9, 0.999)

random_seed = None
#############################################
# creating environment
# env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

current_device:0
device_count:1
Destructor called, FlexipodEnv deleted.


In [20]:
from torch.utils.tensorboard import SummaryWriter
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/soft12dof_experiment_3')

In [21]:
# env = FlexipodEnv(dof = 12)
# self = env
# msg_rec,_,_,_ = env.step()

In [None]:
if random_seed:
    print("Random Seed: {}".format(random_seed))
    torch.manual_seed(random_seed)
    env.seed(random_seed)
    np.random.seed(random_seed)

memory = Memory()
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
print(lr,betas)
# ppo.policy_old.load_state_dict(torch.load(f'./PPO_continuous_{env_name}.pth'))

# logging variables
running_reward = 0
avg_length = 0
time_step = 0

# training loop
for i_episode in range(1, max_episodes+1):
    state = env.reset()
    for t in range(max_timesteps):
        time_step +=1
        # Running policy_old:
        action = ppo.select_action(state, memory)
        state, reward, done, _ = env.step(action)

        # Saving reward and is_terminals:
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

        # update if its time
        if time_step % update_timestep == 0:
            ppo.update(memory)
            memory.clear_memory()
            time_step = 0
        running_reward += reward
        if render:
            env.render()
        if done:
            break

    avg_length += t

    # stop training if avg_reward > solved_reward
    if running_reward > (log_interval*solved_reward):
        print("########## Solved! ##########")
        torch.save(ppo.policy.state_dict(), './PPO_continuous_solved_{}.pth'.format(env_name))
        break

    # save every 500 episodes
    if i_episode % 500 == 0:
        torch.save(ppo.policy.state_dict(), './PPO_continuous_{}.pth'.format(env_name))

    # logging
    if i_episode % log_interval == 0:
        avg_length = avg_length/log_interval
        running_reward = running_reward/log_interval
        writer.add_scalar("avg_length/train", avg_length, i_episode)
        writer.add_scalar("running_reward/train", running_reward, i_episode)

        print(f'Episode {i_episode} \t Avg length: {avg_length:.0f} \t Avg reward: {running_reward:.0f}')
        running_reward = 0
        avg_length = 0
        
env.pause()

0.0003 (0.9, 0.999)
Episode 20 	 Avg length: 173 	 Avg reward: 222
Episode 40 	 Avg length: 178 	 Avg reward: 230
Episode 60 	 Avg length: 174 	 Avg reward: 224
Episode 80 	 Avg length: 174 	 Avg reward: 225
Episode 100 	 Avg length: 193 	 Avg reward: 248
Episode 120 	 Avg length: 165 	 Avg reward: 212
Episode 140 	 Avg length: 181 	 Avg reward: 232
Episode 160 	 Avg length: 193 	 Avg reward: 248
Episode 180 	 Avg length: 192 	 Avg reward: 247
Episode 200 	 Avg length: 185 	 Avg reward: 239
Episode 220 	 Avg length: 181 	 Avg reward: 232
Episode 240 	 Avg length: 182 	 Avg reward: 234
Episode 260 	 Avg length: 180 	 Avg reward: 231
Episode 280 	 Avg length: 174 	 Avg reward: 223
Episode 300 	 Avg length: 174 	 Avg reward: 223
Episode 320 	 Avg length: 177 	 Avg reward: 227
Episode 340 	 Avg length: 165 	 Avg reward: 211
Episode 360 	 Avg length: 174 	 Avg reward: 223
Episode 380 	 Avg length: 176 	 Avg reward: 226


In [13]:
# writer.add_scalar("baseline_length/train", 200, 0)
# writer.add_scalar("baseline_length/train", 200, log_interval)

In [7]:
memory_2 = Memory()

# action_std=1.0
ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
ppo.policy_old.load_state_dict(torch.load(f'./PPO_continuous_{env_name}.pth'))

for k in range(50):
    state = env.reset()
    for t in range(max_timesteps):
        # Running policy_old:
        action = ppo.select_action(state, memory_2)
#         state, reward, done, _ = env.step(action)
        state, reward, done, _ = env.step()

        if done:
            print(t)
            break

232
237
233
236
233
233
225
226
235
228
231
227
230
225
228
239
235
237
236
223
239
236
235
242
230
235
235
235
235
223




timeout: timed out