In [None]:
import itertools
import argparse
import datetime
import sys
sys.path.insert(0,'../../envs/')
sys.path.insert(0,'../Core/')
import os
from utils import *
from global_vars import BATCH_SIZE, DT, SEED
from PegRobot2D import Frontend, WINDOW_X, WINDOW_Y
import numpy as np
import torch
from sac import SAC
from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt
from rl_batch_trainer import BatchRLAlgorithm

# Actor and critic checkpoints share one output directory, so a single creation
# call is enough. exist_ok=True keeps the cell idempotent on re-run and removes the
# gap between an "exists" check and the creation in which the call could fail.
save_dir = "models/"
os.makedirs(save_dir, exist_ok=True)
actor_path = save_dir   # same directory as save_dir
critic_path = save_dir  # same directory as save_dir

# Experiment configuration. Each nested section is unpacked into the component
# named in the comment above it.
variant = {
    "algorithm": "SAC",
    "version": "normal",
    "seed": 0,
    "replay_buffer_size": int(1e5),
    "save_model": True,
    # -> BatchRLAlgorithm(replay_buffer, **algorithm_kwargs)
    "algorithm_kwargs": {
        "num_epochs": 25,
        "num_eval_steps_per_epoch": 2500,
        "num_train_loops_per_epoch": 5,
        "num_trains_per_train_loop": 100,
        "num_expl_steps_per_train_loop": 1000,
        # Random exploration steps taken before any training starts
        "min_num_steps_before_training": 2500,
        "max_path_length": 500,
        "batch_size": 256,
    },
    # -> SAC(num_inputs, num_actions, action_range, **trainer_kwargs)
    "trainer_kwargs": {
        "gamma": 0.99,
        "tau": 5e-3,
        "target_update_interval": 1,
        "lr": 3e-4,
        "alpha": 0.2,
        "policy": "Gaussian",
        "automatic_entropy_tuning": True,
        "hidden_size": 256,
    },
    # -> Frontend(*env_args, **env_kwargs)
    "env_args": [
        WINDOW_X,
        WINDOW_Y,
        "Peg 2D Robot",
    ],
    "env_kwargs": {
        "vsync": False,
        "resizable": False,
        "visible": False,
    },
}

# Environment
env = Frontend(*variant['env_args'], **variant['env_kwargs'])
# Compare strings by value. `is` tests object identity, which only happens to work
# when both strings are interned, and triggers a SyntaxWarning on Python 3.8+.
if variant['algorithm'] == "SAC":
    env.denorm_process = False # No need to denorm because in SAC the gaussian policies are already scaled up

# Seed the global PyTorch and NumPy random generators from the configured seed.
# NOTE(review): this runs after the environment is constructed; if Frontend draws
# random numbers in its constructor, that part is not covered by the seed - confirm.
torch.manual_seed(variant['seed'])
np.random.seed(variant['seed'])

# Agent
# The network dimensions and action bounds are read off the environment.
num_actions, num_inputs, action_range = (
    env.num_actions,
    env.num_states,
    env.action_range,
)

# The remaining SAC hyper-parameters come from the trainer section of the config.
agent = SAC(
    num_inputs,
    num_actions,
    action_range,
    **variant["trainer_kwargs"],
)

# Tensorboard
# One run directory per launch: timestamp, task name, policy type and an optional
# "autotune" suffix when automatic entropy tuning is enabled.
log_dir = 'runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
    "Peg2DRobot",
    variant["trainer_kwargs"]["policy"],
    "autotune" if variant["trainer_kwargs"]["automatic_entropy_tuning"] else "",
)
writer = SummaryWriter(logdir=log_dir)

# Replay Memory
# Its capacity is taken from the top-level config entry.
replay_buffer = ReplayBuffer(variant["replay_buffer_size"])

# ---------------------------------------------------------------
# Main training loop
# ---------------------------------------------------------------
# The trainer receives the replay buffer plus the schedule from the config.
RL_trainer = BatchRLAlgorithm(
    replay_buffer,
    **variant["algorithm_kwargs"],
)
RL_trainer.train(env, agent, writer)

# Save the agent under the "Peg2D" tag when the config asks for it.
if variant["save_model"]:
    agent.save_model("Peg2D")
# # PER (prioritised experience replay) is a good idea for SAC: once we start getting BIG rewards the critic becomes very unstable,
# # so transitions with a high TD error are definitely more important.
# # Normalised states?

Working on CPU, GPU is too old
Target Entropy -3
Initial Exploration ...
Finished Initial Exploration on 2500 steps 

Finished Epoch 0, replay size 7500
Finished Epoch 1, replay size 12500
Finished Epoch 2, replay size 17500
Finished Epoch 3, replay size 22500
Finished Epoch 4, replay size 27500

 MADE IT 1 TIMES TO GOAL 

Finished Epoch 5, replay size 32500

 MADE IT 2 TIMES TO GOAL 

Finished Epoch 6, replay size 37500


In [2]:
# Print every instance attribute of the trainer as "name = value".
# NOTE(review): the stored output of this cell (e.g. max_path_length = 250,
# num_epochs = 50) does not match the first cell's configuration (500 / 25), so it
# looks like it came from an earlier run - re-execute after training to refresh it.
for attribute, value in vars(RL_trainer).items():
    print(attribute, value, sep=' = ')

batch_size = 256
max_path_length = 250
num_epochs = 50
num_eval_steps_per_epoch = 1000
num_trains_per_train_loop = 250
num_train_loops_per_epoch = 1
num_expl_steps_per_train_loop = 1000
min_num_steps_before_training = 2500
path_type = ['random', 'exploration', 'eval']
replay_buffer = <utils.ReplayBuffer object at 0x000002A7C788D080>
memory_steps = 52500
updates = 12500


In [2]:
from sac import SAC
from utils import *
from models import weights_init
import sys
import argparse
sys.path.insert(0,'../../envs/')
from PegRobot2D import Frontend, WINDOW_X, WINDOW_Y

# Configuration used to rebuild the agent and the environment for policy playback.
variant = {
    "algorithm": "SAC",
    "version": "normal",
    "seed": 123456,
    "replay_buffer_size": int(1e5),
    "save_model": True,
    # Training schedule; nothing in this cell reads this section.
    "algorithm_kwargs": {
        "num_epochs": 50,
        "num_eval_steps_per_epoch": 1000,
        "num_trains_per_train_loop": 250,
        "num_expl_steps_per_train_loop": 1000,
        # Random exploration steps taken before any training starts
        "min_num_steps_before_training": 2500,
        "max_path_length": 250,
        "batch_size": 256,
    },
    # -> SAC(num_inputs, num_actions, action_range, **trainer_kwargs)
    "trainer_kwargs": {
        "gamma": 0.99,
        "tau": 5e-3,
        "target_update_interval": 1,
        "lr": 3e-4,
        "alpha": 0.2,
        "policy": "Gaussian",
        "automatic_entropy_tuning": True,
        "hidden_size": 256,
    },
    # -> Frontend(*env_args, **env_kwargs)
    "env_args": [
        WINDOW_X,
        WINDOW_Y,
        "Peg 2D Robot",
    ],
    "env_kwargs": {
        "vsync": False,
        "resizable": False,
        "visible": False,
    },
}

# Environment
env = Frontend(*variant['env_args'], **variant['env_kwargs'])
# Compare strings by value. `is` tests object identity, which only happens to work
# when both strings are interned, and triggers a SyntaxWarning on Python 3.8+.
if variant['algorithm'] == "SAC":
    env.denorm_process = False # No need to denorm because in SAC the gaussian policies are already scaled up

# Seed the global PyTorch and NumPy random generators from the configured seed.
# NOTE(review): `torch` and `np` are not imported in this cell; they are only in
# scope if the first cell of the notebook has been executed in the same kernel.
torch.manual_seed(variant['seed'])
np.random.seed(variant['seed'])

def run_policy(agent, env = None, framework = "SAC"):
    """Run a policy in a freshly created, visible simulation window.

    Args:
        agent: Policy to run. It is stored on the environment as ``env.agent`` and
            also passed to ``Frontend.run_policy``.
        env: Optional existing ``Frontend``. It is never reused: the local
            reference is dropped and a new visible ``Frontend`` is always built.
        framework: Name of the RL algorithm behind ``agent``. For ``"SAC"`` the
            environment's ``denorm_process`` flag is switched off.
    """
    if isinstance(env, Frontend):
        # Drops only this function's reference; a caller that still holds the
        # object keeps it alive.
        del env
    env = Frontend(WINDOW_X, WINDOW_Y, "RoboPeg2D Simulation", vsync = False, resizable = False, visible = True)
    env.agent = agent
    # Compare by value: an identity test (`is`) silently fails for any "SAC"
    # string that is not the very same interned object as the literal.
    if framework == "SAC":
        env.denorm_process = False # Necessary for SAC
    env.run_policy(agent)

if __name__ == "__main__":
    # Hidden-window environment; the agent's dimensions are read from it below.
    # run_policy builds its own visible environment for the actual playback.
    env = Frontend(
        WINDOW_X,
        WINDOW_Y,
        "RoboPeg2D Simulation",
        vsync=False,
        resizable=False,
        visible=False,
    )

    # Agent
    num_actions, num_inputs, action_range = (
        env.num_actions,
        env.num_states,
        env.action_range,
    )
    tst_agent = SAC(
        num_inputs,
        num_actions,
        action_range,
        **variant["trainer_kwargs"],
    )

    # Load the stored actor and critic from the given path prefixes.
    tst_agent.load_model(
        actor_path="models/actor_Peg2D_",
        critic_path="models/critic_Peg2D_",
    )

    run_policy(tst_agent, env, "SAC")

Working on CPU, GPU is too old
Target Entropy -3
Loading models from models/actor_Peg2D_ and models/critic_Peg2D_
1058 371
0.5140625
Vec2d(1149.9917398551534, 363.04695559348136)
<pymunk.shapes.Poly object at 0x0000018BA00E9C18> Body(6074.15926535898, 20279593.06727851, Body.DYNAMIC)
959 372
0.43671875
Vec2d(1149.990763005496, 362.36617360457024)
<pymunk.shapes.Poly object at 0x0000018BA00E9C18> Body(6074.15926535898, 20279593.06727851, Body.DYNAMIC)
