# Training

In [1]:
import itertools
import datetime
import sys
sys.path.insert(0,'../../envs/')
sys.path.insert(0,'../core/')

import os
from utils import *
from global_vars import BATCH_SIZE, DT, SEED
from PegRobot2D import Frontend, WINDOW_X, WINDOW_Y
import numpy as np
import torch
from sac import SAC
from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt
from rl_batch_trainer import BatchRLAlgorithm

# Experiment configuration for the training run.
#   algorithm_kwargs -> forwarded to BatchRLAlgorithm(**...)
#   trainer_kwargs   -> forwarded to SAC(num_inputs, num_actions, action_range, **...)
#   env_args / env_kwargs -> positional / keyword arguments for Frontend(...)
variant = {
    "algorithm": "SAC",
    "version": "normal",
    "seed": 1,  # used for both torch.manual_seed and np.random.seed
    "save_model": True,  # when true, agent.save_model(...) is called after training

    "algorithm_kwargs": {
        "num_epochs": 25,
        "num_eval_steps_per_epoch": 1000,
        "num_train_loops_per_epoch": 10,
        "num_trains_per_train_loop": 250,
        "num_expl_steps_per_train_loop": 1000,
        "min_num_steps_before_training": 10000,
        "max_path_length": 250,
        "batch_size": 256,
        "prioritised_experience": True,
        "importance_sampling": True,
    },

    "trainer_kwargs": {
        "gamma": 0.99,
        "tau": 0.005,
        "target_update_interval": 1,
        "lr": 5e-4,  # Was 5e-3
        "alpha": 0.2,
        "policy": "Gaussian",
        "automatic_entropy_tuning": True,
        "hidden_size": 64,
        "delayed_policy_steps": 2,
        "replay_buffer_size": int(2e5),
    },

    "env_args": [
        WINDOW_X,
        WINDOW_Y,
        "Peg 2D Robot",
    ],

    "env_kwargs": {
        "vsync": False,
        "resizable": False,
        "visible": False,
    },
}

# Environment
env = Frontend(*variant['env_args'], **variant['env_kwargs'])
# Strings must be compared by value (==), not identity (is): identity only
# works by accident of interning. The previous `... is "DDPG" or "TD3"` was
# also always true because the bare string "TD3" is truthy.
if variant['algorithm'] == "SAC":
    # No need to denorm because in SAC the gaussian policies are already scaled up
    env.denorm_process = False
elif variant['algorithm'] in ("DDPG", "TD3"):
    # Action outputs are in [-1, 1] so we need to denormalise them to pass
    # into the environment
    env.denorm_process = True

# Seed both torch and numpy from the configured seed.
torch.manual_seed(variant['seed'])
np.random.seed(variant['seed'])

# Agent: dimensions and action range are read from the environment.
num_actions = env.num_actions
num_inputs = env.num_states
action_range = env.action_range

agent = SAC(num_inputs, num_actions, action_range, **variant['trainer_kwargs'])

# Tensorboard: one timestamped run directory per execution.
log_dir = 'runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
                                        "Peg2DRobot",
                                        variant['trainer_kwargs']['policy'],
                                        "autotune" if variant['trainer_kwargs']['automatic_entropy_tuning'] else "")
writer = SummaryWriter(logdir=log_dir)

# Training
RL_trainer = BatchRLAlgorithm(**variant['algorithm_kwargs'])
RL_trainer.train(env, agent, writer)

# Save model
if variant['save_model']:
    agent.save_model("Peg2D")

Loading chipmunk for Windows (64bit) [C:\Users\Alvaro\Anaconda3\lib\site-packages\pymunk\chipmunk.dll]
Working on CPU, GPU is too old
Target Entropy -3
Initial Exploration ...
Finished Initial Exploration on 10000 steps 

Finished Epoch 0, replay size 20000
Finished Epoch 1, replay size 30000
Finished Epoch 2, replay size 40000

 MADE IT 1 TIMES TO GOAL 


 MADE IT 2 TIMES TO GOAL 


 MADE IT 3 TIMES TO GOAL 

Finished Epoch 3, replay size 50000

 MADE IT 4 TIMES TO GOAL 


 MADE IT 5 TIMES TO GOAL 


 MADE IT 6 TIMES TO GOAL 


 MADE IT 7 TIMES TO GOAL 

Finished Epoch 4, replay size 60000

 MADE IT 8 TIMES TO GOAL 


 MADE IT 9 TIMES TO GOAL 


 MADE IT 10 TIMES TO GOAL 


 MADE IT 11 TIMES TO GOAL 

Finished Epoch 5, replay size 70000

 MADE IT 12 TIMES TO GOAL 


 MADE IT 13 TIMES TO GOAL 


 MADE IT 14 TIMES TO GOAL 


 MADE IT 15 TIMES TO GOAL 


 MADE IT 16 TIMES TO GOAL 


 MADE IT 17 TIMES TO GOAL 


 MADE IT 18 TIMES TO GOAL 


 MADE IT 19 TIMES TO GOAL 


 MADE IT 20 TIMES T

Finished Epoch 18, replay size 200000

 MADE IT 249 TIMES TO GOAL 


 MADE IT 250 TIMES TO GOAL 


 MADE IT 251 TIMES TO GOAL 


 MADE IT 252 TIMES TO GOAL 


 MADE IT 253 TIMES TO GOAL 


 MADE IT 254 TIMES TO GOAL 


 MADE IT 255 TIMES TO GOAL 


 MADE IT 256 TIMES TO GOAL 


 MADE IT 257 TIMES TO GOAL 


 MADE IT 258 TIMES TO GOAL 


 MADE IT 259 TIMES TO GOAL 


 MADE IT 260 TIMES TO GOAL 


 MADE IT 261 TIMES TO GOAL 


 MADE IT 262 TIMES TO GOAL 


 MADE IT 263 TIMES TO GOAL 


 MADE IT 264 TIMES TO GOAL 


 MADE IT 265 TIMES TO GOAL 


 MADE IT 266 TIMES TO GOAL 


 MADE IT 267 TIMES TO GOAL 

Finished Epoch 19, replay size 200000

 MADE IT 268 TIMES TO GOAL 


 MADE IT 269 TIMES TO GOAL 


 MADE IT 270 TIMES TO GOAL 


 MADE IT 271 TIMES TO GOAL 


 MADE IT 272 TIMES TO GOAL 


 MADE IT 273 TIMES TO GOAL 


 MADE IT 274 TIMES TO GOAL 


 MADE IT 275 TIMES TO GOAL 


 MADE IT 276 TIMES TO GOAL 


 MADE IT 277 TIMES TO GOAL 


 MADE IT 278 TIMES TO GOAL 


 MADE IT 279 TIMES TO G

In [2]:
def _dump_attributes(heading, obj):
    """Print ``heading`` followed by one ``name = value`` line per instance attribute of ``obj``."""
    print(heading)
    for name, val in vars(obj).items():
        print(name, '=', val)


# Show the final state of the trainer and the agent after training.
_dump_attributes('\n Trainer ...', RL_trainer)
_dump_attributes('\n Agent ...', agent)


 Trainer ...
batch_size = 128
max_path_length = 250
num_epochs = 25
num_eval_steps_per_epoch = 1000
num_trains_per_train_loop = 250
num_train_loops_per_epoch = 10
num_expl_steps_per_train_loop = 1000
min_num_steps_before_training = 10000
path_type = ['random', 'exploration', 'eval']
memory_steps = 260000
updates = 62500
prioritised_experience = True
importance_sampling = True

 Agent ...
num_actions = 3
gamma = 0.99
tau = 0.005
alpha = tensor([0.0231], grad_fn=<ExpBackward>)
delayed_policy_steps = 2
policy_type = Gaussian
target_update_interval = 1
automatic_entropy_tuning = True
device = cpu
critic = QNetwork(
  (linear1): Linear(in_features=7, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=64, bias=True)
  (linear3): Linear(in_features=64, out_features=1, bias=True)
  (linear4): Linear(in_features=7, out_features=64, bias=True)
  (linear5): Linear(in_features=64, out_features=64, bias=True)
  (linear6): Linear(in_features=64, out_features=1, bias=True)


# Testing

In [1]:
import itertools
import datetime
import sys
sys.path.insert(0,'../../envs/')
sys.path.insert(0,'../core/')

import os
from utils import *
from PegRobot2D import Frontend, WINDOW_X, WINDOW_Y
import numpy as np
import torch
from sac import SAC
from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt

# Experiment configuration reused by the test cell.
#   trainer_kwargs        -> forwarded to SAC(num_inputs, num_actions, action_range, **...)
#   env_args / env_kwargs -> positional / keyword arguments for Frontend(...)
# NOTE(review): algorithm_kwargs is not read by the visible test code;
# presumably kept to mirror the training configuration - confirm.
variant = {
    "algorithm": "SAC",
    "version": "normal",
    "seed": 1,  # used for both torch.manual_seed and np.random.seed
    "save_model": True,

    "algorithm_kwargs": {
        "num_epochs": 25,
        "num_eval_steps_per_epoch": 1000,
        "num_train_loops_per_epoch": 10,
        "num_trains_per_train_loop": 250,  # Was 100
        "num_expl_steps_per_train_loop": 1000,
        "min_num_steps_before_training": 10000,  # Random exploration steps Initially
        "max_path_length": 250,
        "batch_size": 128,
        "prioritised_experience": True,
        "importance_sampling": True,
    },

    "trainer_kwargs": {
        "gamma": 0.99,
        "tau": 0.005,
        "target_update_interval": 1,
        "lr": 5e-4,  # Was 5e-3
        "alpha": 0.2,
        "policy": "Gaussian",
        "automatic_entropy_tuning": True,
        "hidden_size": 64,
        "delayed_policy_steps": 2,
        "replay_buffer_size": int(2e5),
    },

    "env_args": [
        WINDOW_X,
        WINDOW_Y,
        "Peg 2D Robot",
    ],

    "env_kwargs": {
        "vsync": False,
        "resizable": False,
        "visible": False,
    },
}


# Environment
env = Frontend(*variant['env_args'], **variant['env_kwargs'])
# Compare strings by value (==); `is` tests object identity and only works
# by accident of string interning.
if variant['algorithm'] == "SAC":
    # No need to denorm because in SAC the gaussian policies are already scaled up
    env.denorm_process = False

# Seed both torch and numpy from the configured seed.
torch.manual_seed(variant['seed'])
np.random.seed(variant['seed'])

def run_policy(agent, env=None, framework="SAC"):
    """Run ``agent`` in a newly created, visible ``Frontend`` window.

    Args:
        agent: Policy object. It is attached to the new environment as
            ``env.agent`` and passed to ``env.run_policy``.
        env: Optional existing environment. It is never reused: a fresh
            visible ``Frontend`` is always built. If it is a ``Frontend``,
            only this function's local reference to it is dropped; the
            caller's own reference is unaffected.
        framework: Algorithm name. When equal to ``"SAC"`` the new
            environment's ``denorm_process`` flag is set to ``False``.

    Returns:
        None.
    """
    if isinstance(env, Frontend):
        # Only unbinds the local name; the object lives on while the caller
        # still references it.
        del env
    env = Frontend(WINDOW_X, WINDOW_Y, "RoboPeg2D Simulation", vsync=False, resizable=False, visible=True)
    env.agent = agent
    # Compare by value: `framework is "SAC"` relied on string interning and
    # could be False for an equal string built at runtime.
    if framework == "SAC":
        env.denorm_process = False  # Necessary for SAC
    env.run_policy(agent)

if __name__ == "__main__":
    # Hidden environment; in this block it is used to read the state/action
    # dimensions and is then handed to run_policy.
    env = Frontend(
        WINDOW_X,
        WINDOW_Y,
        "RoboPeg2D Simulation",
        vsync=False,
        resizable=False,
        visible=False,
    )

    # Agent
    num_actions, num_inputs, action_range = (
        env.num_actions,
        env.num_states,
        env.action_range,
    )
    tst_agent = SAC(num_inputs, num_actions, action_range, **variant['trainer_kwargs'])

    # Load the stored actor and critic from disk.
    tst_agent.load_model(
        actor_path="models/actor_Peg2D_",
        critic_path="models/critic_Peg2D_",
    )

    # Run the loaded policy.
    run_policy(tst_agent, env=env, framework="SAC")

Loading chipmunk for Windows (64bit) [C:\Users\Alvaro\Anaconda3\lib\site-packages\pymunk\chipmunk.dll]
Working on CPU, GPU is too old
Target Entropy -3
Loading models from models/actor_Peg2D_ and models/critic_Peg2D_
Vec2d(1101.1398817570932, 361.5294493249713) Vec2d(1000.9914172792379, 357.24883719520295)
MADE IT, CONGRATS
Vec2d(1105.0170573081332, 360.87543170113526) Vec2d(1004.6675068305043, 357.201857853828)
MADE IT, CONGRATS
Vec2d(1108.5593185460052, 360.48431274922797) Vec2d(1008.2532596917827, 357.2893412999825)
MADE IT, CONGRATS
Vec2d(1113.187558631959, 361.3356919693094) Vec2d(1012.6431799063919, 357.4041037146654)
MADE IT, CONGRATS
Vec2d(1115.7142514812033, 361.75510794022324) Vec2d(1015.6239021103099, 357.6595196181699)
MADE IT, CONGRATS
Vec2d(1119.2614105170542, 359.9060775747631) Vec2d(1018.6099491819659, 357.88797381518964)
MADE IT, CONGRATS
Vec2d(1123.0325174712589, 360.18638837214866) Vec2d(1023.0316463432645, 358.3604922560003)
MADE IT, CONGRATS
Vec2d(1126.723032450544

In [3]:
# Run the already-loaded test agent again.
run_policy(tst_agent, env=env, framework="SAC")

Vec2d(1102.6321982149818, 363.88186004587857) Vec2d(1001.527856774991, 358.6648449680093)
MADE IT, CONGRATS
Vec2d(1107.4376003673076, 360.5381900587722) Vec2d(1006.8668008404891, 357.10260999820287)
MADE IT, CONGRATS
Vec2d(1110.902459715265, 363.4731404076276) Vec2d(1010.2349324208942, 358.53220827848884)
MADE IT, CONGRATS
Vec2d(1114.0519146250824, 360.09519562487725) Vec2d(1013.7615448091445, 357.31217362628263)
MADE IT, CONGRATS
Vec2d(1117.7513194481792, 362.27096062456303) Vec2d(1016.7008852202462, 359.0390620097157)
MADE IT, CONGRATS
Vec2d(1121.106749289315, 359.09899033845323) Vec2d(1021.8711790994464, 357.42290087793003)
MADE IT, CONGRATS
Vec2d(1125.7427518018997, 362.7319239576031) Vec2d(1024.1547242397482, 359.84842951627815)
MADE IT, CONGRATS
Vec2d(1127.0750621415575, 359.77652751368475) Vec2d(1028.2277363595676, 357.8247713126838)
MADE IT, CONGRATS
Vec2d(1131.779808019248, 363.73491944459363) Vec2d(1030.1950932256702, 359.8026308913684)
MADE IT, CONGRATS
Vec2d(1134.9280299641

MADE IT, CONGRATS
Vec2d(1149.8610453475364, 359.043761388302) Vec2d(1049.8697084508594, 357.59104424774694)
MADE IT, CONGRATS
Vec2d(1149.8934573616664, 362.64677098832584) Vec2d(1051.119327022231, 359.08240341226946)
MADE IT, CONGRATS
Vec2d(1149.9767225202804, 357.52987398030444) Vec2d(1048.7778601702817, 356.9307155626634)
MADE IT, CONGRATS
Vec2d(1149.7368693606861, 363.05994643711034) Vec2d(1050.596471303907, 359.1255082847951)
MADE IT, CONGRATS
Vec2d(1149.7480500976994, 359.6369098326955) Vec2d(1050.5219870806227, 357.7554363930199)
MADE IT, CONGRATS
Vec2d(1149.5973556817928, 362.3725020120399) Vec2d(1050.5536876918675, 359.15146648006373)
MADE IT, CONGRATS
Vec2d(1149.6833871437213, 358.42965335615264) Vec2d(1049.6749616822256, 358.147721118752)
MADE IT, CONGRATS
Vec2d(1149.6503734639375, 361.4334846392477) Vec2d(1050.8471048300314, 359.66728891263995)
MADE IT, CONGRATS
Vec2d(1149.7265346407603, 364.6436895755276) Vec2d(1050.4551073873527, 360.3727409233585)
MADE IT, CONGRATS
Vec2d(

Vec2d(1149.8078785602916, 361.37289688645177) Vec2d(1050.2574062924696, 358.2524970826977)
MADE IT, CONGRATS
Vec2d(1148.1284784610316, 359.0752282073341) Vec2d(1048.262060281956, 360.8656633846282)
MADE IT, CONGRATS
Vec2d(1149.6059631084274, 362.70980982434514) Vec2d(1050.6008911294894, 358.6989064216245)
MADE IT, CONGRATS
Vec2d(1148.8042574543963, 362.1787469922815) Vec2d(1049.1754646420386, 362.54762956727325)
MADE IT, CONGRATS
Vec2d(1149.7323137213511, 359.6276251761698) Vec2d(1050.033963046969, 357.5113080938736)
MADE IT, CONGRATS
Vec2d(1149.4112580187834, 359.275918597027) Vec2d(1049.864295441357, 360.9260524823049)
MADE IT, CONGRATS
Vec2d(1149.6413893160072, 361.52582393627523) Vec2d(1050.7546233181054, 358.8068773621389)
MADE IT, CONGRATS
Vec2d(1149.9589553246562, 362.70389776130014) Vec2d(1050.968000738636, 361.4020154143372)
MADE IT, CONGRATS
Vec2d(1149.9062778342484, 358.0721648901577) Vec2d(1049.1987536015863, 357.40869652231095)
MADE IT, CONGRATS
