In [None]:
import argparse
import math
import gym
import numpy as np
import itertools
import torch
from sac import SAC
from tensorboardX import SummaryWriter
from normalized_actions import NormalizedActions
from replay_memory import ReplayMemory

'''
parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic example')
parser.add_argument('--env-name', default="BipedalWalker-v2",
                    help='name of the environment to run')
parser.add_argument('--policy', default="Gaussian",
                    help='algorithm to use: Gaussian | Deterministic')
parser.add_argument('--eval', type=bool, default=False,
                    help='Evaluate a policy (default:False)')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
                    help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
                    help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
                    help='Temperature parameter α determines the relative importance of the entropy term against the reward (default: 0.2)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                    help='batch size (default: 256)')
parser.add_argument('--num_steps', type=int, default=1000000, metavar='N',
                    help='maximum number of steps (default: 1000000)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
                    help='hidden size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                    help='model updates per simulator step (default: 1)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
args = parser.parse_args()
'''

# Hyperparameters for the run, hard-coded here in place of command-line
# parsing. Values are looked up by string key (e.g. args['seed']).
args = dict(
    env_name="BipedalWalker-v2",   # name of the environment to run
    policy="Gaussian",             # algorithm to use: Gaussian | Deterministic
    eval=False,                    # evaluate a policy
    gamma=0.99,                    # discount factor for reward
    tau=0.005,                     # target smoothing coefficient
    lr=0.0003,                     # learning rate
    alpha=0.2,                     # temperature: weight of the entropy term against the reward
    seed=543,                      # random seed
    batch_size=256,                # batch size
    num_steps=1000000,             # maximum number of steps
    hidden_size=256,               # hidden size
    updates_per_step=1,            # model updates per simulator step
    target_update_interval=1,      # value target update per no. of updates per step
    replay_size=1000000,           # size of replay buffer
)


# Environment
# Create the gym environment named in the config and wrap it in the project's
# NormalizedActions wrapper (presumably rescales actions -- confirm in
# normalized_actions.py).
raw_env = gym.make(args['env_name'])
env = NormalizedActions(raw_env)

# Seed the environment, PyTorch and NumPy, in that order, with the one
# configured seed value.
for seed_fn in (env.seed, torch.manual_seed, np.random.seed):
    seed_fn(args['seed'])

# Agent
# The agent receives the length of the observation vector, the action space
# and the full hyperparameter dict.
state_dim = env.observation_space.shape[0]
agent = SAC(state_dim, env.action_space, args)

# TensorBoard event writer, created with its default settings.
writer = SummaryWriter()

# Memory
# Replay buffer sized by the 'replay_size' hyperparameter.
replay_capacity = args['replay_size']
memory = ReplayMemory(replay_capacity)

# Training Loop
# Runs whole episodes until more than args['num_steps'] environment steps have
# been taken. After every environment step, once the replay memory holds more
# than one batch of transitions, the agent is updated
# args['updates_per_step'] times on sampled batches and the losses are logged.
rewards = []        # per-episode total reward, used for the 100-episode average
total_numsteps = 0  # environment steps taken across all episodes
updates = 0         # parameter updates performed so far (x-axis of the loss curves)

try:
    for i_episode in itertools.count():
        state = env.reset()

        episode_reward = 0
        done = False
        while not done:
            action = agent.select_action(state)  # Sample action from policy
            next_state, reward, done, _ = env.step(action)  # Step
            # True while the episode continues, False on the final transition.
            # NOTE(review): episodes ended by an environment step limit are
            # stored as terminal as well -- confirm that is intended.
            mask = not done
            memory.push(state, action, reward, next_state, mask)  # Append transition to memory

            if len(memory) > args['batch_size']:
                for _ in range(args['updates_per_step']):  # Number of updates per step in environment
                    # Sample a batch from memory
                    (state_batch, action_batch, reward_batch,
                     next_state_batch, mask_batch) = memory.sample(args['batch_size'])
                    # Update parameters of all the networks
                    value_loss, critic_1_loss, critic_2_loss, policy_loss = agent.update_parameters(
                        state_batch, action_batch, reward_batch,
                        next_state_batch, mask_batch, updates)

                    writer.add_scalar('loss/value', value_loss, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

            state = next_state
            total_numsteps += 1
            episode_reward += reward

        # Record the finished episode BEFORE checking the step budget so the
        # last episode of the run is logged too.
        writer.add_scalar('reward/train', episode_reward, i_episode)
        rewards.append(episode_reward)
        print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
            i_episode, total_numsteps, np.round(rewards[-1], 2),
            np.round(np.mean(rewards[-100:]), 2)))

        if total_numsteps > args['num_steps']:
            break
finally:
    # Always flush the TensorBoard events and release the environment, even
    # when training is interrupted (e.g. KeyboardInterrupt) or fails.
    writer.close()
    env.close()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: <class 'normalized_actions.NormalizedActions'> doesn't implement 'action' method. Maybe it implements deprecated '_action' method.[0m
Episode: 0, total numsteps: 170, reward: -133.75, average reward: -133.75
Episode: 1, total numsteps: 1770, reward: -95.59, average reward: -114.67
Episode: 2, total numsteps: 3370, reward: -64.86, average reward: -98.07
Episode: 3, total numsteps: 3415, reward: -108.45, average reward: -100.66
Episode: 4, total numsteps: 3587, reward: -132.66, average reward: -107.06
Episode: 5, total numsteps: 3648, reward: -108.26, average reward: -107.26
Episode: 6, total numsteps: 5248, reward: -84.3, average reward: -103.98
Episode: 7, total numsteps: 5327, reward: -112.53, average reward: -105.05
Episode: 8, total numsteps: 5379, reward: -1

Episode: 103, total numsteps: 25678, reward: -125.05, average reward: -114.33
Episode: 104, total numsteps: 25757, reward: -124.29, average reward: -114.25
Episode: 105, total numsteps: 25842, reward: -121.81, average reward: -114.39
Episode: 106, total numsteps: 25949, reward: -124.3, average reward: -114.79
Episode: 107, total numsteps: 26049, reward: -100.96, average reward: -114.67
Episode: 108, total numsteps: 26132, reward: -117.61, average reward: -114.73
Episode: 109, total numsteps: 26234, reward: -121.81, average reward: -114.88
Episode: 110, total numsteps: 26331, reward: -122.6, average reward: -114.73
Episode: 111, total numsteps: 26372, reward: -109.08, average reward: -114.7
Episode: 112, total numsteps: 26412, reward: -109.23, average reward: -113.97
Episode: 113, total numsteps: 26519, reward: -121.79, average reward: -114.27
Episode: 114, total numsteps: 26573, reward: -117.46, average reward: -113.57
Episode: 115, total numsteps: 26614, reward: -108.87, average rewar

Episode: 210, total numsteps: 146187, reward: -68.59, average reward: -90.8
Episode: 211, total numsteps: 147787, reward: -77.56, average reward: -90.48
Episode: 212, total numsteps: 149387, reward: -73.47, average reward: -90.12
Episode: 213, total numsteps: 150987, reward: -82.62, average reward: -89.73
Episode: 214, total numsteps: 152587, reward: -70.47, average reward: -89.26
Episode: 215, total numsteps: 154187, reward: -81.09, average reward: -88.98
Episode: 216, total numsteps: 155787, reward: -76.93, average reward: -88.55
Episode: 217, total numsteps: 157387, reward: -75.93, average reward: -88.21
Episode: 218, total numsteps: 158987, reward: -74.35, average reward: -87.86
Episode: 219, total numsteps: 160587, reward: -74.05, average reward: -87.51
Episode: 220, total numsteps: 160665, reward: -116.58, average reward: -87.58
Episode: 221, total numsteps: 162265, reward: -79.41, average reward: -87.26
Episode: 222, total numsteps: 163865, reward: -77.08, average reward: -86.92

Episode: 316, total numsteps: 230051, reward: -103.03, average reward: -92.21
Episode: 317, total numsteps: 230096, reward: -102.11, average reward: -92.48
Episode: 318, total numsteps: 230145, reward: -101.31, average reward: -92.74
Episode: 319, total numsteps: 230196, reward: -101.28, average reward: -93.02
Episode: 320, total numsteps: 230247, reward: -101.5, average reward: -92.87
Episode: 321, total numsteps: 230292, reward: -101.2, average reward: -93.08
Episode: 322, total numsteps: 230347, reward: -100.02, average reward: -93.31
Episode: 323, total numsteps: 230404, reward: -99.03, average reward: -93.58
Episode: 324, total numsteps: 230460, reward: -99.54, average reward: -93.83
Episode: 325, total numsteps: 230511, reward: -102.94, average reward: -94.06
Episode: 326, total numsteps: 230564, reward: -101.17, average reward: -94.34
Episode: 327, total numsteps: 230638, reward: -96.22, average reward: -94.13
Episode: 328, total numsteps: 230698, reward: -99.04, average reward:

Episode: 421, total numsteps: 242373, reward: -86.27, average reward: -102.89
Episode: 422, total numsteps: 243973, reward: -77.1, average reward: -102.66
Episode: 423, total numsteps: 245573, reward: -82.87, average reward: -102.5
Episode: 424, total numsteps: 247173, reward: -82.55, average reward: -102.33
Episode: 425, total numsteps: 248773, reward: -71.01, average reward: -102.01
Episode: 426, total numsteps: 250373, reward: -83.18, average reward: -101.83
Episode: 427, total numsteps: 251973, reward: -80.06, average reward: -101.67
Episode: 428, total numsteps: 253573, reward: -73.35, average reward: -101.41
Episode: 429, total numsteps: 255173, reward: -81.42, average reward: -101.22
Episode: 430, total numsteps: 256773, reward: -79.68, average reward: -101.01
