In [None]:
import os
import argparse
import math
import gym
import numpy as np
import itertools
import torch
from PIL import Image
from SAC.sac import SAC
from tensorboardX import SummaryWriter
from SAC.normalized_actions import NormalizedActions
from SAC.replay_memory import ReplayMemory

# Disabled command-line interface. The whole parser definition is wrapped in a
# bare triple-quoted string, so it is evaluated as a throw-away expression and
# none of it runs. The hyperparameters actually used come from the `args` dict
# defined right after this block; note that several defaults listed here
# (e.g. tau, lr, seed, replay_size) differ from the values in that dict.
# NOTE(review): presumably disabled because argparse does not play well with
# the notebook kernel's own argv - confirm before re-enabling.
'''
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env-name', default="BipedalWalker-v2",
                    help='name of the environment to run')
parser.add_argument('--policy', default="Gaussian",
                    help='algorithm to use: Gaussian | Deterministic')
parser.add_argument('--eval', type=bool, default=False,
                    help='Evaluate a policy (default:False)')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
                    help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
                    help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
                    help='Temperature parameter α determines the relative importance of the entropy term against the reward (default: 0.2)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                    help='batch size (default: 256)')
parser.add_argument('--num_steps', type=int, default=1000000, metavar='N',
                    help='maximum number of steps (default: 1000000)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
                    help='hidden size (default: 256)')
parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                    help='model updates per simulator step (default: 1)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 10000000)')
args = parser.parse_args()
'''

# Hyperparameters for this run, looked up by key throughout the notebook
# (e.g. args['batch_size']) and also handed as a whole to the SAC constructor.
args = dict(
    # --- task / policy ---
    env_name="BipedalWalker-v2",   # gym environment id passed to gym.make
    policy="Gaussian",
    eval=False,
    # --- optimisation ---
    gamma=0.99,
    tau=0.001,
    lr=0.0001,
    alpha=0.2,
    seed=123,                      # shared seed for env, torch and numpy
    batch_size=256,                # transitions drawn per update
    # --- run length ---
    num_steps=5000000,             # total environment-step budget
    steps_in_episode=2000,         # per-episode step cap of the training loop
    # --- model / update schedule ---
    hidden_size=256,
    updates_per_step=1,            # parameter updates per environment step
    target_update_interval=2,
    replay_size=20000,             # capacity given to ReplayMemory
)


# Environment
# Build the gym environment named in args and wrap it in the project's
# NormalizedActions wrapper (presumably rescales the agent's actions to the
# env's action bounds - TODO confirm against SAC/normalized_actions.py).
env = NormalizedActions(gym.make(args['env_name']))
# Seed the environment, torch and numpy with the same value. This happens
# before the agent is constructed so that network initialisation is seeded too.
# NOTE(review): Python's built-in `random` module is not seeded here; if
# ReplayMemory samples with it, runs will not be fully reproducible - verify.
env.seed(args['seed'])
torch.manual_seed(args['seed'])
np.random.seed(args['seed'])

# Agent
# SAC receives the observation dimensionality, the env's action space and the
# full hyperparameter dict.
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorBoard event writer; no log directory is given, so tensorboardX chooses
# its default location.
writer = SummaryWriter()

# Memory
# Replay buffer sized by args['replay_size'].
memory = ReplayMemory(args['replay_size'])

# Training Loop
# Runs episodes until more than args['num_steps'] environment steps have been
# taken. Every transition is pushed to the replay memory, and once the memory
# holds more than one batch the agent is updated args['updates_per_step']
# times per environment step. Losses and episode returns go to TensorBoard.
rewards = []          # per-episode returns, used for the running average
total_numsteps = 0    # environment steps taken across all episodes
updates = 0           # number of parameter updates so far (x-axis of the loss curves)

# Step count at which gym's time limit ends an episode (None if the env does
# not advertise one). Needed to tell "ran out of time" apart from "fell over".
time_limit = getattr(getattr(env, 'spec', None), 'max_episode_steps', None)

try:
    for i_episode in itertools.count():
        state = env.reset()

        episode_reward = 0
        for t in range(args['steps_in_episode']):
            action = agent.select_action(state)  # Sample action from policy
            next_state, reward, done, info = env.step(action)  # Step

            # An episode cut off by the time limit is not a real terminal
            # state: the value target should still bootstrap from next_state.
            # Prefer the flag newer gym versions put in `info`; otherwise fall
            # back to comparing the step index with the advertised limit.
            if 'TimeLimit.truncated' in info:
                timed_out = bool(info['TimeLimit.truncated'])
            else:
                timed_out = bool(done and time_limit is not None and t + 1 >= time_limit)

            # 1.0 -> keep bootstrapping, 0.0 -> true terminal state
            mask = float((not done) or timed_out)
            memory.push(state, action, reward, next_state, mask)  # Append transition to memory

            if len(memory) > args['batch_size']:
                for _ in range(args['updates_per_step']):  # Number of updates per step in environment
                    # Sample a batch from memory
                    (state_batch, action_batch, reward_batch,
                     next_state_batch, mask_batch) = memory.sample(args['batch_size'])
                    # Update parameters of all the networks
                    value_loss, critic_1_loss, critic_2_loss, policy_loss = agent.update_parameters(
                        state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates)

                    writer.add_scalar('loss/value', value_loss, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

            state = next_state
            total_numsteps += 1
            episode_reward += reward

            if done:
                break

        # Log the episode before checking the step budget so that the final
        # episode is not silently dropped from the statistics.
        writer.add_scalar('reward/train', episode_reward, i_episode)
        rewards.append(episode_reward)
        print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
            i_episode, total_numsteps, np.round(rewards[-1], 2),
            np.round(np.mean(rewards[-100:]), 2)))

        if total_numsteps > args['num_steps']:
            break
finally:
    # Flush buffered TensorBoard events even when the cell is interrupted
    # (the usual way a multi-million-step notebook run is stopped).
    writer.close()




[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: <class 'SAC.normalized_actions.NormalizedActions'> doesn't implement 'action' method. Maybe it implements deprecated '_action' method.[0m
Episode: 0, total numsteps: 1600, reward: -67.62, average reward: -67.62
Episode: 1, total numsteps: 3200, reward: -68.79, average reward: -68.21
Episode: 2, total numsteps: 3252, reward: -110.21, average reward: -82.21
Episode: 3, total numsteps: 3311, reward: -113.16, average reward: -89.95
Episode: 4, total numsteps: 3395, reward: -119.08, average reward: -95.77
Episode: 5, total numsteps: 3465, reward: -105.71, average reward: -97.43
Episode: 6, total numsteps: 3536, reward: -124.84, average reward: -101.34
Episode: 7, total numsteps: 3671, reward: -100.81, average reward: -101.28
Episode: 8, total numsteps: 3737, reward: 

Episode: 104, total numsteps: 32444, reward: -91.9, average reward: -106.76
Episode: 105, total numsteps: 33644, reward: -182.82, average reward: -107.53
Episode: 106, total numsteps: 34650, reward: -174.35, average reward: -108.02
Episode: 107, total numsteps: 35826, reward: -180.86, average reward: -108.82
Episode: 108, total numsteps: 35907, reward: -101.31, average reward: -108.8
Episode: 109, total numsteps: 36914, reward: -174.96, average reward: -109.55
Episode: 110, total numsteps: 37700, reward: -163.44, average reward: -110.17
Episode: 111, total numsteps: 38361, reward: -157.82, average reward: -110.69
Episode: 112, total numsteps: 39100, reward: -162.08, average reward: -111.27
Episode: 113, total numsteps: 39823, reward: -162.1, average reward: -111.9
Episode: 114, total numsteps: 41131, reward: -192.09, average reward: -113.12
Episode: 115, total numsteps: 41812, reward: -158.02, average reward: -113.93
Episode: 116, total numsteps: 41912, reward: -102.23, average reward:

Episode: 210, total numsteps: 134163, reward: -80.15, average reward: -109.75
Episode: 211, total numsteps: 135763, reward: -65.17, average reward: -108.82
Episode: 212, total numsteps: 137363, reward: -77.89, average reward: -107.98
Episode: 213, total numsteps: 138963, reward: -80.15, average reward: -107.16
Episode: 214, total numsteps: 139095, reward: -125.81, average reward: -106.5
Episode: 215, total numsteps: 140695, reward: -70.98, average reward: -105.63
Episode: 216, total numsteps: 142295, reward: -63.61, average reward: -105.24
Episode: 217, total numsteps: 143895, reward: -80.54, average reward: -105.01
Episode: 218, total numsteps: 145495, reward: -76.99, average reward: -104.25
Episode: 219, total numsteps: 147095, reward: -87.05, average reward: -104.06
Episode: 220, total numsteps: 148695, reward: -75.43, average reward: -103.73
Episode: 221, total numsteps: 150295, reward: -92.93, average reward: -103.59
Episode: 222, total numsteps: 151895, reward: -82.4, average rew

Episode: 317, total numsteps: 293792, reward: -64.95, average reward: -77.25
Episode: 318, total numsteps: 295392, reward: -75.83, average reward: -77.24
Episode: 319, total numsteps: 296992, reward: -59.61, average reward: -76.96
Episode: 320, total numsteps: 298592, reward: -60.05, average reward: -76.81
Episode: 321, total numsteps: 298654, reward: -123.4, average reward: -77.12
Episode: 322, total numsteps: 300254, reward: -55.69, average reward: -76.85
Episode: 323, total numsteps: 301854, reward: -40.56, average reward: -76.27
Episode: 324, total numsteps: 303454, reward: -47.23, average reward: -75.78
Episode: 325, total numsteps: 305054, reward: -41.26, average reward: -75.37
Episode: 326, total numsteps: 306654, reward: -40.16, average reward: -74.97
Episode: 327, total numsteps: 308254, reward: -53.34, average reward: -74.67
Episode: 328, total numsteps: 309854, reward: -42.28, average reward: -73.93
Episode: 329, total numsteps: 311454, reward: -44.38, average reward: -73.51

Episode: 424, total numsteps: 444056, reward: -27.3, average reward: -58.09
Episode: 425, total numsteps: 445656, reward: -59.61, average reward: -58.28
Episode: 426, total numsteps: 447256, reward: -75.33, average reward: -58.63
Episode: 427, total numsteps: 448856, reward: -49.63, average reward: -58.59
Episode: 428, total numsteps: 450456, reward: -45.38, average reward: -58.62
Episode: 429, total numsteps: 452056, reward: -37.65, average reward: -58.56
Episode: 430, total numsteps: 453656, reward: -59.46, average reward: -58.79
Episode: 431, total numsteps: 455256, reward: -38.15, average reward: -58.78
Episode: 432, total numsteps: 456856, reward: -74.44, average reward: -58.87
Episode: 433, total numsteps: 458456, reward: -54.74, average reward: -58.8
Episode: 434, total numsteps: 460056, reward: -49.83, average reward: -58.82
Episode: 435, total numsteps: 461656, reward: -51.38, average reward: -58.94
Episode: 436, total numsteps: 463256, reward: -53.1, average reward: -59.04
Ep

In [None]:
def test(agent):   
    random_seed = 0
    episodes = 3
    max_timesteps = 2000
    render = True
    save_gif = True
     
    for i_episode in range(1, episodes):
        state = env.reset()
        episode_reward = 0
        for t in range(10000):  # Don't infinite loop while learning
            action = agent.select_action(state)  # Sample action from policy
            next_state, reward, done, _ = env.step(action)  # Step
           
            if render:
                env.render()  
                if save_gif:
                    dirname = './gif/sac/{}'.format(i_episode)
                    if not os.path.isdir(dirname):
                        os.mkdir(dirname)
                    img = env.render(mode = 'rgb_array')
                    img = Image.fromarray(img)
                    img.save('./gif/sac/{}/{}.jpg'.format(i_episode,t))

            state = next_state            
            episode_reward += reward

            if done:
                break    
   
            
        print('Episode: {}\tReward: {}'.format(i_episode, int(episode_reward)))
        running_reward = 0
        env.close()        
                
# Evaluate the `agent` object created and trained earlier in this notebook.
test(agent)