In [1]:
import os
import argparse
import math
import gym
import numpy as np
import itertools
import torch
from PIL import Image
from SAC.sac import SAC
from tensorboardX import SummaryWriter
from SAC.normalized_actions import NormalizedActions
from SAC.replay_memory import ReplayMemory

# Hyperparameters for this run.
#
# The notebook takes its settings from this dict rather than from the command
# line. Each key is annotated with the meaning it had in the argparse parser
# that previously sat here as a dead string literal; where the value below
# differs from that parser's default, the old default is noted.
args = {
    "env_name": "BipedalWalker-v2",  # name of the environment to run
    "policy": "Gaussian",            # algorithm to use: Gaussian | Deterministic
    "eval": False,                   # evaluate a policy
    "gamma": 0.99,                   # discount factor for reward
    "tau": 0.001,                    # target smoothing coefficient (τ); parser default was 0.005
    "lr": 0.001,                     # learning rate; parser default was 0.0003
    "alpha": 0.2,                    # temperature α: weight of the entropy term against the reward
    "seed": 543,                     # random seed
    "batch_size": 256,               # batch size
    "num_steps": 5000000,            # maximum number of steps; parser default was 1000000
    "hidden_size": 256,              # hidden size
    "updates_per_step": 1,           # model updates per simulator step
    "target_update_interval": 1,     # value target update per no. of updates per step
    "replay_size": 1000000,          # size of replay buffer
}


# Environment
# The gym environment is wrapped in the project's NormalizedActions wrapper,
# then the env, torch and numpy are all seeded from the same configured seed.
env = NormalizedActions(gym.make(args['env_name']))
env.seed(args['seed'])
torch.manual_seed(args['seed'])
np.random.seed(args['seed'])

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

writer = SummaryWriter()

# Memory
memory = ReplayMemory(args['replay_size'])

# Training Loop
MAX_EPISODE_STEPS = 10000  # hard cap on environment steps within one episode
rewards = []               # per-episode returns, used for the running average
total_numsteps = 0         # environment steps taken across all episodes
updates = 0                # gradient updates performed so far (x-axis of the loss curves)

# This cell is normally stopped by hand (KeyboardInterrupt), so the writer is
# closed in a `finally` to make sure the logged scalars are written out.
try:
    for i_episode in itertools.count():
        state = env.reset()

        episode_reward = 0
        for t in range(MAX_EPISODE_STEPS):
            action = agent.select_action(state)  # Sample action from policy
            next_state, reward, done, _ = env.step(action)  # Step
            # NOTE(review): many logged episodes last exactly 1600 steps, which
            # suggests `done` is also raised by a time limit; those transitions
            # are stored with mask == False like real terminals. Confirm that
            # this is intended.
            mask = not done  # 1 for not done and 0 for done
            memory.push(state, action, reward, next_state, mask)  # Append transition to memory

            # Start learning only once the buffer holds more than one batch.
            if len(memory) > args['batch_size']:
                for _ in range(args['updates_per_step']):  # Number of updates per step in environment
                    # Sample a batch from memory
                    state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(args['batch_size'])
                    # Update parameters of all the networks
                    value_loss, critic_1_loss, critic_2_loss, policy_loss = agent.update_parameters(
                        state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates)

                    writer.add_scalar('loss/value', value_loss, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

            state = next_state
            total_numsteps += 1
            episode_reward += reward

            if done:
                break

        # Stop once the global step budget is exhausted (checked between episodes).
        if total_numsteps > args['num_steps']:
            break

        writer.add_scalar('reward/train', episode_reward, i_episode)
        rewards.append(episode_reward)
        print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, np.round(rewards[-1],2),
                                                                                    np.round(np.mean(rewards[-100:]),2)))
finally:
    writer.close()




[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: <class 'SAC.normalized_actions.NormalizedActions'> doesn't implement 'action' method. Maybe it implements deprecated '_action' method.[0m
Episode: 0, total numsteps: 170, reward: -133.75, average reward: -133.75
Episode: 1, total numsteps: 1770, reward: -79.8, average reward: -106.78
Episode: 2, total numsteps: 1866, reward: -102.0, average reward: -105.18
Episode: 3, total numsteps: 3466, reward: -95.41, average reward: -102.74
Episode: 4, total numsteps: 5066, reward: -67.44, average reward: -95.68
Episode: 5, total numsteps: 5125, reward: -101.64, average reward: -96.67
Episode: 6, total numsteps: 6725, reward: -74.66, average reward: -93.53
Episode: 7, total numsteps: 6783, reward: -109.92, average reward: -95.58
Episode: 8, total numsteps: 8383, reward: -76

Episode: 104, total numsteps: 66597, reward: -75.24, average reward: -98.47
Episode: 105, total numsteps: 68197, reward: -75.19, average reward: -98.2
Episode: 106, total numsteps: 69797, reward: -70.08, average reward: -98.16
Episode: 107, total numsteps: 71397, reward: -75.22, average reward: -97.81
Episode: 108, total numsteps: 72997, reward: -81.01, average reward: -97.85
Episode: 109, total numsteps: 74597, reward: -72.35, average reward: -97.58
Episode: 110, total numsteps: 76197, reward: -69.8, average reward: -97.26
Episode: 111, total numsteps: 76377, reward: -113.06, average reward: -97.41
Episode: 112, total numsteps: 77977, reward: -73.68, average reward: -97.14
Episode: 113, total numsteps: 79577, reward: -76.27, average reward: -96.81
Episode: 114, total numsteps: 81177, reward: -75.87, average reward: -96.62
Episode: 115, total numsteps: 81302, reward: -106.36, average reward: -96.69
Episode: 116, total numsteps: 82902, reward: -75.53, average reward: -96.4
Episode: 117,

Episode: 211, total numsteps: 234902, reward: -73.59, average reward: -76.12
Episode: 212, total numsteps: 236502, reward: -75.52, average reward: -76.14
Episode: 213, total numsteps: 238102, reward: -75.7, average reward: -76.14
Episode: 214, total numsteps: 239702, reward: -72.65, average reward: -76.11
Episode: 215, total numsteps: 241302, reward: -75.78, average reward: -75.8
Episode: 216, total numsteps: 242902, reward: -74.17, average reward: -75.79
Episode: 217, total numsteps: 244502, reward: -71.92, average reward: -75.76
Episode: 218, total numsteps: 246102, reward: -75.47, average reward: -75.73
Episode: 219, total numsteps: 247702, reward: -73.68, average reward: -75.66
Episode: 220, total numsteps: 249302, reward: -78.85, average reward: -75.61
Episode: 221, total numsteps: 250902, reward: -62.13, average reward: -75.44
Episode: 222, total numsteps: 252502, reward: -68.01, average reward: -75.39
Episode: 223, total numsteps: 254102, reward: -69.21, average reward: -75.28
E

Episode: 318, total numsteps: 406102, reward: -72.31, average reward: -73.16
Episode: 319, total numsteps: 407702, reward: -73.26, average reward: -73.15
Episode: 320, total numsteps: 409302, reward: -75.1, average reward: -73.12
Episode: 321, total numsteps: 410902, reward: -77.17, average reward: -73.27
Episode: 322, total numsteps: 412502, reward: -74.62, average reward: -73.33
Episode: 323, total numsteps: 414102, reward: -72.63, average reward: -73.37
Episode: 324, total numsteps: 415702, reward: -73.85, average reward: -73.33
Episode: 325, total numsteps: 417302, reward: -77.01, average reward: -73.39
Episode: 326, total numsteps: 418902, reward: -79.34, average reward: -73.46
Episode: 327, total numsteps: 420502, reward: -70.43, average reward: -73.39
Episode: 328, total numsteps: 422102, reward: -77.8, average reward: -73.42
Episode: 329, total numsteps: 423702, reward: -80.58, average reward: -73.48
Episode: 330, total numsteps: 425302, reward: -74.46, average reward: -73.47
E

Episode: 425, total numsteps: 577302, reward: -68.86, average reward: -74.32
Episode: 426, total numsteps: 578902, reward: -72.47, average reward: -74.25
Episode: 427, total numsteps: 580502, reward: -69.14, average reward: -74.24
Episode: 428, total numsteps: 582102, reward: -73.7, average reward: -74.2
Episode: 429, total numsteps: 583702, reward: -74.47, average reward: -74.14
Episode: 430, total numsteps: 585302, reward: -71.63, average reward: -74.11
Episode: 431, total numsteps: 586902, reward: -68.49, average reward: -74.08
Episode: 432, total numsteps: 586963, reward: -111.85, average reward: -74.5
Episode: 433, total numsteps: 588563, reward: -72.03, average reward: -74.5
Episode: 434, total numsteps: 590163, reward: -72.95, average reward: -74.47
Episode: 435, total numsteps: 591763, reward: -81.04, average reward: -74.54
Episode: 436, total numsteps: 593363, reward: -75.15, average reward: -74.57
Episode: 437, total numsteps: 594963, reward: -77.5, average reward: -74.59
Epi

KeyboardInterrupt: 

In [None]:
def test(agent, episodes=3, max_timesteps=10000, render=True, save_gif=True):
    """Roll out ``agent`` on the notebook-level ``env`` and print each episode's reward.

    Args:
        agent: object exposing ``select_action(state)``; its return value is
            passed straight to ``env.step``.
        episodes: number of evaluation episodes to run (numbered from 1).
        max_timesteps: upper bound on steps per episode; an episode also ends
            as soon as the environment reports ``done``.
        render: when True, call ``env.render()`` after every step.
        save_gif: when True (and ``render`` is True), also save every frame as
            ``./gif/sac/<episode>/<step>.jpg``.

    The environment is closed once, after the last episode or on error.
    """
    try:
        # `episodes + 1` so that exactly `episodes` rollouts are performed.
        for i_episode in range(1, episodes + 1):
            state = env.reset()
            episode_reward = 0

            frame_dir = './gif/sac/{}'.format(i_episode)
            if render and save_gif:
                # Create the whole directory chain once per episode; plain
                # os.mkdir would fail if './gif/sac' did not exist yet.
                os.makedirs(frame_dir, exist_ok=True)

            for t in range(max_timesteps):
                action = agent.select_action(state)
                next_state, reward, done, _ = env.step(action)

                if render:
                    env.render()
                    if save_gif:
                        frame = Image.fromarray(env.render(mode='rgb_array'))
                        frame.save('{}/{}.jpg'.format(frame_dir, t))

                state = next_state
                episode_reward += reward

                if done:
                    break

            print('Episode: {}\tReward: {}'.format(i_episode, int(episode_reward)))
    finally:
        # Close only after all episodes; closing inside the loop would tear the
        # environment down before the next reset().
        env.close()


test(agent)