In [None]:
import os
import argparse
import math
import gym
import numpy as np
import itertools
import torch
from PIL import Image
from SAC.sac import SAC
from tensorboardX import SummaryWriter
from SAC.normalized_actions import NormalizedActions
from SAC.replay_memory import ReplayMemory

# Run configuration / hyperparameters for the SAC training cell.
# This plain dict stands in for a command-line parser: every key corresponds
# to a former `--<key>` flag and the dict is passed as-is to `SAC(...)`.
args = {
    "env_name": "BipedalWalker-v2",  # name of the gym environment to run
    "policy": "Gaussian",            # algorithm to use: Gaussian | Deterministic
    "eval": False,                   # evaluate a policy (False = train)
    "gamma": 0.99,                   # discount factor for reward
    "tau": 0.005,                    # target smoothing coefficient (tau)
    "lr": 0.0003,                    # learning rate
    "alpha": 0.2,                    # temperature: weight of the entropy term relative to the reward
    "seed": 543,                     # random seed
    "batch_size": 256,               # batch size sampled from the replay memory
    "num_steps": 5000000,            # maximum number of environment steps
    "steps_in_episode": 2000,        # upper bound of the per-episode step loop
    "hidden_size": 256,              # hidden size
    "updates_per_step": 1,           # model updates per simulator step
    "target_update_interval": 1,     # value target update per no. of updates per step
    "replay_size": 1000000,          # size of the replay buffer
}


# Environment: build the gym env named in the config and wrap it in NormalizedActions.
env = NormalizedActions(gym.make(args['env_name']))
# Seed the environment, torch and numpy with the same configured seed.
# Keep this ahead of the agent construction below so anything random in it
# happens after seeding.
env.seed(args['seed'])
torch.manual_seed(args['seed'])
np.random.seed(args['seed'])

# Agent: receives the observation vector length, the action space and the full config dict.
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorBoard event writer; created without arguments, so it uses the library's default log location.
writer = SummaryWriter()

# Memory: replay buffer sized by `replay_size`.
memory = ReplayMemory(args['replay_size'])

# Training Loop bookkeeping
rewards = []        # one summed reward per finished episode
total_numsteps = 0  # environment steps taken across all episodes
updates = 0         # number of agent.update_parameters calls so far (also the loss-plot x-axis)

# Main training loop. One outer iteration is one episode; it runs until the
# total number of environment steps exceeds args['num_steps'].
# The loop is wrapped in try/finally so the TensorBoard writer is closed (and
# its pending events flushed) even if the run is stopped early, e.g. by
# interrupting the kernel.
try:
    for i_episode in itertools.count():
        state = env.reset()

        episode_reward = 0
        for t in range(args['steps_in_episode']):
            action = agent.select_action(state)  # Sample action from policy
            next_state, reward, done, _ = env.step(action)  # Step
            # True while the episode continues, False on the last transition.
            # NOTE(review): if the env also reports done=True when it merely hits a
            # time limit, those transitions are stored as terminal as well - confirm
            # whether that is intended.
            mask = not done
            memory.push(state, action, reward, next_state, mask)  # Append transition to memory
            # Start learning only once the memory holds more than one batch.
            if len(memory) > args['batch_size']:
                for i in range(args['updates_per_step']):  # Number of updates per step in environment
                    # Sample a batch from memory
                    state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(args['batch_size'])
                    # Update parameters of all the networks
                    value_loss, critic_1_loss, critic_2_loss, policy_loss = agent.update_parameters(
                        state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates)

                    writer.add_scalar('loss/value', value_loss, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

            state = next_state
            total_numsteps += 1
            episode_reward += reward

            if done:
                break

        # Record the episode BEFORE checking the step budget, so the episode that
        # crosses the budget is still logged instead of being silently dropped.
        writer.add_scalar('reward/train', episode_reward, i_episode)
        rewards.append(episode_reward)
        print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, np.round(rewards[-1],2),
                                                                                    np.round(np.mean(rewards[-100:]),2)))

        if total_numsteps > args['num_steps']:
            break
finally:
    writer.close()




[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: <class 'SAC.normalized_actions.NormalizedActions'> doesn't implement 'action' method. Maybe it implements deprecated '_action' method.[0m
Episode: 0, total numsteps: 170, reward: -133.75, average reward: -133.75
Episode: 1, total numsteps: 1770, reward: -96.37, average reward: -115.06
Episode: 2, total numsteps: 1833, reward: -118.98, average reward: -116.37
Episode: 3, total numsteps: 1923, reward: -122.09, average reward: -117.8
Episode: 4, total numsteps: 2019, reward: -127.31, average reward: -119.7
Episode: 5, total numsteps: 3619, reward: -83.25, average reward: -113.63
Episode: 6, total numsteps: 3767, reward: -109.26, average reward: -113.0
Episode: 7, total numsteps: 3837, reward: -107.22, average reward: -112.28
Episode: 8, total numsteps: 5230, reward

Episode: 104, total numsteps: 119436, reward: -102.24, average reward: -92.78
Episode: 105, total numsteps: 119508, reward: -118.28, average reward: -93.13
Episode: 106, total numsteps: 119564, reward: -102.85, average reward: -93.06
Episode: 107, total numsteps: 119700, reward: -104.88, average reward: -93.04
Episode: 108, total numsteps: 119769, reward: -99.86, average reward: -92.54
Episode: 109, total numsteps: 119873, reward: -101.9, average reward: -92.49
Episode: 110, total numsteps: 121473, reward: -84.96, average reward: -92.46
Episode: 111, total numsteps: 123073, reward: -79.93, average reward: -92.21
Episode: 112, total numsteps: 123153, reward: -100.08, average reward: -92.26
Episode: 113, total numsteps: 123242, reward: -100.09, average reward: -92.38
Episode: 114, total numsteps: 123329, reward: -103.3, average reward: -92.35
Episode: 115, total numsteps: 123421, reward: -98.91, average reward: -92.25
Episode: 116, total numsteps: 123536, reward: -107.38, average reward:

Episode: 211, total numsteps: 235473, reward: -71.75, average reward: -87.32
Episode: 212, total numsteps: 237073, reward: -71.8, average reward: -87.04
Episode: 213, total numsteps: 238673, reward: -75.09, average reward: -86.79
Episode: 214, total numsteps: 240273, reward: -75.78, average reward: -86.52
Episode: 215, total numsteps: 241873, reward: -75.9, average reward: -86.29
Episode: 216, total numsteps: 243473, reward: -71.65, average reward: -85.93
Episode: 217, total numsteps: 245073, reward: -73.94, average reward: -85.67
Episode: 218, total numsteps: 246673, reward: -76.92, average reward: -85.42
Episode: 219, total numsteps: 248273, reward: -76.56, average reward: -84.94
Episode: 220, total numsteps: 249873, reward: -71.85, average reward: -84.9
Episode: 221, total numsteps: 251473, reward: -68.09, average reward: -84.57
Episode: 222, total numsteps: 253073, reward: -69.29, average reward: -84.2
Episode: 223, total numsteps: 254673, reward: -71.66, average reward: -83.89
Epi

Episode: 318, total numsteps: 406673, reward: -71.98, average reward: -73.46
Episode: 319, total numsteps: 408273, reward: -71.19, average reward: -73.4
Episode: 320, total numsteps: 409873, reward: -72.32, average reward: -73.41
Episode: 321, total numsteps: 411473, reward: -74.67, average reward: -73.47
Episode: 322, total numsteps: 413073, reward: -78.64, average reward: -73.57
Episode: 323, total numsteps: 414673, reward: -75.48, average reward: -73.61
Episode: 324, total numsteps: 416273, reward: -73.59, average reward: -73.61
Episode: 325, total numsteps: 417873, reward: -81.69, average reward: -73.74
Episode: 326, total numsteps: 419473, reward: -72.64, average reward: -73.76
Episode: 327, total numsteps: 421073, reward: -76.73, average reward: -73.8
Episode: 328, total numsteps: 422673, reward: -78.07, average reward: -73.8
Episode: 329, total numsteps: 424273, reward: -77.13, average reward: -73.83
Episode: 330, total numsteps: 425873, reward: -71.73, average reward: -73.79
Ep

Episode: 425, total numsteps: 577873, reward: -71.86, average reward: -73.59
Episode: 426, total numsteps: 579473, reward: -72.69, average reward: -73.59
Episode: 427, total numsteps: 581073, reward: -70.45, average reward: -73.53
Episode: 428, total numsteps: 582673, reward: -75.01, average reward: -73.5
Episode: 429, total numsteps: 584273, reward: -67.89, average reward: -73.41
Episode: 430, total numsteps: 585873, reward: -75.94, average reward: -73.45
Episode: 431, total numsteps: 587473, reward: -74.78, average reward: -73.46
Episode: 432, total numsteps: 589073, reward: -66.91, average reward: -73.38
Episode: 433, total numsteps: 590673, reward: -72.78, average reward: -73.36
Episode: 434, total numsteps: 592273, reward: -71.62, average reward: -73.32
Episode: 435, total numsteps: 593873, reward: -75.96, average reward: -73.39
Episode: 436, total numsteps: 595473, reward: -76.54, average reward: -73.41
Episode: 437, total numsteps: 597073, reward: -77.46, average reward: -73.47


Episode: 532, total numsteps: 749073, reward: -69.88, average reward: -73.14
Episode: 533, total numsteps: 750673, reward: -68.46, average reward: -73.09
Episode: 534, total numsteps: 752273, reward: -78.69, average reward: -73.16
Episode: 535, total numsteps: 753873, reward: -70.56, average reward: -73.11
Episode: 536, total numsteps: 755473, reward: -74.67, average reward: -73.09
Episode: 537, total numsteps: 757073, reward: -70.84, average reward: -73.03
Episode: 538, total numsteps: 758673, reward: -71.97, average reward: -72.98
Episode: 539, total numsteps: 760273, reward: -74.13, average reward: -72.93
Episode: 540, total numsteps: 761873, reward: -71.17, average reward: -72.96
Episode: 541, total numsteps: 763473, reward: -69.92, average reward: -72.94
Episode: 542, total numsteps: 765073, reward: -71.16, average reward: -72.96
Episode: 543, total numsteps: 766673, reward: -69.94, average reward: -73.01
Episode: 544, total numsteps: 768273, reward: -67.98, average reward: -72.98

Episode: 639, total numsteps: 920273, reward: -72.34, average reward: -72.2
Episode: 640, total numsteps: 921873, reward: -69.99, average reward: -72.19
Episode: 641, total numsteps: 923473, reward: -72.23, average reward: -72.21
Episode: 642, total numsteps: 925073, reward: -74.13, average reward: -72.24
Episode: 643, total numsteps: 926673, reward: -73.96, average reward: -72.28
Episode: 644, total numsteps: 928273, reward: -71.8, average reward: -72.32
Episode: 645, total numsteps: 929873, reward: -70.54, average reward: -72.29
Episode: 646, total numsteps: 931473, reward: -75.96, average reward: -72.34
Episode: 647, total numsteps: 933073, reward: -69.29, average reward: -72.37
Episode: 648, total numsteps: 934673, reward: -73.48, average reward: -72.43
Episode: 649, total numsteps: 936273, reward: -75.35, average reward: -72.4
Episode: 650, total numsteps: 937873, reward: -70.15, average reward: -72.39
Episode: 651, total numsteps: 939473, reward: -68.6, average reward: -72.36
Epi

Episode: 745, total numsteps: 1089873, reward: -69.69, average reward: -72.23
Episode: 746, total numsteps: 1091473, reward: -68.51, average reward: -72.16
Episode: 747, total numsteps: 1093073, reward: -72.52, average reward: -72.19
Episode: 748, total numsteps: 1094673, reward: -75.33, average reward: -72.21
Episode: 749, total numsteps: 1096273, reward: -71.04, average reward: -72.16
Episode: 750, total numsteps: 1097873, reward: -75.72, average reward: -72.22
Episode: 751, total numsteps: 1099473, reward: -68.2, average reward: -72.21
Episode: 752, total numsteps: 1101073, reward: -68.47, average reward: -72.13
Episode: 753, total numsteps: 1102673, reward: -71.28, average reward: -72.11
Episode: 754, total numsteps: 1104273, reward: -71.81, average reward: -72.1
Episode: 755, total numsteps: 1105873, reward: -72.53, average reward: -72.06
Episode: 756, total numsteps: 1107473, reward: -69.51, average reward: -72.01
Episode: 757, total numsteps: 1109073, reward: -73.63, average rew

Episode: 851, total numsteps: 1256378, reward: -70.05, average reward: -73.37
Episode: 852, total numsteps: 1257978, reward: -73.88, average reward: -73.43
Episode: 853, total numsteps: 1259578, reward: -67.89, average reward: -73.39
Episode: 854, total numsteps: 1261178, reward: -68.33, average reward: -73.36
Episode: 855, total numsteps: 1262778, reward: -70.2, average reward: -73.33
Episode: 856, total numsteps: 1264378, reward: -69.84, average reward: -73.34
Episode: 857, total numsteps: 1265978, reward: -70.99, average reward: -73.31
Episode: 858, total numsteps: 1267578, reward: -73.73, average reward: -73.33
Episode: 859, total numsteps: 1269178, reward: -71.06, average reward: -73.28
Episode: 860, total numsteps: 1270778, reward: -74.7, average reward: -73.3
Episode: 861, total numsteps: 1272378, reward: -73.94, average reward: -73.33
Episode: 862, total numsteps: 1273978, reward: -72.3, average reward: -73.29
Episode: 863, total numsteps: 1275578, reward: -75.14, average rewar

In [None]:
def test(agent):   
    random_seed = 0
    episodes = 3
    max_timesteps = 2000
    render = True
    save_gif = True
     
    for i_episode in range(1, episodes):
        state = env.reset()
        episode_reward = 0
        for t in range(10000):  # Don't infinite loop while learning
            action = agent.select_action(state)  # Sample action from policy
            next_state, reward, done, _ = env.step(action)  # Step
           
            if render:
                env.render()  
                if save_gif:
                    dirname = './gif/sac/{}'.format(i_episode)
                    if not os.path.isdir(dirname):
                        os.mkdir(dirname)
                    img = env.render(mode = 'rgb_array')
                    img = Image.fromarray(img)
                    img.save('./gif/sac/{}/{}.jpg'.format(i_episode,t))

            state = next_state            
            episode_reward += reward

            if done:
                break    
   
            
        print('Episode: {}\tReward: {}'.format(i_episode, int(episode_reward)))
        running_reward = 0
        env.close()        
                
# Run the rendering/evaluation helper on the notebook-level `agent`.
test(agent)