In [1]:
import os
import torch
import gym
import numpy as np
from TD3_torch.TD3 import TD3
from PIL import Image
from TD3_torch.utils import ReplayBuffer

# ---- Environment and bookkeeping ------------------------------------------
env_name = 'BipedalWalkerHardcore-v2'
random_seed = 555
log_interval = 10                 # episodes between console reports of the average reward
reward_history = []               # per-episode returns; train() appends to this list

# ---- Run-length and buffer limits ------------------------------------------
max_episodes = 100000             # upper bound on the number of training episodes
max_timesteps = 3000              # upper bound on steps inside a single episode
max_buffer_length = 2000000       # max_length handed to ReplayBuffer

# ---- Optimisation ----------------------------------------------------------
learning_rate_base = 1e-4         # starting learning rate; train() anneals from this value
gamma = 0.99                      # discount for future rewards
batch_size = 1024                 # num of transitions sampled from replay buffer

# ---- TD3 knobs -------------------------------------------------------------
exploration_noise = 0.3           # std of the Gaussian noise train() adds to each action
polyak_int = [0.9999, 0.999999]   # [low, high] range for the target-update parameter (1-tau);
                                  # train() currently uses the first entry only
policy_noise = 0.2                # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2                  # delayed policy updates parameter

# ---- Checkpoint location ---------------------------------------------------
directory = "./preTrained/td3_torch/{}".format(env_name)      # save trained models
filename = "TD3_torch_{}_{}".format(env_name, random_seed)



def _annealed_learning_rate(avg_reward, reward_threshold, base_lr):
    """Return the learning rate to use for the given running average reward.

    The rate stays at ``base_lr`` while ``avg_reward`` is at or below -150 and
    decreases linearly towards ``0.1 * base_lr`` as ``avg_reward`` approaches
    ``reward_threshold``.
    """
    part = (reward_threshold - avg_reward) / (reward_threshold + 150)
    if part > 1:
        part = 1
    return base_lr - base_lr * (1 - part) * 0.9


def train():
    """Train a TD3 agent on ``env_name``, resuming from the saved checkpoint.

    Reads its hyper-parameters from the module-level configuration, appends
    every episode return to the module-level ``reward_history`` and writes
    ``episode,reward`` lines to ``log.txt``. Training stops when the mean of
    the last 100 returns reaches ``env.spec.reward_threshold`` (a ``_solved``
    checkpoint is then saved) or after ``max_episodes`` episodes.

    The log file and the environment are released even when the run is
    interrupted (e.g. by ``KeyboardInterrupt``).
    """
    env = gym.make(env_name)

    # Seed *before* the networks are built so that weight initialisation is
    # covered by the seed as well, not only the exploration noise.
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    reward_threshold = env.spec.reward_threshold
    polyak = polyak_int[0]  # kept fixed for the whole run

    # Layer specifications consumed by TD3; each entry is one linear layer.
    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation':'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim),'dropout': False, 'activation': False}
    ]

    critic_config = [
        {'dim': (state_dim + action_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation':'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 1), 'dropout': False, 'activation': False}
    ]

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)
    replay_buffer = ReplayBuffer(max_length=max_buffer_length)

    print("action_space={}".format(env.action_space))
    print("obs_space={}".format(env.observation_space))
    print("threshold={} \n".format(reward_threshold))

    try:
        # Make sure the checkpoint directory exists before any save is attempted.
        os.makedirs(directory, exist_ok=True)

        # loading models
        # NOTE(review): this requires an existing checkpoint; presumably
        # TD3.load raises when the files are missing -- confirm before a fresh run.
        policy.load(directory, filename)

        # The context manager closes the log on every exit path.
        with open("log.txt", "w+") as log_f:
            # training procedure:
            for episode in range(1, max_episodes + 1):
                ep_reward = 0
                state = env.reset()

                for t in range(max_timesteps):
                    # select action and add exploration noise:
                    action = policy.select_action(state)
                    action = action + np.random.normal(0, exploration_noise, size=action_dim)
                    action = action.clip(env.action_space.low, env.action_space.high)

                    # take action in env:
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add((state, action, reward, next_state, float(done)))
                    state = next_state

                    ep_reward += reward

                    # if episode is done then update policy:
                    if done or t == (max_timesteps - 1):
                        # t (index of the last step) is forwarded to update();
                        # presumably it is the number of update iterations -- confirm in TD3.update.
                        policy.update(replay_buffer, t, batch_size, gamma, polyak,
                                      policy_noise, noise_clip, policy_delay)
                        break

                reward_history.append(ep_reward)
                avg_reward = np.mean(reward_history[-100:])

                # logging updates:
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                # if the running average reaches the env's threshold then save and stop training:
                if avg_reward >= reward_threshold:
                    print("########## Solved! ###########")
                    name = filename + '_solved'
                    policy.save(directory, name)
                    break

                # Anneal the learning rate as the running average improves.
                # NOTE(review): if set_optimizers() rebuilds the optimizers, their
                # internal state is reset every episode -- confirm this is intended.
                learning_rate = _annealed_learning_rate(avg_reward, reward_threshold, learning_rate_base)
                policy.set_optimizers(lr=learning_rate)

                # Checkpoint after every episode once past the first 500 episodes.
                if episode > 500:
                    policy.save(directory, filename)

                # print avg reward every log interval:
                if episode % log_interval == 0:
                    print("Ep: {}   Rew: {:3.2f}   Avg Rew: {:3.2f}   LR: {:8.8f}   Polyak: {:6.6f}   Bf: {:2.0f}   Loss: {:5.3f}  {:5.3f}  {:5.3f}".format(
                        episode, ep_reward, avg_reward, learning_rate, polyak, replay_buffer.get_fill(), policy.actor_loss, policy.loss_Q1, policy.loss_Q2))
    finally:
        env.close()

# Start training. The call blocks until the reward threshold is reached or
# max_episodes have elapsed; interrupt the kernel to stop it earlier.
train()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=320, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=320, out_features=160, bias=True)
  (6): ReLU()
  (7): Linear(in_features=160, out_features=64, bias=True)
  (8): ReLU()
  (9): Linear(in_features=64, out_features=4, bias=True)
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=320, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=320, out_features=160, bias=True)
  (6): ReLU()
  (7): Linear(in_features=160, out_features=64, bias=True)
  (8): ReLU()
  (9): Linear(in_features=64, out_features=4, bias=True)


Ep: 520   Rew: -54.59   Avg Rew: -53.45   LR: 0.00008069   Polyak: 0.999900   Bf: 51   Loss: 3.668  0.007  0.008
Ep: 530   Rew: -53.91   Avg Rew: -53.33   LR: 0.00008067   Polyak: 0.999900   Bf: 52   Loss: 3.783  0.015  0.014
Ep: 540   Rew: -55.08   Avg Rew: -53.74   LR: 0.00008075   Polyak: 0.999900   Bf: 53   Loss: 3.754  0.012  0.013
Ep: 550   Rew: -60.48   Avg Rew: -53.75   LR: 0.00008075   Polyak: 0.999900   Bf: 54   Loss: 3.680  0.007  0.010
Ep: 560   Rew: -59.55   Avg Rew: -54.00   LR: 0.00008080   Polyak: 0.999900   Bf: 55   Loss: 3.610  0.091  0.088
Ep: 570   Rew: -63.45   Avg Rew: -53.74   LR: 0.00008075   Polyak: 0.999900   Bf: 56   Loss: 3.598  0.027  0.024
Ep: 580   Rew: -55.35   Avg Rew: -53.97   LR: 0.00008079   Polyak: 0.999900   Bf: 57   Loss: 3.609  0.022  0.020
Ep: 590   Rew: -61.73   Avg Rew: -54.15   LR: 0.00008083   Polyak: 0.999900   Bf: 58   Loss: 3.707  0.045  0.182
Ep: 600   Rew: -38.96   Avg Rew: -54.57   LR: 0.00008091   Polyak: 0.999900   Bf: 59   Loss: 3.6

Ep: 1250   Rew: -41.06   Avg Rew: -57.79   LR: 0.00008156   Polyak: 0.999900   Bf: 100   Loss: 3.347  0.032  0.031
Ep: 1260   Rew: -45.27   Avg Rew: -58.55   LR: 0.00008171   Polyak: 0.999900   Bf: 100   Loss: 3.311  0.008  0.006
Ep: 1270   Rew: -38.02   Avg Rew: -57.45   LR: 0.00008149   Polyak: 0.999900   Bf: 100   Loss: 3.254  0.044  0.039
Ep: 1280   Rew: -50.81   Avg Rew: -57.36   LR: 0.00008147   Polyak: 0.999900   Bf: 100   Loss: 3.338  0.003  0.004
Ep: 1290   Rew: -50.70   Avg Rew: -57.65   LR: 0.00008153   Polyak: 0.999900   Bf: 100   Loss: 3.308  0.021  0.114
Ep: 1300   Rew: -45.28   Avg Rew: -56.85   LR: 0.00008137   Polyak: 0.999900   Bf: 100   Loss: 3.251  0.013  0.013
Ep: 1310   Rew: -54.85   Avg Rew: -56.42   LR: 0.00008128   Polyak: 0.999900   Bf: 100   Loss: 3.293  0.022  0.025
Ep: 1320   Rew: -39.60   Avg Rew: -54.56   LR: 0.00008091   Polyak: 0.999900   Bf: 100   Loss: 3.276  0.009  0.009
Ep: 1330   Rew: -40.75   Avg Rew: -52.03   LR: 0.00008041   Polyak: 0.999900   B

Ep: 1970   Rew: -67.13   Avg Rew: -72.07   LR: 0.00008441   Polyak: 0.999900   Bf: 100   Loss: 3.593  0.047  0.055
Ep: 1980   Rew: -56.73   Avg Rew: -74.69   LR: 0.00008494   Polyak: 0.999900   Bf: 100   Loss: 3.519  0.038  0.029
Ep: 1990   Rew: -74.76   Avg Rew: -77.27   LR: 0.00008545   Polyak: 0.999900   Bf: 100   Loss: 3.570  0.018  0.082
Ep: 2000   Rew: -56.18   Avg Rew: -77.55   LR: 0.00008551   Polyak: 0.999900   Bf: 100   Loss: 3.622  0.143  0.277
Ep: 2010   Rew: -57.62   Avg Rew: -80.23   LR: 0.00008605   Polyak: 0.999900   Bf: 100   Loss: 3.536  0.058  0.050
Ep: 2020   Rew: -100.97   Avg Rew: -81.98   LR: 0.00008640   Polyak: 0.999900   Bf: 100   Loss: 3.580  0.087  0.076
Ep: 2030   Rew: -68.49   Avg Rew: -82.88   LR: 0.00008658   Polyak: 0.999900   Bf: 100   Loss: 3.789  0.246  0.234
Ep: 2040   Rew: -110.12   Avg Rew: -84.97   LR: 0.00008699   Polyak: 0.999900   Bf: 100   Loss: 3.599  0.081  0.057
Ep: 2050   Rew: -58.92   Avg Rew: -85.12   LR: 0.00008702   Polyak: 0.999900  

KeyboardInterrupt: 

In [None]:
def test():
    """Roll out the saved actor for a few rendered episodes.

    Loads the actor weights from the module-level ``directory`` / ``filename``
    (the checkpoint that ``train()`` writes), plays ``n_episodes`` episodes,
    prints each episode's return and, when ``save_gif`` is set, stores every
    rendered frame as ``./gif/td3_torch/<episode>/<step>.jpg``.
    """
    n_episodes = 3
    max_timesteps = 2000
    render = True
    save_gif = True

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # TD3 is constructed from layer configurations (see the call in train()).
    # These must mirror the architecture used in train(), otherwise the saved
    # weights will not fit the network.
    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation':'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim),'dropout': False, 'activation': False}
    ]

    critic_config = [
        {'dim': (state_dim + action_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation':'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 1), 'dropout': False, 'activation': False}
    ]

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)

    # Use the same checkpoint name that train() saves under (module-level
    # seed), rather than rebuilding it with a different seed.
    policy.load_actor(directory, filename)

    try:
        for ep in range(1, n_episodes + 1):
            ep_reward = 0
            state = env.reset()

            frame_dir = './gif/td3_torch/{}'.format(ep)
            if render and save_gif:
                # makedirs also creates the missing parents ('./gif/td3_torch').
                os.makedirs(frame_dir, exist_ok=True)

            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        img = env.render(mode = 'rgb_array')
                        img = Image.fromarray(img)
                        img.save('{}/{}.jpg'.format(frame_dir, t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close the environment once, after all episodes (or on error).
        env.close()
                
# Evaluate the saved actor by running the test() rollout defined in this cell.
test()
    
    