In [1]:
import os
import torch
import gym
import numpy as np
from TD3_torch.TD3 import TD3
from PIL import Image
from TD3_torch.utils import ReplayBuffer

# ---------------------------------------------------------------------------
# Run configuration (read as module-level globals by train() and test())
# ---------------------------------------------------------------------------

# Environment and bookkeeping
env_name = 'BipedalWalkerHardcore-v2'
random_seed = 222
log_interval = 10                    # episodes between progress printouts
reward_history = list()              # per-episode returns; train() appends to this

# Episode / replay-buffer limits
max_episodes = 100000                # upper bound on training episodes
max_timesteps = 3000                 # upper bound on steps within one episode
max_buffer_length = 2000000          # capacity handed to ReplayBuffer
batch_size = 1024                    # transitions sampled from the replay buffer per update

# TD3 hyperparameters
learning_rate_base = 1e-4            # starting LR; train() scales it down as avg reward rises
gamma = 0.99                         # discount factor for future rewards
polyak_int = [0.9999, 0.999999]      # (1 - tau) interval for target updates; train() uses only the first entry
policy_noise = 0.2                   # target policy smoothing noise
noise_clip = 0.5                     # clip bound passed alongside policy_noise
policy_delay = 2                     # delayed policy update parameter
exploration_noise_base = 0.0         # std-dev of Gaussian action noise (0.0 => no added noise)

# Checkpoint location: where trained models are saved / loaded
directory = "./preTrained/td3_torch/" + env_name
filename = "_".join(["TD3_torch", env_name, str(random_seed)])



def train():
    """Train a TD3 agent on ``env_name``, logging and checkpointing as it goes.

    All settings come from the module-level configuration (``env_name``,
    ``learning_rate_base``, ``batch_size``, ``gamma``, ``polyak_int``,
    ``policy_noise``, ``noise_clip``, ``policy_delay``, ``max_episodes``,
    ``max_timesteps``, ``max_buffer_length``, ``directory``, ``filename``, ...).

    Per episode, the agent acts until the env reports ``done`` or
    ``max_timesteps`` is reached, storing every transition in the replay
    buffer. ``policy.update`` is then called once, receiving the index of the
    last step. Afterwards the learning rate and exploration noise are
    re-scaled from the average reward of the last 100 episodes.

    Side effects:
        * appends each episode's return to the module-level ``reward_history``
          (which therefore keeps growing if this function is called again in
          the same kernel);
        * truncates and rewrites ``log.txt`` with ``episode,reward`` lines;
        * calls ``policy.load(directory, filename)`` before training, so a
          checkpoint is expected to exist already;
        * saves the policy under ``filename`` after every episode past the
          500th, and under ``filename + '_solved'`` when the running average
          reaches ``env.spec.reward_threshold`` (training then stops).

    Returns:
        None.
    """
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    reward_threshold = env.spec.reward_threshold
    polyak = polyak_int[0]  # held constant for the whole run
    exploration_noise = exploration_noise_base

    # Seed every RNG *before* the networks are built so that weight
    # initialisation is covered by the seed as well as the rollouts.
    # NOTE: a seed of 0 is falsy and therefore skips seeding entirely.
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation': 'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim), 'dropout': False, 'activation': False}
    ]

    critic_config = [
        {'dim': (state_dim + action_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': False, 'activation': 'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 1), 'dropout': False, 'activation': False}
    ]

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)
    replay_buffer = ReplayBuffer(max_length=max_buffer_length)

    print("action_space={}".format(env.action_space))
    print("obs_space={}".format(env.observation_space))
    print("threshold={} \n".format(reward_threshold))

    # Resume from the existing checkpoint.
    policy.load(directory, filename)

    # Make sure the checkpoint directory exists before the first save, in
    # case policy.save does not create it itself.
    if not os.path.isdir(directory):
        os.makedirs(directory)

    # The context manager guarantees log.txt is closed on every exit path:
    # solved, max_episodes exhausted, an exception, or a kernel interrupt.
    with open("log.txt", "w+") as log_f:
        for episode in range(1, max_episodes + 1):
            ep_reward = 0
            state = env.reset()

            for t in range(max_timesteps):
                # Select an action and add Gaussian exploration noise.
                action = policy.select_action(state)
                action = action + np.random.normal(0, exploration_noise, size=action_dim)
                action = action.clip(env.action_space.low, env.action_space.high)

                # Step the environment and remember the transition.
                next_state, reward, done, _ = env.step(action)
                replay_buffer.add((state, action, reward, next_state, float(done)))
                state = next_state

                ep_reward += reward

                # Update the policy once, at the end of the episode.
                if done or t == (max_timesteps - 1):
                    policy.update(replay_buffer, t, batch_size, gamma, polyak,
                                  policy_noise, noise_clip, policy_delay)
                    break

            reward_history.append(ep_reward)
            avg_reward = np.mean(reward_history[-100:])

            # Flush after every episode so the log survives an interrupt.
            log_f.write('{},{}\n'.format(episode, ep_reward))
            log_f.flush()

            # Once the running average reaches the env's threshold, save and stop training.
            if avg_reward >= reward_threshold:
                print("########## Solved! ###########")
                policy.save(directory, filename + '_solved')
                break

            # Fraction of the distance still to go to the threshold, capped at 1.
            # It is > 0 here because the solved case has already exited above.
            part = (reward_threshold - avg_reward) / (reward_threshold + 150)
            if part > 1:
                part = 1

            # Learning rate shrinks towards 10% of its base value as part -> 0.
            learning_rate = learning_rate_base - learning_rate_base * (1 - part) * 0.9
            policy.set_optimizers(lr=learning_rate)

            # Exploration noise follows the same schedule.
            exploration_noise = exploration_noise_base - exploration_noise_base * (1 - part) * 0.9

            # Checkpoint after every episode once past the first 500.
            if episode > 500:
                policy.save(directory, filename)

            # Print the running statistics every log_interval episodes.
            if episode % log_interval == 0:
                print("Ep:{}   Rew:{:3.2f}  Avg Rew:{:3.2f}  LR:{:8.8f}   Polyak:{:5.5f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}".format(
                    episode, ep_reward, avg_reward, learning_rate, polyak, replay_buffer.get_fill(), exploration_noise, policy.actor_loss, policy.loss_Q1, policy.loss_Q2))

# Start training. Long-running: loops for up to max_episodes episodes and only
# stops early when the average reward reaches the env threshold or the kernel
# is interrupted.
train()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=320, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=320, out_features=160, bias=True)
  (6): ReLU()
  (7): Linear(in_features=160, out_features=64, bias=True)
  (8): ReLU()
  (9): Linear(in_features=64, out_features=4, bias=True)
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=320, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=320, out_features=160, bias=True)
  (6): ReLU()
  (7): Linear(in_features=160, out_features=64, bias=True)
  (8): ReLU()
  (9): Linear(in_features=64, out_features=4, bias=True)


Ep:550   Rew:53.77  Avg Rew:2.43  LR:0.00006951   Polyak:0.99990  Bf:48  EN:0.0000  Loss: 4.541 0.378 0.425
Ep:560   Rew:-3.53  Avg Rew:4.98  LR:0.00006900   Polyak:0.99990  Bf:49  EN:0.0000  Loss: 4.931 0.459 0.384
Ep:570   Rew:61.01  Avg Rew:6.92  LR:0.00006862   Polyak:0.99990  Bf:50  EN:0.0000  Loss: 4.801 1.450 0.842
Ep:580   Rew:19.51  Avg Rew:2.70  LR:0.00006946   Polyak:0.99990  Bf:51  EN:0.0000  Loss: 4.823 0.403 0.485
Ep:590   Rew:-63.39  Avg Rew:2.72  LR:0.00006946   Polyak:0.99990  Bf:51  EN:0.0000  Loss: 4.685 0.435 0.373
Ep:600   Rew:-59.82  Avg Rew:-0.81  LR:0.00007016   Polyak:0.99990  Bf:52  EN:0.0000  Loss: 4.694 0.619 0.633
Ep:610   Rew:25.44  Avg Rew:3.69  LR:0.00006926   Polyak:0.99990  Bf:53  EN:0.0000  Loss: 4.440 0.260 0.253
Ep:620   Rew:-136.86  Avg Rew:0.22  LR:0.00006996   Polyak:0.99990  Bf:54  EN:0.0000  Loss: 4.903 0.537 0.593
Ep:630   Rew:-34.43  Avg Rew:6.78  LR:0.00006864   Polyak:0.99990  Bf:55  EN:0.0000  Loss: 4.913 0.325 0.325
Ep:640   Rew:34.36  Av

Ep:1300   Rew:117.51  Avg Rew:32.47  LR:0.00006351   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.601 0.328 0.297
Ep:1310   Rew:-8.23  Avg Rew:34.40  LR:0.00006312   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.630 0.311 0.463
Ep:1320   Rew:-2.93  Avg Rew:35.83  LR:0.00006283   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.309 0.191 0.216
Ep:1330   Rew:-42.37  Avg Rew:37.18  LR:0.00006256   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.696 0.316 0.244
Ep:1340   Rew:50.53  Avg Rew:31.32  LR:0.00006374   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.450 0.479 0.465
Ep:1350   Rew:87.01  Avg Rew:24.64  LR:0.00006507   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.647 0.472 0.665
Ep:1360   Rew:-60.37  Avg Rew:22.98  LR:0.00006540   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.259 0.674 0.538
Ep:1370   Rew:-13.71  Avg Rew:20.48  LR:0.00006590   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.705 0.540 0.671
Ep:1380   Rew:75.47  Avg Rew:25.79  LR:0.00006484   Polyak:0.99990  Bf:100  EN:0.0000  Loss: 4.330 0.206 0.2

KeyboardInterrupt: 

In [None]:
def test(random_seed=0, n_episodes=3, max_timesteps=2000, render=True, save_gif=True):
    """Roll out a saved TD3 actor, optionally rendering and dumping frames.

    Args:
        random_seed: Only used to build the checkpoint file name
            (``TD3_torch_<env_name>_<random_seed>``); no RNG is seeded here.
            NOTE: train() in this notebook saves under the module-level
            ``random_seed`` (222), so pass that value to evaluate those weights.
        n_episodes: Number of evaluation episodes to run.
        max_timesteps: Maximum number of steps per episode.
        render: If True, call ``env.render()`` after every step.
        save_gif: If True (and ``render`` is True), also save each frame as
            ``./gif/td3_torch/<episode>/<step>.jpg``.

    Side effects:
        Prints one ``Episode/Reward`` line per episode, may write JPEG frames
        to disk, and closes the environment when finished, even on error.

    Returns:
        None.
    """
    filename = "TD3_torch_{}_{}".format(env_name, random_seed)
    directory = "./preTrained/td3_torch/{}".format(env_name)

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # TD3 is constructed from layer configs, as in train(). These are kept
    # identical to train()'s, presumably required for load_actor to restore
    # the saved weights. Verify against TD3.load_actor.
    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation': 'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim), 'dropout': False, 'activation': False}
    ]
    critic_config = [
        {'dim': (state_dim + action_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': False, 'activation': 'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 1), 'dropout': False, 'activation': False}
    ]

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)
    policy.load_actor(directory, filename)

    try:
        for ep in range(1, n_episodes + 1):
            ep_reward = 0
            state = env.reset()

            # Create the (nested) frame directory once per episode; makedirs
            # also creates the missing ./gif/td3_torch parents.
            frame_dir = './gif/td3_torch/{}'.format(ep)
            if render and save_gif and not os.path.isdir(frame_dir):
                os.makedirs(frame_dir)

            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        frame = Image.fromarray(env.render(mode='rgb_array'))
                        frame.save('{}/{}.jpg'.format(frame_dir, t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close once, after all episodes (or on error), not inside the loop.
        env.close()
                
# Run the evaluation episodes. Requires the first cell to have been executed,
# since test() relies on its imports (gym, os, Image, TD3) and on env_name.
test()
    
    