In [1]:
import os
import torch
import gym
import numpy as np
from TD3_torch.TD3 import TD3
from PIL import Image
from TD3_torch.utils import ReplayBuffer

# ---------------------------------------------------------------------------
# Training configuration. These module-level names are read by the cells
# below; edit them here rather than inside the functions.
# ---------------------------------------------------------------------------
env_name = 'BipedalWalkerHardcore-v2'
learning_rate_base = 0.0001  # starting learning rate; train() lowers it as the average reward rises
log_interval = 10           # print avg reward after interval
random_seed = 123           # seeds env/torch/numpy in train(); also embedded in the checkpoint filename
gamma = 0.99                # discount for future rewards
batch_size = 1024        # num of transitions sampled from replay buffer
exploration_noise_base = 0.3  # starting std of the Gaussian noise added to actions; lowered by train()
polyak_int = [0.9999, 0.999999]              # target policy update parameter (1-tau); train() currently uses only the first value
policy_noise = 0.2          # target policy smoothing noise
noise_clip = 0.5            # forwarded to policy.update; presumably the clip bound for the smoothing noise (TODO confirm in TD3.update)
policy_delay = 2            # delayed policy updates parameter
max_episodes = 100000         # max num of episodes
max_timesteps = 3000        # max timesteps in one episode
max_buffer_length = 2000000  # passed to ReplayBuffer(max_length=...)
directory = "./preTrained/td3_torch/{}".format(env_name) # save trained models
filename = "TD3_torch_{}_{}".format(env_name, random_seed)  # checkpoint base name handed to policy.load / policy.save
# Per-episode total rewards appended by train(). Lives at module level, so it
# keeps growing if train() is called again in the same kernel.
reward_history = []



def _run_training_episode(env, policy, replay_buffer, exploration_noise, polyak):
    """Play one exploratory episode, store its transitions, then update the policy.

    Args:
        env: Gym environment to interact with.
        policy: TD3 agent providing ``select_action`` and ``update``.
        replay_buffer: Buffer that receives every transition via ``add``.
        exploration_noise: Std of the zero-mean Gaussian noise added to actions.
        polyak: Polyak coefficient forwarded to ``policy.update``.

    Returns:
        The undiscounted sum of rewards collected during the episode.
    """
    action_dim = env.action_space.shape[0]
    ep_reward = 0
    state = env.reset()

    for t in range(max_timesteps):
        # select action and add exploration noise, then clip back into the action bounds:
        action = policy.select_action(state)
        action = action + np.random.normal(0, exploration_noise, size=action_dim)
        action = action.clip(env.action_space.low, env.action_space.high)

        # take action in env:
        next_state, reward, done, _ = env.step(action)
        # NOTE(review): `done` is stored as-is, so an episode cut off by a time
        # limit is recorded as terminal - confirm that is intended.
        replay_buffer.add((state, action, reward, next_state, float(done)))
        state = next_state

        ep_reward += reward

        # The policy is updated once per episode, when it ends; `t` is passed
        # as the second argument of policy.update.
        if done or t == (max_timesteps - 1):
            policy.update(replay_buffer, t, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay)
            break

    return ep_reward


def train():
    """Train a TD3 agent on ``env_name`` using the module-level configuration.

    Resumes from the checkpoint ``directory``/``filename`` via ``policy.load``,
    then runs up to ``max_episodes`` episodes. After each episode the reward is
    appended to ``reward_history`` and to ``log.txt``, and the learning rate and
    exploration noise are scaled down linearly as the 100-episode average reward
    approaches ``env.spec.reward_threshold`` (down to 10% of their base values).
    Training stops early, saving a ``*_solved`` checkpoint, once that average
    reaches the threshold. From episode 501 on, the regular checkpoint is
    overwritten after every episode.

    The log file and the environment are released even if training is
    interrupted (e.g. by a KeyboardInterrupt from the notebook).
    """
    env = gym.make(env_name)

    # Seed *before* the networks are built so that anything the TD3 constructor
    # draws from torch's RNG is covered by the seed as well.
    # A falsy seed (0 / None) means "do not seed".
    if random_seed:
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    reward_threshold = env.spec.reward_threshold
    polyak = polyak_int[0]  # held constant; polyak_int[1] is currently unused
    exploration_noise = exploration_noise_base

    # Layer specs consumed by TD3: (in, out) sizes, dropout flag, activation name.
    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 256), 'dropout': True, 'activation':'relu'},
        {'dim': (256, 128), 'dropout': True, 'activation': 'relu'},
        {'dim': (128, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim),'dropout': False, 'activation': False}
    ]

    critic_config = [
        {'dim': (state_dim + action_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': False , 'activation':'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 1), 'dropout': False, 'activation': False}
    ]

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)
    replay_buffer = ReplayBuffer(max_length=max_buffer_length)

    print("action_space={}".format(env.action_space))
    print("obs_space={}".format(env.observation_space))
    print("threshold={} \n".format(reward_threshold))

    if random_seed:
        print("Random Seed: {}".format(random_seed))

    # loading models
    policy.load(directory, filename)

    # Make sure the checkpoint directory exists before the first policy.save.
    os.makedirs(directory, exist_ok=True)

    try:
        # The context manager closes the log on every exit path, not only on "solved".
        with open("log.txt", "w+") as log_f:
            # training procedure:
            for episode in range(1, max_episodes + 1):
                ep_reward = _run_training_episode(
                    env, policy, replay_buffer, exploration_noise, polyak)

                reward_history.append(ep_reward)
                avg_reward = np.mean(reward_history[-100:])

                # logging updates:
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                # if avg reward reaches the env's reward threshold then save and stop training:
                if avg_reward >= reward_threshold:
                    print("########## Solved! ###########")
                    name = filename + '_solved'
                    policy.save(directory, name)
                    break

                # `part` is the remaining fraction of the way to the threshold,
                # measured from a baseline of -150 and capped at 1.
                part = (reward_threshold - avg_reward) / (reward_threshold + 150)
                part = min(part, 1)

                # Calculate LR: base value at part == 1, 10% of base at part == 0.
                # NOTE(review): if set_optimizers rebuilds the optimizers, their
                # internal state is reset every episode - confirm in TD3.
                learning_rate = learning_rate_base - learning_rate_base * (1 - part) * 0.9
                policy.set_optimizers(lr=learning_rate)

                # Calculate Exploration Noise with the same schedule.
                exploration_noise = exploration_noise_base - exploration_noise_base * (1 - part) * 0.9

                if episode > 500:
                    policy.save(directory, filename)

                # print avg reward every log interval:
                if episode % log_interval == 0:
                    print("Ep:{}   Rew:{:3.2f}  Avg Rew:{:3.2f}  LR:{:8.8f}   Polyak:{:5.5f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}".format(
                        episode, ep_reward, avg_reward, learning_rate, polyak, replay_buffer.get_fill(), exploration_noise, policy.actor_loss, policy.loss_Q1, policy.loss_Q2))
    finally:
        env.close()

# Start training. This runs until the reward threshold is reached or
# max_episodes is exhausted; the checkpoint is rewritten after every episode
# once episode 500 has passed.
train()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=256, out_features=128, bias=True)
  (6): Dropout(p=0.2)
  (7): ReLU()
  (8): Linear(in_features=128, out_features=64, bias=True)
  (9): ReLU()
  (10): Linear(in_features=64, out_features=4, bias=True)
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=256, out_features=128, bias=True)
  (6): Dropout(p=0.2)
  (7): ReLU()
  (8): Linear(in_features=128, out_features=64, bias=True)
  (9): ReLU()
  (10): Line

Ep:520   Rew:-107.00  Avg Rew:-114.68  LR:0.00009294   Polyak:0.99990  Bf: 4  EN:0.2788  Loss: 0.176 4.608 4.119
Ep:530   Rew:-118.60  Avg Rew:-114.95  LR:0.00009299   Polyak:0.99990  Bf: 4  EN:0.2790  Loss: -0.764 16.864 19.738
Ep:540   Rew:-112.03  Avg Rew:-114.53  LR:0.00009291   Polyak:0.99990  Bf: 4  EN:0.2787  Loss: -0.699 9.118 8.386
Ep:550   Rew:-112.98  Avg Rew:-114.19  LR:0.00009284   Polyak:0.99990  Bf: 4  EN:0.2785  Loss: -0.541 11.146 15.216
Ep:560   Rew:-112.75  Avg Rew:-114.46  LR:0.00009289   Polyak:0.99990  Bf: 4  EN:0.2787  Loss: -0.699 2.923 3.456
Ep:570   Rew:-113.02  Avg Rew:-114.22  LR:0.00009284   Polyak:0.99990  Bf: 4  EN:0.2785  Loss: 0.122 5.138 4.763
Ep:580   Rew:-114.60  Avg Rew:-114.61  LR:0.00009292   Polyak:0.99990  Bf: 4  EN:0.2788  Loss: 0.268 7.745 3.808
Ep:590   Rew:-117.33  Avg Rew:-114.71  LR:0.00009294   Polyak:0.99990  Bf: 4  EN:0.2788  Loss: 0.323 9.312 6.427
Ep:600   Rew:-113.22  Avg Rew:-114.61  LR:0.00009292   Polyak:0.99990  Bf: 4  EN:0.2788 

Ep:1250   Rew:-106.59  Avg Rew:-107.40  LR:0.00009148   Polyak:0.99990  Bf: 7  EN:0.2744  Loss: 4.002 2.446 3.100
Ep:1260   Rew:-107.95  Avg Rew:-107.23  LR:0.00009145   Polyak:0.99990  Bf: 7  EN:0.2743  Loss: 4.257 7.205 7.886
Ep:1270   Rew:-105.67  Avg Rew:-107.49  LR:0.00009150   Polyak:0.99990  Bf: 7  EN:0.2745  Loss: 4.288 4.886 5.490
Ep:1280   Rew:-104.94  Avg Rew:-107.57  LR:0.00009151   Polyak:0.99990  Bf: 7  EN:0.2745  Loss: 3.705 2.528 3.045
Ep:1290   Rew:-104.15  Avg Rew:-107.54  LR:0.00009151   Polyak:0.99990  Bf: 7  EN:0.2745  Loss: 4.200 4.599 5.504
Ep:1300   Rew:-106.16  Avg Rew:-107.62  LR:0.00009152   Polyak:0.99990  Bf: 8  EN:0.2746  Loss: 4.511 5.570 6.719
Ep:1310   Rew:-105.81  Avg Rew:-107.55  LR:0.00009151   Polyak:0.99990  Bf: 8  EN:0.2745  Loss: 3.563 12.756 5.500
Ep:1320   Rew:-104.12  Avg Rew:-107.38  LR:0.00009148   Polyak:0.99990  Bf: 8  EN:0.2744  Loss: 4.965 4.991 5.488
Ep:1330   Rew:-107.80  Avg Rew:-107.46  LR:0.00009149   Polyak:0.99990  Bf: 8  EN:0.274

Ep:1970   Rew:-116.95  Avg Rew:-112.58  LR:0.00009252   Polyak:0.99990  Bf:10  EN:0.2775  Loss: 7.119 2.351 2.472
Ep:1980   Rew:-119.68  Avg Rew:-112.74  LR:0.00009255   Polyak:0.99990  Bf:10  EN:0.2776  Loss: 8.336 4.245 4.295
Ep:1990   Rew:-117.73  Avg Rew:-112.36  LR:0.00009247   Polyak:0.99990  Bf:10  EN:0.2774  Loss: 8.168 3.060 4.002
Ep:2000   Rew:-112.57  Avg Rew:-111.98  LR:0.00009240   Polyak:0.99990  Bf:11  EN:0.2772  Loss: 8.202 3.790 3.239
Ep:2010   Rew:-106.57  Avg Rew:-111.91  LR:0.00009238   Polyak:0.99990  Bf:11  EN:0.2771  Loss: 9.620 4.922 5.720
Ep:2020   Rew:-107.21  Avg Rew:-111.87  LR:0.00009237   Polyak:0.99990  Bf:11  EN:0.2771  Loss: 9.690 9.340 9.233
Ep:2030   Rew:-105.89  Avg Rew:-112.23  LR:0.00009245   Polyak:0.99990  Bf:11  EN:0.2773  Loss: 9.496 3.939 3.647
Ep:2040   Rew:-118.67  Avg Rew:-112.36  LR:0.00009247   Polyak:0.99990  Bf:11  EN:0.2774  Loss: 8.647 3.235 3.462
Ep:2050   Rew:-117.54  Avg Rew:-112.80  LR:0.00009256   Polyak:0.99990  Bf:11  EN:0.2777

Ep:2690   Rew:-121.60  Avg Rew:-118.31  LR:0.00009366   Polyak:0.99990  Bf:14  EN:0.2810  Loss: 12.109 2.007 1.769
Ep:2700   Rew:-120.59  Avg Rew:-118.63  LR:0.00009373   Polyak:0.99990  Bf:14  EN:0.2812  Loss: 14.330 2.953 2.777
Ep:2710   Rew:-126.39  Avg Rew:-119.41  LR:0.00009388   Polyak:0.99990  Bf:14  EN:0.2816  Loss: 13.824 6.412 6.296
Ep:2720   Rew:-107.86  Avg Rew:-120.00  LR:0.00009400   Polyak:0.99990  Bf:14  EN:0.2820  Loss: 12.386 2.404 2.626
Ep:2730   Rew:-118.93  Avg Rew:-121.05  LR:0.00009421   Polyak:0.99990  Bf:14  EN:0.2826  Loss: 12.896 2.484 2.401
Ep:2740   Rew:-128.27  Avg Rew:-121.15  LR:0.00009423   Polyak:0.99990  Bf:14  EN:0.2827  Loss: 12.337 1.760 1.854
Ep:2750   Rew:-106.90  Avg Rew:-121.42  LR:0.00009428   Polyak:0.99990  Bf:14  EN:0.2829  Loss: 12.913 3.228 3.065
Ep:2760   Rew:-137.93  Avg Rew:-121.73  LR:0.00009435   Polyak:0.99990  Bf:14  EN:0.2830  Loss: 11.987 2.312 2.160
Ep:2770   Rew:-135.48  Avg Rew:-122.51  LR:0.00009450   Polyak:0.99990  Bf:14  E

KeyboardInterrupt: 

In [None]:
def test():
    """Roll out the saved actor for a few episodes, rendering and optionally saving frames.

    Builds a TD3 agent with the same constructor arguments used for training,
    loads the actor weights from ``./preTrained/td3_torch/<env_name>/`` and plays
    ``n_episodes`` episodes without exploration noise. When ``save_gif`` is set,
    every rendered frame is written to ``./gif/td3_torch/<episode>/<t>.jpg``.
    The total reward of each episode is printed.
    """
    n_episodes = 3
    max_timesteps = 2000
    render = True
    save_gif = True

    # Use the module-level random_seed so the name matches the checkpoint that
    # training writes ("TD3_torch_<env>_<seed>").
    filename = "TD3_torch_{}_{}".format(env_name, random_seed)
    directory = "./preTrained/td3_torch/{}".format(env_name)

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # TD3 is constructed from layer configs (see the training cell); these must
    # describe the same architecture as the checkpoint being loaded.
    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 256), 'dropout': True, 'activation': 'relu'},
        {'dim': (256, 128), 'dropout': True, 'activation': 'relu'},
        {'dim': (128, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim), 'dropout': False, 'activation': False}
    ]

    critic_config = [
        {'dim': (state_dim + action_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': False, 'activation': 'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 1), 'dropout': False, 'activation': False}
    ]

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)

    policy.load_actor(directory, filename)

    try:
        for ep in range(1, n_episodes + 1):
            ep_reward = 0
            state = env.reset()

            # Create the per-episode frame directory once, including parents.
            dirname = './gif/td3_torch/{}'.format(ep)
            if render and save_gif:
                os.makedirs(dirname, exist_ok=True)

            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        img = env.render(mode = 'rgb_array')
                        img = Image.fromarray(img)
                        img.save('./gif/td3_torch/{}/{}.jpg'.format(ep,t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close once, after all episodes, and also if a rollout raises.
        env.close()
                
# Evaluate the saved actor; opens a render window and writes frames under ./gif/td3_torch/.
test()
    
    