In [1]:
import os
import torch
import gym
import numpy as np
from TD3_torch.TD3 import TD3
from PIL import Image
from TD3_torch.utils import ReplayBuffer

# ---------------------------------------------------------------------------
# Run configuration. These module-level names are read directly by train()
# (and partly by test()); edit them here rather than inside the functions.
# ---------------------------------------------------------------------------
env_name = 'BipedalWalkerHardcore-v2'  # Gym environment id passed to gym.make()
learning_rate_base = 0.001  # initial learning rate handed to TD3; train() decays it per episode
lr_decay = 0.00005  # train(): lr = learning_rate_base / (1 + episode * lr_decay), floored at 1e-5
log_interval = 5           # train() prints a progress line every `log_interval` episodes
random_seed = 234  # seed applied to the env, torch and numpy; a falsy value skips seeding
gamma = 0.99                # discount for future rewards
batch_size = 1024        # num of transitions sampled from replay buffer
exploration_noise_base = 0.3  # initial std-dev of the Gaussian noise train() adds to each action
noise_decay = 0.0001  # train(): noise = exploration_noise_base / (1 + episode * noise_decay)
polyak_int = [0.9999, 0.999999]              # target policy update parameter (1-tau); train() currently uses only element 0
policy_noise = 0.2          # target policy smoothing noise
noise_clip = 0.5  # passed to policy.update() together with policy_noise
policy_delay = 2            # delayed policy updates parameter
max_episodes = 100000         # max num of episodes
max_timesteps = 3000        # max timesteps in one training episode (test() uses its own local limit)
max_buffer_length = 5000000  # passed to ReplayBuffer(max_length=...)
directory = "./preTrained/td3_torch/{}".format(env_name) # checkpoint directory: models are loaded from and saved to it
filename = "TD3_torch_{}_{}".format(env_name, random_seed)  # checkpoint base name given to policy.load()/policy.save()
reward_history = []  # per-episode rewards appended by train(); trimmed there to the most recent 1000





In [2]:
def test():
    """Roll out the saved TD3 policy for a few rendered episodes.

    Builds the environment named by the module-level ``env_name``, constructs a
    TD3 agent with the same actor/critic layout used by ``train()``, loads the
    checkpoint ``directory/filename`` and runs ``n_episodes`` episodes without
    exploration noise. The total reward of each episode is printed.

    When ``render`` and ``save_gif`` are both enabled, every rendered frame is
    written to ``./gif/td3_torch/<episode>/<timestep>.jpg``; the directory
    (including missing parents) is created once per episode.

    The environment is always closed when the function exits, even if an
    episode raises.
    """
    n_episodes = 3
    max_timesteps = 2000
    render = True
    save_gif = True
    random_seed = 234  # local value; intentionally shadows the module-level seed

    filename = "TD3_torch_{}_{}".format(env_name, random_seed)
    directory = "./preTrained/td3_torch/{}".format(env_name)

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Network layouts must match the ones used when the checkpoint was trained.
    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 256), 'dropout': True, 'activation':'relu'},
        {'dim': (256, 128), 'dropout': False, 'activation': 'relu'},
        {'dim': (128, action_dim),'dropout': False, 'activation': 'tanh'}
    ]

    critic_config = [
        {'dim': (state_dim + action_dim, 512), 'dropout': False, 'activation': 'relu'},
        {'dim': (512, 512), 'dropout': False , 'activation':'relu'},
        {'dim': (512, 128), 'dropout': False, 'activation': 'relu'},
        {'dim': (128, 1), 'dropout': False, 'activation': False},
    ]

    # Seed before the networks are constructed so the run is repeatable.
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)
    policy.load(directory, filename)
    # NOTE(review): the actor contains a Dropout layer; whether TD3.select_action
    # switches it to eval mode is not visible from this notebook -- confirm.

    try:
        for ep in range(1, n_episodes+1):
            ep_reward = 0
            state = env.reset()

            # Create the per-episode frame directory once, not on every step.
            # makedirs also creates the missing './gif/td3_torch' parents,
            # which a plain os.mkdir would fail on.
            if render and save_gif:
                os.makedirs('./gif/td3_torch/{}'.format(ep), exist_ok=True)

            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        img = env.render(mode = 'rgb_array')
                        img = Image.fromarray(img)
                        img.save('./gif/td3_torch/{}/{}.jpg'.format(ep,t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close once, after all episodes; the env is still needed in between.
        env.close()

In [3]:

def train():
    """Train a TD3 agent on ``env_name``, starting from the saved checkpoint.

    All hyper-parameters come from the module-level configuration. For each
    episode the function:

    1. rolls out the current policy with Gaussian exploration noise (clipped
       to the action bounds) and stores every transition in the replay buffer;
    2. calls ``policy.update`` once when the episode ends, passing the index
       of the final timestep (presumably used as the number of update
       iterations -- confirm in ``TD3.update``);
    3. appends ``episode,reward`` to ``log.txt``;
    4. decays the learning rate and the exploration noise;
    5. saves a checkpoint once ``episode > 500`` and prints a progress line
       every ``log_interval`` episodes;
    6. calls ``test()`` whenever the running average reward exceeds 250.

    Training stops early, saving a ``<filename>_solved`` checkpoint, once the
    average reward over the last 100 episodes reaches
    ``env.spec.reward_threshold``.

    Side effects: mutates the module-level ``reward_history`` list, overwrites
    ``log.txt`` and writes checkpoints under ``directory``. The log file and
    the environment are closed on every exit path, including exceptions.
    """
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    # polyak is held constant at the lower bound; polyak_int[1] is unused
    # (an earlier reward-based interpolation towards it has been disabled).
    polyak = polyak_int[0]
    exploration_noise = exploration_noise_base

    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 256), 'dropout': True, 'activation':'relu'},
        {'dim': (256, 128), 'dropout': False, 'activation': 'relu'},
        {'dim': (128, action_dim),'dropout': False, 'activation': 'tanh'}
    ]

    critic_config = [
        {'dim': (state_dim + action_dim, 512), 'dropout': False, 'activation': 'relu'},
        {'dim': (512, 512), 'dropout': False , 'activation':'relu'},
        {'dim': (512, 128), 'dropout': False, 'activation': 'relu'},
        {'dim': (128, 1), 'dropout': False, 'activation': False},
    ]

    # Seed every RNG *before* the networks are built so that weight
    # initialisation is covered by the seed as well (same order as test()).
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    policy = TD3(actor_config, critic_config, max_action, lr=learning_rate_base)
    replay_buffer = ReplayBuffer(max_length=max_buffer_length)

    print("action_space={}".format(env.action_space))
    print("obs_space={}".format(env.observation_space))
    print("threshold={} \n".format(env.spec.reward_threshold))

    # loading models
    # NOTE(review): called unconditionally, so a checkpoint presumably has to
    # exist already at directory/filename -- confirm how TD3.load handles a
    # fresh run.
    policy.load(directory, filename)

    try:
        # "w+" truncates any previous log; the context manager guarantees the
        # handle is closed however the loop exits.
        with open("log.txt", "w+") as log_f:
            # training procedure:
            for episode in range(1, max_episodes+1):
                ep_reward = 0
                state = env.reset()

                for t in range(max_timesteps):
                    # select action and add exploration noise:
                    action = policy.select_action(state)
                    action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
                    action = action.clip(env.action_space.low, env.action_space.high)

                    # take action in env:
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add((state, action, reward, next_state, float(done)))
                    state = next_state

                    ep_reward += reward

                    # if episode is done then update policy:
                    if done or t==(max_timesteps-1):
                        policy.update(replay_buffer, t, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay)
                        break

                reward_history.append(ep_reward)
                avg_reward = np.mean(reward_history[-100:])

                # logging updates:
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                # if the 100-episode average reaches the environment's reward
                # threshold then save and stop training:
                if avg_reward >= env.spec.reward_threshold:
                    print("########## Solved! ###########")
                    name = filename + '_solved'
                    policy.save(directory, name)
                    break

                # Calculate LR: inverse-time decay with a floor of 1e-5.
                learning_rate = max(learning_rate_base / (1.0 + episode * lr_decay), 0.00001)
                policy.set_optimizers(lr=learning_rate)

                # Calculate Exploration Noise: inverse-time decay, never negative.
                exploration_noise = max(exploration_noise_base / (1.0 + episode * noise_decay), 0)

                avg_actor_loss = np.mean(policy.actor_loss_list[-100:])
                avg_Q1_loss = np.mean(policy.Q1_loss_list[-100:])
                avg_Q2_loss = np.mean(policy.Q2_loss_list[-100:])

                # Drop at most one (the oldest) entry per episode from each
                # history list once it holds more than 1000 items.
                if len(policy.actor_loss_list) > 1000:
                    policy.actor_loss_list.pop(0)
                if len(policy.Q1_loss_list) > 1000:
                    policy.Q1_loss_list.pop(0)
                if len(policy.Q2_loss_list) > 1000:
                    policy.Q2_loss_list.pop(0)
                if len(reward_history) > 1000:
                    reward_history.pop(0)

                # Checkpoint every episode once past the first 500.
                if episode > 500:
                    policy.save(directory, filename)

                # print avg reward every log interval:
                if episode % log_interval == 0:
                    print("Ep:{}   Rew:{:3.2f}  Avg Rew:{:3.2f}  LR:{:8.8f}   Polyak:{:5.5f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}".format(
                        episode, ep_reward, avg_reward, learning_rate, polyak, replay_buffer.get_fill(), exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss))

                # Visual sanity check of the saved checkpoint once the agent
                # is performing well (test() loads directory/filename itself).
                if avg_reward > 250:
                    test()
    finally:
        env.close()


In [None]:
# Start training; a progress line is printed every `log_interval` episodes
# and each episode's reward is appended to log.txt.
train()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=256, out_features=128, bias=True)
  (6): ReLU()
  (7): Linear(in_features=128, out_features=4, bias=True)
  (8): Tanh()
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=256, out_features=128, bias=True)
  (6): ReLU()
  (7): Linear(in_features=128, out_features=4, bias=True)
  (8): Tanh()
)
CRITIC=Sequential(
  (0): Linear(in_features=28, out_features=512, bias=True)
  (1): ReLU()
  (2): Linear(in_featu

Ep:275   Rew:-58.90  Avg Rew:-61.83  LR:0.00098644   Polyak:0.99990  Bf: 9  EN:0.2920  Loss: 5.251 0.431 0.433
Ep:280   Rew:-47.81  Avg Rew:-61.31  LR:0.00098619   Polyak:0.99990  Bf: 9  EN:0.2918  Loss: 5.215 0.470 0.481
Ep:285   Rew:-60.99  Avg Rew:-60.32  LR:0.00098595   Polyak:0.99990  Bf: 9  EN:0.2917  Loss: 5.208 0.383 0.408
Ep:290   Rew:-44.39  Avg Rew:-59.65  LR:0.00098571   Polyak:0.99990  Bf:10  EN:0.2915  Loss: 5.193 0.421 0.434
Ep:295   Rew:-65.23  Avg Rew:-58.73  LR:0.00098546   Polyak:0.99990  Bf:10  EN:0.2914  Loss: 5.197 0.416 0.427
Ep:300   Rew:-50.35  Avg Rew:-57.81  LR:0.00098522   Polyak:0.99990  Bf:10  EN:0.2913  Loss: 5.300 0.666 0.692
Ep:305   Rew:-51.84  Avg Rew:-57.64  LR:0.00098498   Polyak:0.99990  Bf:10  EN:0.2911  Loss: 5.266 0.608 0.592
Ep:310   Rew:-57.66  Avg Rew:-56.80  LR:0.00098474   Polyak:0.99990  Bf:10  EN:0.2910  Loss: 5.227 0.566 0.600
Ep:315   Rew:-79.93  Avg Rew:-58.80  LR:0.00098449   Polyak:0.99990  Bf:11  EN:0.2908  Loss: 5.239 0.505 0.519
E

Ep:645   Rew:-68.68  Avg Rew:-66.38  LR:0.00096876   Polyak:0.99990  Bf:23  EN:0.2818  Loss: 4.993 0.444 0.450
Ep:650   Rew:-57.78  Avg Rew:-67.89  LR:0.00096852   Polyak:0.99990  Bf:23  EN:0.2817  Loss: 4.991 0.468 0.482
Ep:655   Rew:-70.14  Avg Rew:-68.94  LR:0.00096829   Polyak:0.99990  Bf:24  EN:0.2816  Loss: 4.974 0.480 0.487
Ep:660   Rew:-59.69  Avg Rew:-69.19  LR:0.00096805   Polyak:0.99990  Bf:24  EN:0.2814  Loss: 4.967 0.452 0.508
Ep:665   Rew:-71.46  Avg Rew:-69.63  LR:0.00096782   Polyak:0.99990  Bf:24  EN:0.2813  Loss: 4.974 0.499 0.664
Ep:670   Rew:-67.18  Avg Rew:-68.94  LR:0.00096759   Polyak:0.99990  Bf:24  EN:0.2812  Loss: 4.950 0.497 0.488
Ep:675   Rew:-75.97  Avg Rew:-69.17  LR:0.00096735   Polyak:0.99990  Bf:24  EN:0.2810  Loss: 4.969 0.469 0.529
Ep:680   Rew:-81.92  Avg Rew:-69.24  LR:0.00096712   Polyak:0.99990  Bf:25  EN:0.2809  Loss: 4.960 0.447 0.504
Ep:685   Rew:-68.83  Avg Rew:-67.56  LR:0.00096688   Polyak:0.99990  Bf:25  EN:0.2808  Loss: 4.951 0.435 0.434
E

Ep:1015   Rew:-86.86  Avg Rew:-74.29  LR:0.00095170   Polyak:0.99990  Bf:38  EN:0.2724  Loss: 4.766 0.441 0.487
Ep:1020   Rew:-74.04  Avg Rew:-74.37  LR:0.00095147   Polyak:0.99990  Bf:38  EN:0.2722  Loss: 4.732 0.363 0.394
Ep:1025   Rew:-72.38  Avg Rew:-74.88  LR:0.00095125   Polyak:0.99990  Bf:38  EN:0.2721  Loss: 4.757 0.368 0.406
Ep:1030   Rew:-93.63  Avg Rew:-74.93  LR:0.00095102   Polyak:0.99990  Bf:39  EN:0.2720  Loss: 4.711 0.404 0.412
Ep:1035   Rew:-76.93  Avg Rew:-75.10  LR:0.00095080   Polyak:0.99990  Bf:39  EN:0.2719  Loss: 4.711 0.430 0.448
Ep:1040   Rew:-76.82  Avg Rew:-74.95  LR:0.00095057   Polyak:0.99990  Bf:39  EN:0.2717  Loss: 4.694 0.426 0.383
Ep:1045   Rew:-69.42  Avg Rew:-75.59  LR:0.00095034   Polyak:0.99990  Bf:39  EN:0.2716  Loss: 4.750 0.415 0.439
Ep:1050   Rew:-68.68  Avg Rew:-75.29  LR:0.00095012   Polyak:0.99990  Bf:39  EN:0.2715  Loss: 4.717 0.459 0.475
Ep:1055   Rew:-72.49  Avg Rew:-74.93  LR:0.00094989   Polyak:0.99990  Bf:40  EN:0.2714  Loss: 4.728 0.42

Ep:1385   Rew:-62.22  Avg Rew:-55.13  LR:0.00093523   Polyak:0.99990  Bf:53  EN:0.2635  Loss: 4.320 0.255 0.257
Ep:1390   Rew:-48.46  Avg Rew:-54.34  LR:0.00093502   Polyak:0.99990  Bf:53  EN:0.2634  Loss: 4.322 0.249 0.281
Ep:1395   Rew:-53.20  Avg Rew:-53.86  LR:0.00093480   Polyak:0.99990  Bf:53  EN:0.2633  Loss: 4.296 0.231 0.227
Ep:1400   Rew:-57.10  Avg Rew:-53.86  LR:0.00093458   Polyak:0.99990  Bf:53  EN:0.2632  Loss: 4.290 0.207 0.219
Ep:1405   Rew:-51.19  Avg Rew:-52.67  LR:0.00093436   Polyak:0.99990  Bf:54  EN:0.2630  Loss: 4.286 0.236 0.254
Ep:1410   Rew:-22.56  Avg Rew:-52.08  LR:0.00093414   Polyak:0.99990  Bf:54  EN:0.2629  Loss: 4.293 0.254 0.266
Ep:1415   Rew:-55.01  Avg Rew:-52.03  LR:0.00093392   Polyak:0.99990  Bf:54  EN:0.2628  Loss: 4.281 0.203 0.213
Ep:1420   Rew:-73.52  Avg Rew:-51.64  LR:0.00093371   Polyak:0.99990  Bf:54  EN:0.2627  Loss: 4.269 0.285 0.284
Ep:1425   Rew:-45.55  Avg Rew:-51.48  LR:0.00093349   Polyak:0.99990  Bf:54  EN:0.2626  Loss: 4.264 0.20

Ep:1755   Rew:-12.86  Avg Rew:-48.98  LR:0.00091933   Polyak:0.99990  Bf:67  EN:0.2552  Loss: 4.105 0.285 0.270
Ep:1760   Rew:-43.37  Avg Rew:-48.53  LR:0.00091912   Polyak:0.99990  Bf:67  EN:0.2551  Loss: 4.079 0.230 0.265
Ep:1765   Rew:-23.62  Avg Rew:-47.26  LR:0.00091891   Polyak:0.99990  Bf:67  EN:0.2550  Loss: 4.065 0.285 0.316
Ep:1770   Rew:-42.97  Avg Rew:-47.28  LR:0.00091870   Polyak:0.99990  Bf:68  EN:0.2549  Loss: 4.100 0.294 0.275
Ep:1775   Rew:-39.32  Avg Rew:-46.98  LR:0.00091848   Polyak:0.99990  Bf:68  EN:0.2548  Loss: 4.092 0.266 0.274
Ep:1780   Rew:-54.71  Avg Rew:-47.28  LR:0.00091827   Polyak:0.99990  Bf:68  EN:0.2547  Loss: 4.068 0.264 0.252
Ep:1785   Rew:-43.89  Avg Rew:-47.56  LR:0.00091806   Polyak:0.99990  Bf:68  EN:0.2546  Loss: 4.071 0.242 0.255
Ep:1790   Rew:-34.19  Avg Rew:-47.68  LR:0.00091785   Polyak:0.99990  Bf:68  EN:0.2545  Loss: 4.087 0.374 0.372
Ep:1795   Rew:-46.38  Avg Rew:-47.29  LR:0.00091764   Polyak:0.99990  Bf:69  EN:0.2543  Loss: 4.087 0.27

Ep:2125   Rew:-58.07  Avg Rew:-48.74  LR:0.00090395   Polyak:0.99990  Bf:82  EN:0.2474  Loss: 3.895 0.360 0.377
Ep:2130   Rew:-49.37  Avg Rew:-48.23  LR:0.00090375   Polyak:0.99990  Bf:82  EN:0.2473  Loss: 3.904 0.390 0.416
Ep:2135   Rew:-47.03  Avg Rew:-48.64  LR:0.00090355   Polyak:0.99990  Bf:82  EN:0.2472  Loss: 3.905 0.340 0.330
Ep:2140   Rew:-46.07  Avg Rew:-49.11  LR:0.00090334   Polyak:0.99990  Bf:82  EN:0.2471  Loss: 3.890 0.342 0.311
Ep:2145   Rew:-72.31  Avg Rew:-49.27  LR:0.00090314   Polyak:0.99990  Bf:83  EN:0.2470  Loss: 3.903 0.320 0.339
Ep:2150   Rew:-46.30  Avg Rew:-49.61  LR:0.00090293   Polyak:0.99990  Bf:83  EN:0.2469  Loss: 3.865 0.284 0.323
Ep:2155   Rew:-62.77  Avg Rew:-49.64  LR:0.00090273   Polyak:0.99990  Bf:83  EN:0.2468  Loss: 3.886 0.379 0.368
Ep:2160   Rew:-49.83  Avg Rew:-49.74  LR:0.00090253   Polyak:0.99990  Bf:83  EN:0.2467  Loss: 3.876 0.421 0.415
Ep:2165   Rew:-52.38  Avg Rew:-49.89  LR:0.00090232   Polyak:0.99990  Bf:83  EN:0.2466  Loss: 3.892 0.36

Ep:2495   Rew:-64.12  Avg Rew:-56.72  LR:0.00088909   Polyak:0.99990  Bf:96  EN:0.2401  Loss: 3.797 0.518 0.485
Ep:2500   Rew:-62.37  Avg Rew:-57.01  LR:0.00088889   Polyak:0.99990  Bf:97  EN:0.2400  Loss: 3.824 0.570 0.581
Ep:2505   Rew:-59.52  Avg Rew:-57.05  LR:0.00088869   Polyak:0.99990  Bf:97  EN:0.2399  Loss: 3.817 0.488 0.521
Ep:2510   Rew:-49.33  Avg Rew:-57.11  LR:0.00088849   Polyak:0.99990  Bf:97  EN:0.2398  Loss: 3.819 0.485 0.508
Ep:2515   Rew:-59.99  Avg Rew:-57.48  LR:0.00088830   Polyak:0.99990  Bf:97  EN:0.2397  Loss: 3.809 0.484 0.478
Ep:2520   Rew:-76.65  Avg Rew:-57.69  LR:0.00088810   Polyak:0.99990  Bf:97  EN:0.2396  Loss: 3.782 0.587 0.520
Ep:2525   Rew:-56.76  Avg Rew:-57.59  LR:0.00088790   Polyak:0.99990  Bf:98  EN:0.2395  Loss: 3.835 0.466 0.484
Ep:2530   Rew:-61.92  Avg Rew:-57.80  LR:0.00088771   Polyak:0.99990  Bf:98  EN:0.2394  Loss: 3.792 0.520 0.460
Ep:2535   Rew:-56.97  Avg Rew:-57.90  LR:0.00088751   Polyak:0.99990  Bf:98  EN:0.2393  Loss: 3.793 0.40

Ep:2860   Rew:-50.26  Avg Rew:-55.09  LR:0.00087489   Polyak:0.99990  Bf:100  EN:0.2333  Loss: 3.511 0.195 0.227
Ep:2865   Rew:-47.97  Avg Rew:-55.26  LR:0.00087470   Polyak:0.99990  Bf:100  EN:0.2332  Loss: 3.502 0.158 0.145
Ep:2870   Rew:-55.87  Avg Rew:-54.79  LR:0.00087451   Polyak:0.99990  Bf:100  EN:0.2331  Loss: 3.501 0.190 0.211
Ep:2875   Rew:-49.59  Avg Rew:-54.44  LR:0.00087432   Polyak:0.99990  Bf:100  EN:0.2330  Loss: 3.491 0.157 0.173
Ep:2880   Rew:-48.53  Avg Rew:-53.70  LR:0.00087413   Polyak:0.99990  Bf:100  EN:0.2329  Loss: 3.500 0.156 0.173
Ep:2885   Rew:-53.41  Avg Rew:-53.06  LR:0.00087393   Polyak:0.99990  Bf:100  EN:0.2328  Loss: 3.488 0.183 0.171
Ep:2890   Rew:-47.50  Avg Rew:-52.56  LR:0.00087374   Polyak:0.99990  Bf:100  EN:0.2327  Loss: 3.475 0.159 0.162
Ep:2895   Rew:-71.73  Avg Rew:-52.41  LR:0.00087355   Polyak:0.99990  Bf:100  EN:0.2326  Loss: 3.481 0.172 0.208
Ep:2900   Rew:-48.33  Avg Rew:-51.80  LR:0.00087336   Polyak:0.99990  Bf:100  EN:0.2326  Loss: 3

Ep:3225   Rew:-46.78  Avg Rew:-49.27  LR:0.00086114   Polyak:0.99990  Bf:100  EN:0.2268  Loss: 3.477 0.168 0.157
Ep:3230   Rew:-51.48  Avg Rew:-48.64  LR:0.00086096   Polyak:0.99990  Bf:100  EN:0.2268  Loss: 3.460 0.113 0.109
Ep:3235   Rew:-43.71  Avg Rew:-48.35  LR:0.00086077   Polyak:0.99990  Bf:100  EN:0.2267  Loss: 3.462 0.100 0.117
Ep:3240   Rew:-63.00  Avg Rew:-48.69  LR:0.00086059   Polyak:0.99990  Bf:100  EN:0.2266  Loss: 3.462 0.107 0.098
Ep:3245   Rew:-43.04  Avg Rew:-49.31  LR:0.00086040   Polyak:0.99990  Bf:100  EN:0.2265  Loss: 3.451 0.097 0.121
Ep:3250   Rew:-23.15  Avg Rew:-48.90  LR:0.00086022   Polyak:0.99990  Bf:100  EN:0.2264  Loss: 3.449 0.118 0.105
Ep:3255   Rew:-54.96  Avg Rew:-49.24  LR:0.00086003   Polyak:0.99990  Bf:100  EN:0.2263  Loss: 3.442 0.073 0.068
Ep:3260   Rew:-52.88  Avg Rew:-48.89  LR:0.00085985   Polyak:0.99990  Bf:100  EN:0.2262  Loss: 3.421 0.140 0.140
Ep:3265   Rew:-16.86  Avg Rew:-49.06  LR:0.00085966   Polyak:0.99990  Bf:100  EN:0.2262  Loss: 3

In [None]:
# Replay the saved checkpoint for a few rendered episodes; frames are written
# under ./gif/td3_torch/<episode>/.
test()