In [None]:
import os
import torch
import gym
import numpy as np
from TD3_torch.TD3 import TD3
from PIL import Image
from TD3_torch.utils import ReplayBuffer

# ---------------------------------------------------------------------------
# Run configuration (read as module-level globals by train() and test()).
# ---------------------------------------------------------------------------

# Environment / run identity
env_name = 'BipedalWalkerHardcore-v2'
random_seed = 222

# Run length and logging
max_episodes = 100000          # upper bound on the number of training episodes
max_timesteps = 3000           # upper bound on steps within a single episode
log_interval = 10              # print a progress line every this many episodes

# Replay buffer and sampling
max_buffer_length = 2 * 10**6  # capacity handed to ReplayBuffer(max_length=...)
batch_size = 1024              # transitions sampled from the buffer per update

# Optimisation
learning_rate_base = 1e-4      # starting learning rate; train() scales it down as reward improves
gamma = 0.99                   # discount factor for future rewards

# TD3-specific knobs
# [start, end] range for the target-network averaging coefficient (1 - tau);
# train() currently reads only the first entry.
polyak_int = [0.9999, 0.999999]
policy_delay = 2               # delayed policy update parameter
policy_noise = 0.2             # target policy smoothing noise
noise_clip = 0.5               # clip value passed alongside policy_noise
exploration_noise = 0.3        # std-dev of the Gaussian noise added to selected actions

# Checkpoint location and base name (both embed the values defined above)
directory = "./preTrained/td3_torch/{}".format(env_name)
filename = "TD3_torch_{}_{}".format(env_name, random_seed)

# Per-episode total rewards, appended to by train()
reward_history = list()



def train():
    """Train a TD3 agent on ``env_name`` until the reward threshold is reached.

    All hyperparameters are read from the module-level configuration
    (``env_name``, ``learning_rate_base``, ``batch_size``, ...).

    Side effects:
        * Appends every episode's total reward to the module-level
          ``reward_history`` list.
        * Writes ``episode,reward`` lines to ``log.txt`` in the working
          directory (the file is truncated at start and always closed on exit).
        * After episode 500, saves the policy to ``directory``/``filename``
          at the end of every episode; on success saves once more under
          ``filename + '_solved'``.
        * Prints a progress line every ``log_interval`` episodes.

    Training stops early when the mean reward of the last (up to) 100
    episodes reaches ``env.spec.reward_threshold``; otherwise it runs for
    ``max_episodes`` episodes.
    """
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    reward_threshold = env.spec.reward_threshold
    # Target-update coefficient is held fixed at the first entry of polyak_int;
    # the second entry is not used by this function.
    polyak = polyak_int[0]

    # Seed every RNG *before* the networks are constructed: building the TD3
    # object creates the Linear layers, whose initial weights are drawn from
    # torch's global RNG. Seeding afterwards leaves the initial weights
    # unreproducible.
    if random_seed:
        print("Random Seed: {}".format(random_seed))
        env.seed(random_seed)
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation': 'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim), 'dropout': False, 'activation': False}
    ]

    policy = TD3(actor_config, state_dim, action_dim, max_action, lr=learning_rate_base)
    replay_buffer = ReplayBuffer(max_length=max_buffer_length)

    print("action_space={}".format(env.action_space))
    print("obs_space={}".format(env.observation_space))
    print("threshold={} \n".format(reward_threshold))

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(directory, exist_ok=True)

    try:
        # The context manager closes the log on every exit path (solved,
        # episode budget exhausted, or an exception mid-training).
        with open("log.txt", "w+") as log_f:
            for episode in range(1, max_episodes + 1):
                ep_reward = 0
                state = env.reset()

                for t in range(max_timesteps):
                    # Select an action and perturb it with Gaussian exploration noise.
                    action = policy.select_action(state)
                    action = action + np.random.normal(0, exploration_noise, size=action_dim)
                    action = action.clip(env.action_space.low, env.action_space.high)

                    # Step the environment and store the transition.
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add((state, action, reward, next_state, float(done)))
                    state = next_state

                    ep_reward += reward

                    # Update the policy once per episode, when it ends or
                    # hits the step limit; ``t`` (the index of the last
                    # step) is what gets passed to ``policy.update``.
                    if done or t == (max_timesteps - 1):
                        policy.update(replay_buffer, t, batch_size, gamma, polyak,
                                      policy_noise, noise_clip, policy_delay)
                        break

                reward_history.append(ep_reward)
                avg_reward = np.mean(reward_history[-100:])

                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                # Stop as soon as the running average reaches the env's threshold.
                if avg_reward >= reward_threshold:
                    print("########## Solved! ###########")
                    policy.save(directory, filename + '_solved')
                    break

                # Learning-rate schedule: ``part`` is 1 when the running
                # average is at or below -150 and falls to 0 as it approaches
                # the threshold, so the rate moves linearly from
                # learning_rate_base down to 10% of it.
                part = (reward_threshold - avg_reward) / (reward_threshold + 150)
                if part > 1:
                    part = 1
                learning_rate = learning_rate_base - learning_rate_base * (1 - part) * 0.9
                # NOTE(review): if set_optimizers rebuilds the optimizers, any
                # optimizer state is reset every episode -- confirm in TD3.
                policy.set_optimizers(lr=learning_rate)

                # Checkpoint every episode once past the first 500.
                if episode > 500:
                    policy.save(directory, filename)

                if episode % log_interval == 0:
                    print("Ep: {}   Rew: {:3.2f}   Avg Rew: {:3.2f}   LR: {:8.8f}   Polyak: {:6.6f}   Bf: {:2.0f}   Loss: {:5.3f}  {:5.3f}  {:5.3f}".format(
                        episode, ep_reward, avg_reward, learning_rate, polyak, replay_buffer.get_fill(), policy.actor_loss, policy.loss_Q1, policy.loss_Q2))
    finally:
        env.close()

# Start training. This is a long-running cell: it loops for up to max_episodes
# episodes, writes per-episode rewards to log.txt and prints a progress line
# every log_interval episodes.
train()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=320, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=320, out_features=160, bias=True)
  (6): ReLU()
  (7): Linear(in_features=160, out_features=64, bias=True)
  (8): ReLU()
  (9): Linear(in_features=64, out_features=4, bias=True)
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=320, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=320, out_features=160, bias=True)
  (6): ReLU()
  (7): Linear(in_features=160, out_features=64, bias=True)
  (8): ReLU()
  (9): Linear(in_features=64, out_features=4, bias=True)


Ep: 630   Rew: -105.76   Avg Rew: -108.65   LR: 0.00009173   Polyak: 0.999900   Bf:  6   Loss: 1.822  2.736  2.159
Ep: 640   Rew: -103.47   Avg Rew: -107.20   LR: 0.00009144   Polyak: 0.999900   Bf:  6   Loss: 1.640  2.279  1.663
Ep: 650   Rew: -102.44   Avg Rew: -105.77   LR: 0.00009115   Polyak: 0.999900   Bf:  6   Loss: 1.005  2.655  0.995
Ep: 660   Rew: -104.87   Avg Rew: -107.54   LR: 0.00009151   Polyak: 0.999900   Bf:  6   Loss: 1.712  1.809  1.541
Ep: 670   Rew: -119.51   Avg Rew: -109.03   LR: 0.00009181   Polyak: 0.999900   Bf:  6   Loss: 2.294  2.838  2.542
Ep: 680   Rew: -101.51   Avg Rew: -108.84   LR: 0.00009177   Polyak: 0.999900   Bf:  7   Loss: 2.418  2.038  2.110
Ep: 690   Rew: -102.68   Avg Rew: -108.65   LR: 0.00009173   Polyak: 0.999900   Bf:  7   Loss: 2.219  4.342  2.333
Ep: 700   Rew: -95.90   Avg Rew: -108.60   LR: 0.00009172   Polyak: 0.999900   Bf:  7   Loss: 3.081  4.429  4.574
Ep: 710   Rew: -114.22   Avg Rew: -110.36   LR: 0.00009207   Polyak: 0.999900   B

Ep: 1340   Rew: -129.18   Avg Rew: -118.52   LR: 0.00009370   Polyak: 0.999900   Bf: 10   Loss: 5.329  12.295  11.557
Ep: 1350   Rew: -119.32   Avg Rew: -117.88   LR: 0.00009358   Polyak: 0.999900   Bf: 10   Loss: 5.158  1.930  2.035
Ep: 1360   Rew: -135.11   Avg Rew: -117.68   LR: 0.00009354   Polyak: 0.999900   Bf: 10   Loss: 5.453  1.896  1.993
Ep: 1370   Rew: -142.83   Avg Rew: -117.51   LR: 0.00009350   Polyak: 0.999900   Bf: 10   Loss: 5.873  2.175  2.749
Ep: 1380   Rew: -116.84   Avg Rew: -117.45   LR: 0.00009349   Polyak: 0.999900   Bf: 10   Loss: 4.258  1.944  1.891
Ep: 1390   Rew: -121.38   Avg Rew: -117.03   LR: 0.00009341   Polyak: 0.999900   Bf: 10   Loss: 5.769  3.126  2.849
Ep: 1400   Rew: -104.05   Avg Rew: -116.80   LR: 0.00009336   Polyak: 0.999900   Bf: 10   Loss: 5.687  2.481  2.436
Ep: 1410   Rew: -120.33   Avg Rew: -116.65   LR: 0.00009333   Polyak: 0.999900   Bf: 10   Loss: 5.734  3.087  3.230
Ep: 1420   Rew: -115.84   Avg Rew: -115.97   LR: 0.00009319   Polyak: 

Ep: 2050   Rew: -93.03   Avg Rew: -116.03   LR: 0.00009321   Polyak: 0.999900   Bf: 13   Loss: 8.732  4.364  4.195
Ep: 2060   Rew: -125.11   Avg Rew: -116.30   LR: 0.00009326   Polyak: 0.999900   Bf: 14   Loss: 8.495  2.491  2.580
Ep: 2070   Rew: -115.87   Avg Rew: -115.82   LR: 0.00009316   Polyak: 0.999900   Bf: 14   Loss: 9.133  2.201  1.879
Ep: 2080   Rew: -110.69   Avg Rew: -116.11   LR: 0.00009322   Polyak: 0.999900   Bf: 14   Loss: 10.383  3.012  2.491
Ep: 2090   Rew: -119.58   Avg Rew: -116.69   LR: 0.00009334   Polyak: 0.999900   Bf: 14   Loss: 10.070  2.300  2.108
Ep: 2100   Rew: -92.86   Avg Rew: -116.92   LR: 0.00009338   Polyak: 0.999900   Bf: 14   Loss: 10.578  2.358  2.166
Ep: 2110   Rew: -119.13   Avg Rew: -117.59   LR: 0.00009352   Polyak: 0.999900   Bf: 14   Loss: 10.507  3.231  2.930
Ep: 2120   Rew: -91.20   Avg Rew: -117.60   LR: 0.00009352   Polyak: 0.999900   Bf: 14   Loss: 8.084  2.268  2.374
Ep: 2130   Rew: -111.64   Avg Rew: -117.41   LR: 0.00009348   Polyak: 0

Ep: 2760   Rew: -96.99   Avg Rew: -117.67   LR: 0.00009353   Polyak: 0.999900   Bf: 18   Loss: 13.835  2.245  1.957
Ep: 2770   Rew: -132.36   Avg Rew: -117.56   LR: 0.00009351   Polyak: 0.999900   Bf: 18   Loss: 11.546  2.423  3.228
Ep: 2780   Rew: -93.54   Avg Rew: -117.25   LR: 0.00009345   Polyak: 0.999900   Bf: 18   Loss: 13.256  2.132  2.516
Ep: 2790   Rew: -122.10   Avg Rew: -116.37   LR: 0.00009327   Polyak: 0.999900   Bf: 18   Loss: 11.870  1.800  1.896
Ep: 2800   Rew: -94.54   Avg Rew: -117.61   LR: 0.00009352   Polyak: 0.999900   Bf: 18   Loss: 14.006  1.342  1.876
Ep: 2810   Rew: -107.04   Avg Rew: -116.12   LR: 0.00009322   Polyak: 0.999900   Bf: 18   Loss: 12.583  2.439  2.336
Ep: 2820   Rew: -125.21   Avg Rew: -115.27   LR: 0.00009305   Polyak: 0.999900   Bf: 18   Loss: 14.359  3.165  3.457
Ep: 2830   Rew: -121.43   Avg Rew: -115.81   LR: 0.00009316   Polyak: 0.999900   Bf: 19   Loss: 15.727  3.366  3.107
Ep: 2840   Rew: -96.74   Avg Rew: -115.71   LR: 0.00009314   Polyak

Ep: 3470   Rew: -92.77   Avg Rew: -87.46   LR: 0.00008749   Polyak: 0.999900   Bf: 24   Loss: 14.440  2.387  2.315
Ep: 3480   Rew: -91.50   Avg Rew: -87.42   LR: 0.00008748   Polyak: 0.999900   Bf: 25   Loss: 14.003  3.216  2.537
Ep: 3490   Rew: -95.04   Avg Rew: -88.44   LR: 0.00008769   Polyak: 0.999900   Bf: 25   Loss: 14.404  2.540  2.328
Ep: 3500   Rew: -91.72   Avg Rew: -88.63   LR: 0.00008773   Polyak: 0.999900   Bf: 25   Loss: 14.718  2.020  2.308
Ep: 3510   Rew: -84.79   Avg Rew: -88.33   LR: 0.00008767   Polyak: 0.999900   Bf: 25   Loss: 15.266  2.649  2.526
Ep: 3520   Rew: -86.55   Avg Rew: -88.25   LR: 0.00008765   Polyak: 0.999900   Bf: 25   Loss: 14.678  3.637  3.048
Ep: 3530   Rew: -81.59   Avg Rew: -87.69   LR: 0.00008754   Polyak: 0.999900   Bf: 25   Loss: 15.081  2.878  2.948
Ep: 3540   Rew: -81.77   Avg Rew: -87.06   LR: 0.00008741   Polyak: 0.999900   Bf: 25   Loss: 14.909  3.215  3.501
Ep: 3550   Rew: -64.85   Avg Rew: -86.18   LR: 0.00008724   Polyak: 0.999900   B

Ep: 4190   Rew: -45.23   Avg Rew: -73.68   LR: 0.00008474   Polyak: 0.999900   Bf: 35   Loss: 15.354  1.737  1.904
Ep: 4200   Rew: -36.78   Avg Rew: -74.24   LR: 0.00008485   Polyak: 0.999900   Bf: 36   Loss: 16.141  2.197  2.415
Ep: 4210   Rew: -91.68   Avg Rew: -73.40   LR: 0.00008468   Polyak: 0.999900   Bf: 36   Loss: 17.254  2.422  2.240
Ep: 4220   Rew: -48.37   Avg Rew: -71.78   LR: 0.00008436   Polyak: 0.999900   Bf: 36   Loss: 14.923  2.673  3.248
Ep: 4230   Rew: -52.58   Avg Rew: -72.99   LR: 0.00008460   Polyak: 0.999900   Bf: 36   Loss: 15.780  3.136  2.250
Ep: 4240   Rew: -153.07   Avg Rew: -73.14   LR: 0.00008463   Polyak: 0.999900   Bf: 36   Loss: 16.147  2.216  2.029
Ep: 4250   Rew: -82.54   Avg Rew: -72.22   LR: 0.00008444   Polyak: 0.999900   Bf: 36   Loss: 18.216  2.728  2.433
Ep: 4260   Rew: -76.84   Avg Rew: -74.45   LR: 0.00008489   Polyak: 0.999900   Bf: 37   Loss: 15.433  2.757  2.332
Ep: 4270   Rew: -50.30   Avg Rew: -75.90   LR: 0.00008518   Polyak: 0.999900   

Ep: 4910   Rew: -85.15   Avg Rew: -51.50   LR: 0.00008030   Polyak: 0.999900   Bf: 49   Loss: 13.610  2.138  2.214
Ep: 4920   Rew: -61.55   Avg Rew: -50.23   LR: 0.00008005   Polyak: 0.999900   Bf: 50   Loss: 16.628  4.253  3.945
Ep: 4930   Rew: -69.82   Avg Rew: -50.07   LR: 0.00008001   Polyak: 0.999900   Bf: 50   Loss: 14.602  3.233  2.990
Ep: 4940   Rew: 99.64   Avg Rew: -54.17   LR: 0.00008083   Polyak: 0.999900   Bf: 50   Loss: 15.517  3.021  3.035
Ep: 4950   Rew: -87.69   Avg Rew: -55.50   LR: 0.00008110   Polyak: 0.999900   Bf: 50   Loss: 16.353  3.061  3.504
Ep: 4960   Rew: -39.32   Avg Rew: -59.37   LR: 0.00008187   Polyak: 0.999900   Bf: 50   Loss: 15.294  3.333  2.456
Ep: 4970   Rew: -111.05   Avg Rew: -57.13   LR: 0.00008143   Polyak: 0.999900   Bf: 51   Loss: 12.722  2.922  3.111
Ep: 4980   Rew: -58.61   Avg Rew: -56.63   LR: 0.00008133   Polyak: 0.999900   Bf: 51   Loss: 16.170  5.379  5.502
Ep: 4990   Rew: 1.54   Avg Rew: -57.59   LR: 0.00008152   Polyak: 0.999900   Bf:

Ep: 5630   Rew: 13.75   Avg Rew: -22.50   LR: 0.00007450   Polyak: 0.999900   Bf: 78   Loss: 13.364  2.950  2.746
Ep: 5640   Rew: -9.64   Avg Rew: -19.75   LR: 0.00007395   Polyak: 0.999900   Bf: 78   Loss: 14.204  2.880  2.836
Ep: 5650   Rew: -86.86   Avg Rew: -18.26   LR: 0.00007365   Polyak: 0.999900   Bf: 79   Loss: 15.640  4.672  4.087
Ep: 5660   Rew: 58.22   Avg Rew: -16.00   LR: 0.00007320   Polyak: 0.999900   Bf: 80   Loss: 13.933  4.270  3.796
Ep: 5670   Rew: -77.86   Avg Rew: -22.97   LR: 0.00007459   Polyak: 0.999900   Bf: 80   Loss: 15.002  4.977  4.416
Ep: 5680   Rew: -92.55   Avg Rew: -26.22   LR: 0.00007524   Polyak: 0.999900   Bf: 81   Loss: 13.558  5.153  4.637
Ep: 5690   Rew: 39.19   Avg Rew: -21.72   LR: 0.00007434   Polyak: 0.999900   Bf: 81   Loss: 14.563  3.858  4.599
Ep: 5700   Rew: 16.95   Avg Rew: -24.95   LR: 0.00007499   Polyak: 0.999900   Bf: 82   Loss: 13.672  3.537  3.306
Ep: 5710   Rew: -108.08   Avg Rew: -21.41   LR: 0.00007428   Polyak: 0.999900   Bf: 8

Ep: 6350   Rew: 99.78   Avg Rew: 29.43   LR: 0.00006411   Polyak: 0.999900   Bf: 100   Loss: 7.548  2.112  1.756
Ep: 6360   Rew: 141.56   Avg Rew: 30.87   LR: 0.00006383   Polyak: 0.999900   Bf: 100   Loss: 6.511  1.171  1.342
Ep: 6370   Rew: 0.25   Avg Rew: 28.17   LR: 0.00006437   Polyak: 0.999900   Bf: 100   Loss: 7.851  1.435  1.327
Ep: 6380   Rew: 230.96   Avg Rew: 18.18   LR: 0.00006636   Polyak: 0.999900   Bf: 100   Loss: 6.904  1.972  2.271
Ep: 6390   Rew: -74.91   Avg Rew: 15.85   LR: 0.00006683   Polyak: 0.999900   Bf: 100   Loss: 7.196  1.882  1.577
Ep: 6400   Rew: -41.18   Avg Rew: 16.05   LR: 0.00006679   Polyak: 0.999900   Bf: 100   Loss: 6.518  2.241  2.544
Ep: 6410   Rew: -83.93   Avg Rew: 23.03   LR: 0.00006539   Polyak: 0.999900   Bf: 100   Loss: 6.362  1.778  1.232
Ep: 6420   Rew: -59.21   Avg Rew: 28.16   LR: 0.00006437   Polyak: 0.999900   Bf: 100   Loss: 7.761  1.685  1.805
Ep: 6430   Rew: 110.61   Avg Rew: 27.16   LR: 0.00006457   Polyak: 0.999900   Bf: 100   Los

In [None]:
def test(n_episodes=3, max_timesteps=2000, render=True, save_gif=True, name_suffix=''):
    """Run a saved TD3 actor in ``env_name`` and print each episode's reward.

    Args:
        n_episodes: Number of evaluation episodes to run.
        max_timesteps: Maximum number of steps per episode.
        render: If True, render the environment at every step.
        save_gif: If True (and ``render`` is True), also save every frame as
            ``./gif/td3_torch/<episode>/<step>.jpg``.
        name_suffix: Appended to the module-level ``filename`` to pick the
            checkpoint to load (``train()`` saves its final model with the
            suffix ``'_solved'``).

    The checkpoint is looked up via the module-level ``directory`` and
    ``filename``, i.e. the same location and base name that ``train()``
    saves to, so the two functions agree on which model is used.
    """
    model_name = filename + name_suffix

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Must describe the same network as the one built in train(); otherwise
    # the saved weights will not fit the actor.
    actor_config = [
        {'dim': (state_dim, 256), 'dropout': False, 'activation': 'relu'},
        {'dim': (256, 320), 'dropout': True, 'activation': 'relu'},
        {'dim': (320, 160), 'dropout': False, 'activation': 'relu'},
        {'dim': (160, 64), 'dropout': False, 'activation': 'relu'},
        {'dim': (64, action_dim), 'dropout': False, 'activation': False}
    ]

    # Same constructor call shape as in train().
    policy = TD3(actor_config, state_dim, action_dim, max_action, lr=learning_rate_base)
    # NOTE(review): the actor contains a Dropout layer -- confirm that
    # load_actor/select_action run the network in eval mode.
    policy.load_actor(directory, model_name)

    try:
        for ep in range(1, n_episodes + 1):
            ep_reward = 0
            state = env.reset()

            if render and save_gif:
                # makedirs also creates the missing parents ('./gif/td3_torch').
                os.makedirs('./gif/td3_torch/{}'.format(ep), exist_ok=True)

            for t in range(max_timesteps):
                action = policy.select_action(state)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                    if save_gif:
                        img = env.render(mode='rgb_array')
                        img = Image.fromarray(img)
                        img.save('./gif/td3_torch/{}/{}.jpg'.format(ep, t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
    finally:
        # Close once, after all episodes (not after the first one), and also
        # when an episode raises.
        env.close()
                
# Evaluate the saved actor: runs the evaluation episodes defined in test()
# and prints each episode's total reward.
test()
    
    