In [1]:
import os
import time
import torch
import gym
import numpy as np
from gym import wrappers
from PIL import Image

from TD3.td3 import TD3
from TD3.utils import ReplayBuffer, mkdir

In [2]:
env_name = 'BipedalWalkerHardcore-v2'
lr_base = 0.001
lr_decay = 0.0001
exp_noise_base = 0.4 
exp_noise_decay = 0.0003

random_seed = 42
gamma = 0.99                # discount for future rewards
batch_size = 1024        # num of transitions sampled from replay buffer
polyak = 0.9999              # target policy update parameter (1-tau)
policy_noise = 0.2          # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2            # delayed policy updates parameter
max_episodes = 1000000         # max num of episodes
max_timesteps = 3000        # max timesteps in one episode
max_buffer_length = 5000000
log_interval = 10           # print avg reward after interval

In [3]:
actor_config = [
        {'dim': [None, 256], 'dropout': False, 'activation': 'relu'},
        {'dim': [256, 256], 'dropout': True, 'activation':'relu'},
        {'dim': [256, 128], 'dropout': False, 'activation': 'relu'},       
        {'dim': [128, None],'dropout': False, 'activation': 'sigmoid'}
    ]
    
critic_config = [
        {'dim': [None, 512], 'dropout': False, 'activation': 'relu'},
        {'dim': [512, 512], 'dropout': False , 'activation':'relu'},
        {'dim': [512, 128], 'dropout': False, 'activation': 'relu'},       
        {'dim': [128, 1], 'dropout': False, 'activation': False},
    ]

In [4]:
class TD3Trainer():
    
    def __init__(self, env_name, actor_config, critic_config, random_seed=42, lr_base=0.001, lr_decay=0.00005, 
                 exp_noise_base=0.3, exp_noise_decay=0.0001, gamma=0.99, batch_size=1024, 
                 polyak=0.9999, policy_noise=0.2, noise_clip=0.5, policy_delay=2, 
                 max_episodes=100000, max_timesteps=3000, max_buffer_length=5000000, 
                 log_interval=5, threshold=None, lr_minimum=1e-10, exp_noise_minimum=1e-10,
                 record_videos=True, record_interval=100):        
        
        self.algorithm_name = 'td3'
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.record_videos = record_videos
        self.record_interval = record_interval        
        if self.record_videos == True:
            videos_dir = mkdir('.', 'videos')
            monitor_dir = mkdir(videos_dir, self.algorithm_name)
            should_record = lambda i: self.should_record
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)            
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_low = self.env.action_space.low
        self.action_high = self.env.action_space.high        
        self.should_record = False
        if not threshold == None:
            self.threshold = threshold
        else:    
            self.threshold = self.env.spec.reward_threshold
        
        self.actor_config = actor_config
        self.critic_config = critic_config
        self.actor_config[0]['dim'][0] = self.state_dim
        self.actor_config[-1]['dim'][1] = self.action_dim
        self.critic_config[0]['dim'][0] = self.state_dim + self.action_dim
        
        self.actor_config = actor_config
        self.critic_config = critic_config
        self.random_seed = random_seed
        self.lr_base = lr_base
        self.lr_decay = lr_decay   
        self.lr_minimum = lr_minimum
        self.exp_noise_base = exp_noise_base
        self.exp_noise_decay = exp_noise_decay     
        self.exp_noise_minimum = exp_noise_minimum                
        self.gamma = gamma
        self.batch_size = batch_size        
        self.polyak = polyak
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.max_buffer_length = max_buffer_length
        self.log_interval = log_interval
        
        prdir = mkdir('.', 'preTrained')
        self.directory = mkdir(prdir, self.algorithm_name)
        self.filename = "{}_{}_{}".format(self.algorithm_name, self.env_name, self.random_seed)
                
        self.policy = TD3(self.actor_config, self.critic_config, self.action_low, self.action_high)   
        self.replay_buffer = ReplayBuffer(max_length=self.max_buffer_length)
        
        self.reward_history = []
        self.make_plots = False       
        
        if self.random_seed:
            print("Random Seed: {}".format(self.random_seed))
            self.env.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)
        
    def train(self):
        
        start_time = time.time()
        print("Training started ... \n")
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={}".format(self.threshold))     
        print("action_low={} action_high={} \n".format(self.action_low, self.action_high))         

        # loading models
        self.policy.load(self.directory, self.filename)
                
        # logging variables:        
        log_f = open("train_{}.txt".format(self.algorithm_name), "w+")

        # training procedure:
        for episode in range(1, self.max_episodes+1):
            
            # Only record video during evaluation, every n steps
            if episode % self.record_interval == 0:
                self.should_record = True
            
            ep_reward = 0.0
            state = self.env.reset()
            
            # calculate params
            exploration_noise = max(self.exp_noise_base / (1.0 + episode * self.exp_noise_decay), self.exp_noise_minimum)
            learning_rate = max(self.lr_base / (1.0 + episode * self.lr_decay), self.lr_minimum)            
            self.policy.set_optimizers(lr=learning_rate)

            for t in range(self.max_timesteps):
                
                # select action and add exploration noise:
                action = self.policy.select_action(state)
                action = action + np.random.normal(0, exploration_noise, size=self.action_dim)
                action = action.clip(self.action_low, self.action_high)

                # take action in env:
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.add((state, action, reward, next_state, float(done)))
                state = next_state

                ep_reward += reward

                # if episode is done then update policy:
                if done or t==(self.max_timesteps-1):
                    self.policy.update(self.replay_buffer, t, self.batch_size, self.gamma, self.polyak, 
                                       self.policy_noise, self.noise_clip, self.policy_delay)
                    break

            self.reward_history.append(ep_reward)
            avg_reward = np.mean(self.reward_history[-100:]) 

            # logging updates:        
            log_f.write('{},{}\n'.format(episode, ep_reward))
            log_f.flush()
            
            # Calculate polyak
            #part = (env.spec.reward_threshold - avg_reward) / (env.spec.reward_threshold + 150)
            #if part > 1:
            #    part = 1
            #polyak = polyak_int[0] + (1 - part) * (polyak_int[1] - polyak_int[0])     

            # Calculate LR
            #part = min((env.spec.reward_threshold - avg_reward) / (env.spec.reward_threshold + 150), 1)
                        
            avg_actor_loss = np.mean(self.policy.actor_loss_list[-100:])
            avg_Q1_loss = np.mean(self.policy.Q1_loss_list[-100:])
            avg_Q2_loss = np.mean(self.policy.Q2_loss_list[-100:])

            if not self.make_plots and len(self.policy.actor_loss_list) > 200:
                self.policy.actor_loss_list.pop(0)
                self.policy.Q1_loss_list.pop(0)
                self.policy.Q2_loss_list.pop(0)  
                self.reward_history.pop(0)    

            # Print avg reward every log interval:
            if episode % self.log_interval == 0:            
                self.policy.save(self.directory, self.filename)
                print("Ep:{:5d}  Rew:{:8.2f}  Avg Rew:{:8.2f}  LR:{:8.8f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}".format(
                    episode, ep_reward, avg_reward, learning_rate, self.replay_buffer.get_fill(), 
                    exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss))
                
            self.should_record = False    
                
            # if avg reward > threshold then save and stop traning:
            if avg_reward >= self.threshold and episode > 100: 
                print("Ep:{:5d}  Rew:{:8.2f}  Avg Rew:{:8.2f}  LR:{:8.8f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}".format(
                    episode, ep_reward, avg_reward, learning_rate, self.replay_buffer.get_fill(), 
                    exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss))
                print("########## Solved! ###########")
                name = self.filename + '_solved'
                self.policy.save(self.directory, name)
                log_f.close()
                training_time = time.time() - start_time
                print("Training time: {:6.2f} sec".format(training_time))
                break    
       
    def test(self, episodes=3, render=True, save_gif=True):   
        
        gifdir = mkdir('.','gif')
        algdir = mkdir(gifdir, self.algorithm_name)

        for episode in range(1, episodes+1):
            ep_reward = 0.0
            state = self.env.reset()
            epdir = mkdir(algdir, str(episode))
            
            for t in range(self.max_timesteps):
                action = self.policy.select_action(state)
                state, reward, done, _ = self.env.step(action)
                ep_reward += reward
                
                if save_gif:                                       
                    img = self.env.render(mode = 'rgb_array')
                    img = Image.fromarray(img)
                    img.save('{}/{}.jpg'.format(epdir, t))
                if done:
                    break
                    

            print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))           
            self.env.close()        
            

In [None]:
agent = TD3Trainer(env_name, actor_config, critic_config, random_seed=random_seed, lr_base=lr_base, lr_decay=lr_decay, 
                   exp_noise_base=exp_noise_base, exp_noise_decay=exp_noise_decay, gamma=gamma, batch_size=batch_size,
                   polyak=polyak, policy_noise=policy_noise, noise_clip=noise_clip, policy_delay=policy_delay, 
                   max_episodes=max_episodes, max_timesteps=max_timesteps, max_buffer_length=max_buffer_length, 
                   log_interval=log_interval)
agent.train()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=256, out_features=128, bias=True)
  (6): ReLU()
  (7): Linear(in_features=128, out_features=4, bias=True)
  (8): Sigmoid()
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=256, bias=True)
  (3): Dropout(p=0.2)
  (4): ReLU()
  (5): Linear(in_features=256, out_features=128, bias=True)
  (6): ReLU()
  (7): Linear(in_features=128, out_features=4, bias=True)
  (8): Sigmoid()
)
CRITIC=Sequential(
  (0): Linear(in_features=28, out_features=512, bias=True)
  (1): ReLU()
  (2): Linear(in

Ep:530   Rew:-107.53  Avg Rew:-109.11  LR:0.00094967   Polyak:0.99990  Bf: 3  EN:0.3451  Loss: 0.197 0.664 0.659
Ep:540   Rew:-104.49  Avg Rew:-103.56  LR:0.00094877   Polyak:0.99990  Bf: 3  EN:0.3442  Loss: 0.212 0.761 0.794
Ep:550   Rew:-109.42  Avg Rew:-109.06  LR:0.00094787   Polyak:0.99990  Bf: 3  EN:0.3433  Loss: 0.216 1.100 1.041
Ep:560   Rew:-104.31  Avg Rew:-103.58  LR:0.00094697   Polyak:0.99990  Bf: 3  EN:0.3425  Loss: 0.279 0.863 1.052
Ep:570   Rew:-135.27  Avg Rew:-123.68  LR:0.00094607   Polyak:0.99990  Bf: 4  EN:0.3416  Loss: 0.493 0.659 0.674
Ep:580   Rew:-127.14  Avg Rew:-113.27  LR:0.00094518   Polyak:0.99990  Bf: 4  EN:0.3407  Loss: 0.576 0.638 0.803
Ep:590   Rew:-144.38  Avg Rew:-127.07  LR:0.00094429   Polyak:0.99990  Bf: 4  EN:0.3398  Loss: 0.684 0.478 0.519
Ep:600   Rew:-107.72  Avg Rew:-103.34  LR:0.00094340   Polyak:0.99990  Bf: 4  EN:0.3390  Loss: 0.751 1.001 0.890
Ep:610   Rew:-151.72  Avg Rew:-126.83  LR:0.00094251   Polyak:0.99990  Bf: 4  EN:0.3381  Loss: 0

Ep:1260   Rew:-112.79  Avg Rew:-128.56  LR:0.00088810   Polyak:0.99990  Bf: 7  EN:0.2903  Loss: 3.242 1.033 1.107
Ep:1270   Rew:-111.75  Avg Rew:-112.10  LR:0.00088731   Polyak:0.99990  Bf: 7  EN:0.2896  Loss: 3.175 0.955 1.016
Ep:1280   Rew:-108.54  Avg Rew:-107.80  LR:0.00088652   Polyak:0.99990  Bf: 7  EN:0.2890  Loss: 3.246 0.667 0.753
Ep:1290   Rew:-105.40  Avg Rew:-105.74  LR:0.00088574   Polyak:0.99990  Bf: 8  EN:0.2884  Loss: 3.364 0.901 0.951
Ep:1300   Rew:-102.80  Avg Rew:-116.09  LR:0.00088496   Polyak:0.99990  Bf: 8  EN:0.2878  Loss: 3.353 0.895 0.932
Ep:1310   Rew:-92.21  Avg Rew:-108.70  LR:0.00088417   Polyak:0.99990  Bf: 8  EN:0.2872  Loss: 3.426 0.863 0.866
Ep:1320   Rew:-105.88  Avg Rew:-105.02  LR:0.00088339   Polyak:0.99990  Bf: 8  EN:0.2865  Loss: 3.449 1.056 1.060
Ep:1330   Rew:-104.24  Avg Rew:-107.37  LR:0.00088261   Polyak:0.99990  Bf: 8  EN:0.2859  Loss: 3.512 0.894 0.916
Ep:1340   Rew:-107.67  Avg Rew:-117.69  LR:0.00088183   Polyak:0.99990  Bf: 8  EN:0.2853 

Ep:1990   Rew:-84.90  Avg Rew:-106.46  LR:0.00083403   Polyak:0.99990  Bf:10  EN:0.2505  Loss: 6.278 1.356 1.494
Ep:2000   Rew:-101.34  Avg Rew:-99.34  LR:0.00083333   Polyak:0.99990  Bf:10  EN:0.2500  Loss: 6.084 1.381 1.461
Ep:2010   Rew:-92.30  Avg Rew:-95.89  LR:0.00083264   Polyak:0.99990  Bf:10  EN:0.2495  Loss: 6.416 1.174 1.299
Ep:2020   Rew:-91.46  Avg Rew:-87.24  LR:0.00083195   Polyak:0.99990  Bf:10  EN:0.2491  Loss: 6.219 1.460 1.637
Ep:2030   Rew:-83.67  Avg Rew:-90.29  LR:0.00083126   Polyak:0.99990  Bf:10  EN:0.2486  Loss: 6.327 1.423 1.623
Ep:2040   Rew:-95.36  Avg Rew:-89.00  LR:0.00083056   Polyak:0.99990  Bf:10  EN:0.2481  Loss: 6.333 1.464 1.584
Ep:2050   Rew:-98.25  Avg Rew:-106.76  LR:0.00082988   Polyak:0.99990  Bf:10  EN:0.2477  Loss: 6.402 1.365 1.479
Ep:2060   Rew:-102.88  Avg Rew:-101.64  LR:0.00082919   Polyak:0.99990  Bf:10  EN:0.2472  Loss: 6.474 1.472 1.592
Ep:2070   Rew:-113.85  Avg Rew:-123.44  LR:0.00082850   Polyak:0.99990  Bf:10  EN:0.2468  Loss: 6.4

Ep:2720   Rew:-119.37  Avg Rew:-122.81  LR:0.00078616   Polyak:0.99990  Bf:16  EN:0.2203  Loss: 9.179 2.087 2.168
Ep:2730   Rew:-124.82  Avg Rew:-121.42  LR:0.00078555   Polyak:0.99990  Bf:16  EN:0.2199  Loss: 9.288 2.094 2.220
Ep:2740   Rew:-86.94  Avg Rew:-101.64  LR:0.00078493   Polyak:0.99990  Bf:16  EN:0.2195  Loss: 9.303 2.035 2.137
Ep:2750   Rew:-147.99  Avg Rew:-123.92  LR:0.00078431   Polyak:0.99990  Bf:16  EN:0.2192  Loss: 9.275 2.212 2.284
Ep:2760   Rew:-96.66  Avg Rew:-95.41  LR:0.00078370   Polyak:0.99990  Bf:16  EN:0.2188  Loss: 9.195 1.964 2.124
Ep:2770   Rew:-143.65  Avg Rew:-130.08  LR:0.00078309   Polyak:0.99990  Bf:16  EN:0.2185  Loss: 9.213 2.024 2.182
Ep:2780   Rew:-96.67  Avg Rew:-97.46  LR:0.00078247   Polyak:0.99990  Bf:16  EN:0.2181  Loss: 9.352 2.201 2.291
Ep:2790   Rew:-81.21  Avg Rew:-74.64  LR:0.00078186   Polyak:0.99990  Bf:17  EN:0.2177  Loss: 9.471 2.122 2.242
Ep:2800   Rew:-120.50  Avg Rew:-97.94  LR:0.00078125   Polyak:0.99990  Bf:17  EN:0.2174  Loss: 

Ep:3450   Rew:-74.72  Avg Rew:-67.16  LR:0.00074349   Polyak:0.99990  Bf:27  EN:0.1966  Loss: 11.423 2.905 2.884
Ep:3460   Rew:-121.80  Avg Rew:-119.72  LR:0.00074294   Polyak:0.99990  Bf:28  EN:0.1963  Loss: 11.349 3.086 2.886
Ep:3470   Rew:-103.62  Avg Rew:-92.62  LR:0.00074239   Polyak:0.99990  Bf:28  EN:0.1960  Loss: 11.335 3.115 3.070
Ep:3480   Rew:-116.52  Avg Rew:-121.64  LR:0.00074184   Polyak:0.99990  Bf:28  EN:0.1957  Loss: 11.577 2.947 3.001
Ep:3490   Rew:-26.23  Avg Rew:-65.54  LR:0.00074129   Polyak:0.99990  Bf:28  EN:0.1954  Loss: 11.283 2.955 2.910
Ep:3500   Rew:-141.61  Avg Rew:-126.71  LR:0.00074074   Polyak:0.99990  Bf:29  EN:0.1951  Loss: 11.439 3.120 3.207
Ep:3510   Rew:-85.94  Avg Rew:-95.52  LR:0.00074019   Polyak:0.99990  Bf:29  EN:0.1948  Loss: 11.403 2.903 2.960
Ep:3520   Rew:-117.37  Avg Rew:-138.41  LR:0.00073964   Polyak:0.99990  Bf:29  EN:0.1946  Loss: 11.458 2.855 2.819
Ep:3530   Rew:-116.67  Avg Rew:-127.85  LR:0.00073910   Polyak:0.99990  Bf:29  EN:0.194

Ep:4170   Rew:-105.52  Avg Rew:-106.35  LR:0.00070572   Polyak:0.99990  Bf:52  EN:0.1777  Loss: 10.976 1.929 1.956
Ep:4180   Rew:-109.72  Avg Rew:-73.98  LR:0.00070522   Polyak:0.99990  Bf:53  EN:0.1775  Loss: 10.999 1.924 1.975
Ep:4190   Rew:-124.22  Avg Rew:-116.64  LR:0.00070472   Polyak:0.99990  Bf:53  EN:0.1772  Loss: 10.999 1.867 1.882
Ep:4200   Rew:-96.36  Avg Rew:-101.64  LR:0.00070423   Polyak:0.99990  Bf:54  EN:0.1770  Loss: 10.916 1.869 1.881
Ep:4210   Rew:-117.87  Avg Rew:-114.10  LR:0.00070373   Polyak:0.99990  Bf:54  EN:0.1768  Loss: 10.855 1.856 1.855
Ep:4220   Rew:-77.50  Avg Rew:-79.02  LR:0.00070323   Polyak:0.99990  Bf:54  EN:0.1765  Loss: 10.878 1.800 1.812
Ep:4230   Rew:-90.49  Avg Rew:-103.12  LR:0.00070274   Polyak:0.99990  Bf:55  EN:0.1763  Loss: 10.958 1.787 1.844
Ep:4240   Rew:-94.62  Avg Rew:-94.40  LR:0.00070225   Polyak:0.99990  Bf:55  EN:0.1761  Loss: 10.891 1.728 1.770
Ep:4250   Rew:-108.50  Avg Rew:-99.85  LR:0.00070175   Polyak:0.99990  Bf:55  EN:0.1758

Ep:4890   Rew:-67.93  Avg Rew:-77.30  LR:0.00067159   Polyak:0.99990  Bf:80  EN:0.1621  Loss: 10.449 1.276 1.360
Ep:4900   Rew:-111.37  Avg Rew:-101.58  LR:0.00067114   Polyak:0.99990  Bf:81  EN:0.1619  Loss: 10.437 1.383 1.430
Ep:4910   Rew:-43.40  Avg Rew:-58.62  LR:0.00067069   Polyak:0.99990  Bf:81  EN:0.1617  Loss: 10.420 1.344 1.311
Ep:4920   Rew:-42.96  Avg Rew:-32.67  LR:0.00067024   Polyak:0.99990  Bf:82  EN:0.1616  Loss: 10.362 1.276 1.271
Ep:4930   Rew:-100.36  Avg Rew:-106.51  LR:0.00066979   Polyak:0.99990  Bf:82  EN:0.1614  Loss: 10.343 1.385 1.340
Ep:4940   Rew:-33.83  Avg Rew:-78.19  LR:0.00066934   Polyak:0.99990  Bf:82  EN:0.1612  Loss: 10.345 1.330 1.321
Ep:4950   Rew:-111.69  Avg Rew:-95.60  LR:0.00066890   Polyak:0.99990  Bf:83  EN:0.1610  Loss: 10.404 1.373 1.377
Ep:4960   Rew:-75.40  Avg Rew:-61.86  LR:0.00066845   Polyak:0.99990  Bf:83  EN:0.1608  Loss: 10.318 1.319 1.346


In [None]:
agent.test()