In [1]:
import copy
import os
import time

import gym
import numpy as np
import torch
from gym import wrappers
from PIL import Image

from TD3.td3 import TD3
from TD3.utils import ReplayBuffer, mkdir

In [2]:
# --- Environment and reproducibility ---------------------------------------
env_name = 'Pendulum-v0'
random_seed = 42
threshold = -140                # training stops once the average reward reaches this value

# --- Per-episode schedules: value = base / (1 + episode * decay) ------------
lr_base = 0.001
lr_decay = 0.0001
exp_noise_base = 0.2
exp_noise_decay = 0.0001

# --- TD3 update settings (forwarded to the policy update) --------------------
gamma = 0.99                    # discount for future rewards
polyak = 0.9999                 # target policy update parameter (1-tau)
policy_noise = 0.2              # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2                # delayed policy updates parameter
batch_size = 1024               # num of transitions sampled from replay buffer

# --- Run length, replay buffer and logging -----------------------------------
max_episodes = 100000           # max num of episodes
max_timesteps = 3000            # max timesteps in one episode
max_buffer_length = 5000000
log_interval = 5                # print avg reward after interval

In [3]:
# Layer specifications for the actor network, one dict per layer.
# The ``None`` entries are placeholders: TD3Trainer writes the observation size
# into the first layer's input and the action size into the last layer's output.
actor_config = [
    dict(dim=[None, 32], dropout=False, activation='relu'),
    dict(dim=[32, 32], dropout=False, activation='relu'),
    dict(dim=[32, None], dropout=False, activation='sigmoid'),
]

# Layer specifications for the critic networks. The first layer's input size is
# filled in by TD3Trainer (observation size + action size); the output is one
# value with no activation.
critic_config = [
    dict(dim=[None, 32], dropout=False, activation='relu'),
    dict(dim=[32, 32], dropout=False, activation='relu'),
    dict(dim=[32, 1], dropout=False, activation=False),
]

In [4]:
class TD3Trainer():
    """Train and evaluate a TD3 policy on a single Gym environment.

    The trainer owns the environment (optionally wrapped in ``wrappers.Monitor``
    for video recording), the ``TD3`` policy and the replay buffer. ``train``
    runs the episode loop with decaying learning rate and exploration noise;
    ``test`` rolls out the current policy without exploration noise.
    """

    # Number of most recent episode rewards averaged for logging and the "solved" check.
    REWARD_WINDOW = 100
    # Number of most recent loss values averaged for logging.
    LOSS_WINDOW = 100
    # Maximum number of loss values kept per list while ``make_plots`` is False.
    LOSS_HISTORY_LIMIT = 200

    _PROGRESS_FORMAT = ("Ep:{}   Rew:{:3.2f}  Avg Rew:{:3.2f}  LR:{:8.8f}   Polyak:{:5.5f}  Bf:{:2.0f}  "
                        "EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}")

    def __init__(self, env_name, actor_config, critic_config, random_seed=42, lr_base=0.001, lr_decay=0.00005,
                 exp_noise_base=0.3, exp_noise_decay=0.0001, gamma=0.99, batch_size=1024,
                 polyak=0.9999, policy_noise=0.2, noise_clip=0.5, policy_delay=2,
                 max_episodes=100000, max_timesteps=3000, max_buffer_length=5000000,
                 log_interval=5, threshold=None, lr_minimum=1e-10, exp_noise_minimum=1e-10,
                 record_videos=True, record_interval=100):
        """Create the environment, the policy and the replay buffer.

        Args:
            env_name: Gym environment id passed to ``gym.make``.
            actor_config: list of layer dicts for the actor; the first input
                size and the last output size are filled in from the
                environment. The caller's list is not modified.
            critic_config: list of layer dicts for the critic; the first input
                size is set to observation size + action size. The caller's
                list is not modified.
            random_seed: seed for the env, torch and numpy; ``None`` skips seeding.
            lr_base, lr_decay, lr_minimum: learning-rate schedule
                ``max(lr_base / (1 + episode * lr_decay), lr_minimum)``.
            exp_noise_base, exp_noise_decay, exp_noise_minimum: same schedule
                for the standard deviation of the exploration noise.
            gamma, batch_size, polyak, policy_noise, noise_clip, policy_delay:
                forwarded unchanged to ``TD3.update``.
            max_episodes: maximum number of training episodes.
            max_timesteps: maximum number of steps per episode.
            max_buffer_length: ``max_length`` of the replay buffer.
            log_interval: progress is printed and the policy saved every this
                many episodes (only after episode 100).
            threshold: average reward at which training stops; defaults to the
                environment spec's ``reward_threshold`` when ``None``.
            record_videos: wrap the env in ``wrappers.Monitor`` when true.
            record_interval: a video is requested every this many training episodes.
        """
        self.algorithm_name = 'td3'
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.record_videos = record_videos
        self.record_interval = record_interval
        # Flag read lazily by the Monitor's video callback; toggled by train().
        self.should_record = False
        if self.record_videos:
            videos_dir = mkdir('.', 'videos')
            monitor_dir = mkdir(videos_dir, self.algorithm_name)
            should_record = lambda i: self.should_record
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_low = self.env.action_space.low
        self.action_high = self.env.action_space.high
        if threshold is not None:
            self.threshold = threshold
        else:
            self.threshold = self.env.spec.reward_threshold

        # Work on private copies so filling in the layer sizes does not
        # overwrite the caller's configuration objects.
        self.actor_config = copy.deepcopy(actor_config)
        self.critic_config = copy.deepcopy(critic_config)
        self.actor_config[0]['dim'][0] = self.state_dim
        self.actor_config[-1]['dim'][1] = self.action_dim
        self.critic_config[0]['dim'][0] = self.state_dim + self.action_dim

        self.random_seed = random_seed
        self.lr_base = lr_base
        self.lr_decay = lr_decay
        self.lr_minimum = lr_minimum
        self.exp_noise_base = exp_noise_base
        self.exp_noise_decay = exp_noise_decay
        self.exp_noise_minimum = exp_noise_minimum
        self.gamma = gamma
        self.batch_size = batch_size
        self.polyak = polyak
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.max_buffer_length = max_buffer_length
        self.log_interval = log_interval

        # presumably mkdir creates the directory and returns its path -- TODO confirm in TD3.utils
        prdir = mkdir('.', 'preTrained')
        self.directory = mkdir(prdir, self.algorithm_name)
        self.filename = "{}_{}_{}".format(self.algorithm_name, self.env_name, self.random_seed)

        self.policy = TD3(self.actor_config, self.critic_config, self.action_low, self.action_high)
        self.replay_buffer = ReplayBuffer(max_length=self.max_buffer_length)

        self.reward_history = []
        # When False, train() trims the reward and loss histories after every episode.
        self.make_plots = False

        # ``is not None`` (rather than truthiness) so that a seed of 0 is honoured.
        if self.random_seed is not None:
            print("Random Seed: {}".format(self.random_seed))
            self.env.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)

    @staticmethod
    def _tail_mean(values, window):
        """Return the mean of the last ``window`` entries, or NaN if there are none."""
        tail = values[-window:]
        return np.mean(tail) if len(tail) else float('nan')

    def _run_episode(self, exploration_noise):
        """Play one training episode, update the policy once, and return the episode reward."""
        ep_reward = 0.0
        state = self.env.reset()

        for t in range(self.max_timesteps):
            # select action and add exploration noise:
            action = self.policy.select_action(state)
            action = action + np.random.normal(0, exploration_noise, size=self.action_dim)
            action = action.clip(self.action_low, self.action_high)

            # take action in env:
            next_state, reward, done, _ = self.env.step(action)
            # NOTE(review): ``done`` is stored as-is, so an episode cut off by a
            # time limit is recorded as terminal -- confirm this is intended.
            self.replay_buffer.add((state, action, reward, next_state, float(done)))
            state = next_state
            ep_reward += reward

            # if episode is done then update policy:
            if done or t == (self.max_timesteps - 1):
                # presumably ``t`` is the number of update iterations -- TODO confirm in TD3.update
                self.policy.update(self.replay_buffer, t, self.batch_size, self.gamma, self.polyak,
                                   self.policy_noise, self.noise_clip, self.policy_delay)
                break

        return ep_reward

    def _trim_histories(self):
        """Bound the reward and loss histories; each list is trimmed independently."""
        del self.reward_history[:-self.REWARD_WINDOW]
        for losses in (self.policy.actor_loss_list, self.policy.Q1_loss_list, self.policy.Q2_loss_list):
            del losses[:-self.LOSS_HISTORY_LIMIT]

    def _progress_line(self, episode, ep_reward, avg_reward, learning_rate, exploration_noise,
                       avg_actor_loss, avg_Q1_loss, avg_Q2_loss):
        """Format the one-line progress summary printed during training."""
        return self._PROGRESS_FORMAT.format(
            episode, ep_reward, avg_reward, learning_rate, self.polyak, self.replay_buffer.get_fill(),
            exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss)

    def train(self):
        """Run the training loop until the reward threshold or ``max_episodes`` is reached.

        Each episode's reward is appended as ``episode,reward`` to
        ``train_<algorithm_name>.txt`` in the working directory. Every
        ``log_interval`` episodes (after episode 100) the policy is saved and a
        progress line printed. Training stops, and the policy is saved under
        ``<filename>_solved``, as soon as the average reward over the last
        ``REWARD_WINDOW`` episodes reaches ``self.threshold``.
        """
        start_time = time.time()
        print("Training started ... \n")
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={}".format(self.threshold))
        print("action_low={} action_high={} \n".format(self.action_low, self.action_high))

        # loading models
        self.policy.load(self.directory, self.filename)

        # The context manager closes the log on every exit path, including
        # running out of episodes and exceptions, not only when solved.
        with open("train_{}.txt".format(self.algorithm_name), "w+") as log_f:
            for episode in range(1, self.max_episodes + 1):

                # Only record video every ``record_interval`` episodes.
                if episode % self.record_interval == 0:
                    self.should_record = True

                # calculate params
                exploration_noise = max(self.exp_noise_base / (1.0 + episode * self.exp_noise_decay),
                                        self.exp_noise_minimum)
                learning_rate = max(self.lr_base / (1.0 + episode * self.lr_decay), self.lr_minimum)
                self.policy.set_optimizers(lr=learning_rate)

                ep_reward = self._run_episode(exploration_noise)

                self.reward_history.append(ep_reward)
                avg_reward = self._tail_mean(self.reward_history, self.REWARD_WINDOW)

                # logging updates:
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                avg_actor_loss = self._tail_mean(self.policy.actor_loss_list, self.LOSS_WINDOW)
                avg_Q1_loss = self._tail_mean(self.policy.Q1_loss_list, self.LOSS_WINDOW)
                avg_Q2_loss = self._tail_mean(self.policy.Q2_loss_list, self.LOSS_WINDOW)

                # Full histories are only worth keeping for plotting.
                if not self.make_plots:
                    self._trim_histories()

                # Print avg reward every log interval:
                if episode % self.log_interval == 0 and episode > 100:
                    self.policy.save(self.directory, self.filename)
                    print(self._progress_line(episode, ep_reward, avg_reward, learning_rate,
                                              exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss))

                self.should_record = False

                # if avg reward > threshold then save and stop traning:
                if avg_reward >= self.threshold:
                    print(self._progress_line(episode, ep_reward, avg_reward, learning_rate,
                                              exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss))
                    print("########## Solved! ###########")
                    name = self.filename + '_solved'
                    self.policy.save(self.directory, name)
                    training_time = time.time() - start_time
                    print("Training time: {:6.2f} sec".format(training_time))
                    break

    def test(self, episodes=3, render=True, save_gif=True):
        """Roll out the current policy without exploration noise and print each episode's reward.

        Args:
            episodes: number of evaluation episodes.
            render: currently unused; kept for interface compatibility.
            save_gif: when true, every step's ``rgb_array`` frame is saved as
                ``gif/<algorithm_name>/<episode>/<t>.jpg``.

        The environment is closed once, after the last episode (or on error).
        """
        gifdir = mkdir('.','gif')
        algdir = mkdir(gifdir, self.algorithm_name)

        try:
            for episode in range(1, episodes+1):
                ep_reward = 0.0
                state = self.env.reset()
                epdir = mkdir(algdir, str(episode))

                for t in range(self.max_timesteps):
                    action = self.policy.select_action(state)
                    state, reward, done, _ = self.env.step(action)
                    ep_reward += reward

                    if save_gif:
                        img = self.env.render(mode = 'rgb_array')
                        img = Image.fromarray(img)
                        img.save('{}/{}.jpg'.format(epdir, t))
                    if done:
                        break

                print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))
        finally:
            # Close after all episodes, not inside the loop, so the env is
            # never reset after it has been closed.
            self.env.close()
            

In [5]:
# Build the trainer from the notebook-level configuration, then train until the
# reward threshold or the episode limit is reached.
agent = TD3Trainer(
    env_name,
    actor_config,
    critic_config,
    random_seed=random_seed,
    lr_base=lr_base,
    lr_decay=lr_decay,
    exp_noise_base=exp_noise_base,
    exp_noise_decay=exp_noise_decay,
    gamma=gamma,
    batch_size=batch_size,
    polyak=polyak,
    policy_noise=policy_noise,
    noise_clip=noise_clip,
    policy_delay=policy_delay,
    max_episodes=max_episodes,
    max_timesteps=max_timesteps,
    max_buffer_length=max_buffer_length,
    log_interval=log_interval,
    threshold=threshold,
)
agent.train()

ACTOR=Sequential(
  (0): Linear(in_features=3, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
  (5): Sigmoid()
)
ACTOR=Sequential(
  (0): Linear(in_features=3, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
  (5): Sigmoid()
)
CRITIC=Sequential(
  (0): Linear(in_features=4, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
)
CRITIC=Sequential(
  (0): Linear(in_features=4, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=1, bias=True)
)
CRITIC=Sequential(
  (0): Linear(in_features=4, out_features=32, bias=True)
  (1): R

Ep:295   Rew:-1513.52  Avg Rew:-1455.40  LR:0.00097135   Polyak:0.99990  Bf: 1  EN:0.1943  Loss: 22.727 1.190 1.192
Ep:300   Rew:-1500.38  Avg Rew:-1464.33  LR:0.00097087   Polyak:0.99990  Bf: 1  EN:0.1942  Loss: 23.024 1.189 1.190
Ep:305   Rew:-1518.37  Avg Rew:-1444.61  LR:0.00097040   Polyak:0.99990  Bf: 1  EN:0.1941  Loss: 23.337 1.243 1.244
Ep:310   Rew:-1517.06  Avg Rew:-1472.46  LR:0.00096993   Polyak:0.99990  Bf: 1  EN:0.1940  Loss: 23.671 1.238 1.240
Ep:315   Rew:-1500.02  Avg Rew:-1508.77  LR:0.00096946   Polyak:0.99990  Bf: 1  EN:0.1939  Loss: 23.951 1.333 1.333
Ep:320   Rew:-1452.79  Avg Rew:-1493.95  LR:0.00096899   Polyak:0.99990  Bf: 1  EN:0.1938  Loss: 24.275 1.602 1.604
Ep:325   Rew:-1347.78  Avg Rew:-1459.95  LR:0.00096852   Polyak:0.99990  Bf: 1  EN:0.1937  Loss: 24.568 1.480 1.479
Ep:330   Rew:-1512.91  Avg Rew:-1493.04  LR:0.00096805   Polyak:0.99990  Bf: 1  EN:0.1936  Loss: 24.855 1.567 1.568
Ep:335   Rew:-1501.34  Avg Rew:-1447.18  LR:0.00096759   Polyak:0.99990 

Ep:650   Rew:-1517.05  Avg Rew:-1517.55  LR:0.00093897   Polyak:0.99990  Bf: 3  EN:0.1878  Loss: 45.119 7.744 7.750
Ep:655   Rew:-1426.14  Avg Rew:-1409.93  LR:0.00093853   Polyak:0.99990  Bf: 3  EN:0.1877  Loss: 45.437 7.681 7.684
Ep:660   Rew:-1451.13  Avg Rew:-1491.05  LR:0.00093809   Polyak:0.99990  Bf: 3  EN:0.1876  Loss: 45.752 7.668 7.671
Ep:665   Rew:-1154.81  Avg Rew:-1358.56  LR:0.00093765   Polyak:0.99990  Bf: 3  EN:0.1875  Loss: 46.079 7.696 7.701
Ep:670   Rew:-1479.24  Avg Rew:-1408.75  LR:0.00093721   Polyak:0.99990  Bf: 3  EN:0.1874  Loss: 46.352 8.159 8.168
Ep:675   Rew:-1368.29  Avg Rew:-1387.54  LR:0.00093677   Polyak:0.99990  Bf: 3  EN:0.1874  Loss: 46.672 8.639 8.646
Ep:680   Rew:-1368.85  Avg Rew:-1388.73  LR:0.00093633   Polyak:0.99990  Bf: 3  EN:0.1873  Loss: 46.991 7.871 7.876
Ep:685   Rew:-965.90  Avg Rew:-1311.48  LR:0.00093589   Polyak:0.99990  Bf: 3  EN:0.1872  Loss: 47.307 8.941 8.947
Ep:690   Rew:-1327.15  Avg Rew:-1363.42  LR:0.00093545   Polyak:0.99990  

Ep:1000   Rew:-1005.54  Avg Rew:-654.77  LR:0.00090909   Polyak:0.99990  Bf: 4  EN:0.1818  Loss: 62.953 17.089 17.091
Ep:1005   Rew:-1048.10  Avg Rew:-994.56  LR:0.00090868   Polyak:0.99990  Bf: 4  EN:0.1817  Loss: 63.065 17.629 17.633
Ep:1010   Rew:-1038.99  Avg Rew:-969.81  LR:0.00090827   Polyak:0.99990  Bf: 4  EN:0.1817  Loss: 63.268 18.414 18.413
Ep:1015   Rew:-668.63  Avg Rew:-532.12  LR:0.00090785   Polyak:0.99990  Bf: 4  EN:0.1816  Loss: 63.498 18.201 18.210
Ep:1020   Rew:-919.94  Avg Rew:-1135.79  LR:0.00090744   Polyak:0.99990  Bf: 4  EN:0.1815  Loss: 63.677 18.530 18.532
Ep:1025   Rew:-955.29  Avg Rew:-636.29  LR:0.00090703   Polyak:0.99990  Bf: 4  EN:0.1814  Loss: 63.816 18.401 18.400
Ep:1030   Rew:-897.37  Avg Rew:-906.09  LR:0.00090662   Polyak:0.99990  Bf: 4  EN:0.1813  Loss: 64.164 17.474 17.476
Ep:1035   Rew:-965.79  Avg Rew:-861.53  LR:0.00090621   Polyak:0.99990  Bf: 4  EN:0.1812  Loss: 64.125 17.271 17.286
Ep:1040   Rew:-929.42  Avg Rew:-1040.24  LR:0.00090580   Pol

In [6]:
# Evaluate the trained policy; the arguments spell out the method's defaults.
agent.test(episodes=3, render=True, save_gif=True)

Test episode: 1	Reward: -1492.13
Test episode: 2	Reward: -263.43
Test episode: 3	Reward: -520.86
