In [1]:
import os
import time
import torch
import gym
import numpy as np
from gym import wrappers
from PIL import Image

from TD3.td3 import TD3
from TD3.utils import ReplayBuffer, mkdir

In [2]:
env_name = 'BipedalWalker-v2'
lr_base = 0.0005
lr_decay = 0.0001
exp_noise_base = 0.1
exp_noise_decay = 0.0001

random_seed = 42
gamma = 0.99                # discount for future rewards
batch_size = 1024        # num of transitions sampled from replay buffer
polyak = 0.999              # target policy update parameter (1-tau)
policy_noise = 0.2          # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2            # delayed policy updates parameter
max_episodes = 100000         # max num of episodes
max_timesteps = 3000        # max timesteps in one episode
max_buffer_length = 5000000
log_interval = 10           # print avg reward after interval

In [3]:
actor_config = [
        {'dim': [None, 400], 'dropout': False, 'activation': 'relu'},
        {'dim': [400, 300], 'dropout': False, 'activation':'relu'},
        {'dim': [300, None], 'dropout': False, 'activation': 'sigmoid'}
    ]
    
critic_config = [
        {'dim': [None, 400], 'dropout': False, 'activation': 'relu'},
        {'dim': [400, 300], 'dropout': False , 'activation':'relu'},
        {'dim': [300, 1], 'dropout': False, 'activation': False}
    ]

In [4]:
class TD3Trainer():
    
    def __init__(self, env_name, actor_config, critic_config, random_seed=42, lr_base=0.001, lr_decay=0.00005, 
                 exp_noise_base=0.3, exp_noise_decay=0.0001, gamma=0.99, batch_size=1024, 
                 polyak=0.9999, policy_noise=0.2, noise_clip=0.5, policy_delay=2, 
                 max_episodes=100000, max_timesteps=3000, max_buffer_length=5000000, 
                 log_interval=5, threshold=None, lr_minimum=1e-10, exp_noise_minimum=1e-10,
                 record_videos=True, record_interval=100):        
        
        self.algorithm_name = 'td3'
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.record_videos = record_videos
        self.record_interval = record_interval        
        if self.record_videos == True:
            videos_dir = mkdir('.', 'videos')
            monitor_dir = mkdir(videos_dir, self.algorithm_name)
            should_record = lambda i: self.should_record
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)            
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_low = self.env.action_space.low
        self.action_high = self.env.action_space.high        
        self.should_record = False
        if not threshold == None:
            self.threshold = threshold
        else:    
            self.threshold = self.env.spec.reward_threshold
        
        self.actor_config = actor_config
        self.critic_config = critic_config
        self.actor_config[0]['dim'][0] = self.state_dim
        self.actor_config[-1]['dim'][1] = self.action_dim
        self.critic_config[0]['dim'][0] = self.state_dim + self.action_dim
        
        self.actor_config = actor_config
        self.critic_config = critic_config
        self.random_seed = random_seed
        self.lr_base = lr_base
        self.lr_decay = lr_decay   
        self.lr_minimum = lr_minimum
        self.exp_noise_base = exp_noise_base
        self.exp_noise_decay = exp_noise_decay     
        self.exp_noise_minimum = exp_noise_minimum                
        self.gamma = gamma
        self.batch_size = batch_size        
        self.polyak = polyak
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_delay = policy_delay
        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.max_buffer_length = max_buffer_length
        self.log_interval = log_interval
        
        prdir = mkdir('.', 'preTrained')
        self.directory = mkdir(prdir, self.algorithm_name)
        self.filename = "{}_{}_{}".format(self.algorithm_name, self.env_name, self.random_seed)
                
        self.policy = TD3(self.actor_config, self.critic_config, self.action_low, self.action_high)   
        self.replay_buffer = ReplayBuffer(max_length=self.max_buffer_length)
        
        self.reward_history = []
        self.make_plots = False       
        
        if self.random_seed:
            print("Random Seed: {}".format(self.random_seed))
            self.env.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)
        
    def train(self):
        
        start_time = time.time()
        print("Training started ... \n")
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={}".format(self.threshold))     
        print("action_low={} action_high={} \n".format(self.action_low, self.action_high))         

        # loading models
        self.policy.load(self.directory, self.filename)
                
        # logging variables:        
        log_f = open("train_{}.txt".format(self.algorithm_name), "w+")

        # training procedure:
        for episode in range(1, self.max_episodes+1):
            
            # Only record video during evaluation, every n steps
            if episode % self.record_interval == 0:
                self.should_record = True
            
            ep_reward = 0.0
            state = self.env.reset()
            
            # calculate params
            exploration_noise = max(self.exp_noise_base / (1.0 + episode * self.exp_noise_decay), self.exp_noise_minimum)
            learning_rate = max(self.lr_base / (1.0 + episode * self.lr_decay), self.lr_minimum)            
            self.policy.set_optimizers(lr=learning_rate)

            for t in range(self.max_timesteps):
                
                # select action and add exploration noise:
                action = self.policy.select_action(state)               
                action = action + np.random.normal(0, exploration_noise, size=self.action_dim)
                action = action.clip(self.action_low, self.action_high)

                # take action in env:
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.add((state, action, reward, next_state, float(done)))
                state = next_state

                ep_reward += reward

                # if episode is done then update policy:
                if done or t==(self.max_timesteps-1):
                    self.policy.update(self.replay_buffer, t, self.batch_size, self.gamma, self.polyak, 
                                       self.policy_noise, self.noise_clip, self.policy_delay)
                    break

            self.reward_history.append(ep_reward)
            avg_reward = np.mean(self.reward_history[-100:]) 

            # logging updates:        
            log_f.write('{},{}\n'.format(episode, ep_reward))
            log_f.flush()
            
            # Calculate polyak
            #part = (env.spec.reward_threshold - avg_reward) / (env.spec.reward_threshold + 150)
            #if part > 1:
            #    part = 1
            #polyak = polyak_int[0] + (1 - part) * (polyak_int[1] - polyak_int[0])     

            # Calculate LR
            #part = min((env.spec.reward_threshold - avg_reward) / (env.spec.reward_threshold + 150), 1)
                        
            avg_actor_loss = np.mean(self.policy.actor_loss_list[-100:])
            avg_Q1_loss = np.mean(self.policy.Q1_loss_list[-100:])
            avg_Q2_loss = np.mean(self.policy.Q2_loss_list[-100:])

            if not self.make_plots and len(self.policy.actor_loss_list) > 200:
                self.policy.actor_loss_list.pop(0)
                self.policy.Q1_loss_list.pop(0)
                self.policy.Q2_loss_list.pop(0)  
                self.reward_history.pop(0)    

            # Print avg reward every log interval:
            if episode % self.log_interval == 0:            
                self.policy.save(self.directory, self.filename)
                print("Ep:{}   Rew:{:3.2f}  Avg Rew:{:3.2f}  LR:{:8.8f}   Polyak:{:5.5f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}".format(
                    episode, ep_reward, avg_reward, learning_rate, self.polyak, self.replay_buffer.get_fill(), 
                    exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss))
                
            self.should_record = False    
                
            # if avg reward > threshold then save and stop traning:
            if avg_reward >= self.threshold and episode > 100: 
                print("Ep:{}   Rew:{:3.2f}  Avg Rew:{:3.2f}  LR:{:8.8f}   Polyak:{:5.5f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f} {:5.3f}".format(
                    episode, ep_reward, avg_reward, learning_rate, self.polyak, self.replay_buffer.get_fill(), 
                    exploration_noise, avg_actor_loss, avg_Q1_loss, avg_Q2_loss))
                print("########## Solved! ###########")
                name = self.filename + '_solved'
                self.policy.save(self.directory, name)
                log_f.close()
                training_time = time.time() - start_time
                print("Training time: {:6.2f} sec".format(training_time))
                break    
       
    def test(self, episodes=3, render=True, save_gif=True):   
        
        gifdir = mkdir('.','gif')
        algdir = mkdir(gifdir, self.algorithm_name)

        for episode in range(1, episodes+1):
            ep_reward = 0.0
            state = self.env.reset()
            epdir = mkdir(algdir, str(episode))
            
            for t in range(self.max_timesteps):
                action = self.policy.select_action(state)
                state, reward, done, _ = self.env.step(action)
                ep_reward += reward
                
                if save_gif:                                       
                    img = self.env.render(mode = 'rgb_array')
                    img = Image.fromarray(img)
                    img.save('{}/{}.jpg'.format(epdir, t))
                if done:
                    break
                    

            print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))           
            self.env.close()        
            

In [5]:
agent = TD3Trainer(env_name, actor_config, critic_config, random_seed=random_seed, lr_base=lr_base, lr_decay=lr_decay, 
                   exp_noise_base=exp_noise_base, exp_noise_decay=exp_noise_decay, gamma=gamma, batch_size=batch_size,
                   polyak=polyak, policy_noise=policy_noise, noise_clip=noise_clip, policy_delay=policy_delay, 
                   max_episodes=max_episodes, max_timesteps=max_timesteps, max_buffer_length=max_buffer_length, 
                   log_interval=log_interval)
agent.train()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=400, bias=True)
  (1): ReLU()
  (2): Linear(in_features=400, out_features=300, bias=True)
  (3): ReLU()
  (4): Linear(in_features=300, out_features=4, bias=True)
  (5): Sigmoid()
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=400, bias=True)
  (1): ReLU()
  (2): Linear(in_features=400, out_features=300, bias=True)
  (3): ReLU()
  (4): Linear(in_features=300, out_features=4, bias=True)
  (5): Sigmoid()
)
CRITIC=Sequential(
  (0): Linear(in_features=28, out_features=400, bias=True)
  (1): ReLU()
  (2): Linear(in_features=400, out_features=300, bias=True)
  (3): ReLU()
  (4): Linear(in_features=300, out_features=1, bias=True)
)
CRITIC=Sequential(
  (0): Linear(in_features=28, out_features=400, bias=Tr

Ep:580   Rew:-121.84  Avg Rew:-121.84  LR:0.00047259   Polyak:0.99900  Bf: 8  EN:0.0945  Loss: 5.301 1.553 1.480
Ep:590   Rew:-111.87  Avg Rew:-111.87  LR:0.00047214   Polyak:0.99900  Bf: 8  EN:0.0944  Loss: 5.319 1.585 1.558
Ep:600   Rew:-121.38  Avg Rew:-121.38  LR:0.00047170   Polyak:0.99900  Bf: 8  EN:0.0943  Loss: 5.275 1.862 1.735
Ep:610   Rew:-122.05  Avg Rew:-122.05  LR:0.00047125   Polyak:0.99900  Bf: 8  EN:0.0943  Loss: 5.356 1.716 1.721
Ep:620   Rew:-115.97  Avg Rew:-115.97  LR:0.00047081   Polyak:0.99900  Bf: 8  EN:0.0942  Loss: 5.333 1.902 1.781
Ep:630   Rew:-120.49  Avg Rew:-120.49  LR:0.00047037   Polyak:0.99900  Bf: 8  EN:0.0941  Loss: 5.306 1.614 1.479
Ep:640   Rew:-122.26  Avg Rew:-122.26  LR:0.00046992   Polyak:0.99900  Bf: 8  EN:0.0940  Loss: 5.269 1.731 1.669
Ep:650   Rew:-119.16  Avg Rew:-119.16  LR:0.00046948   Polyak:0.99900  Bf: 8  EN:0.0939  Loss: 5.252 1.713 1.651
Ep:660   Rew:-124.08  Avg Rew:-124.08  LR:0.00046904   Polyak:0.99900  Bf: 8  EN:0.0938  Loss: 5

KeyboardInterrupt: 

In [None]:
agent.test()