In [1]:
import gym
import os
import time
import torch
import numpy as np
from gym import wrappers
from PIL import Image
from itertools import count
from collections import namedtuple

from DDPG.ddpg import DDPG
from DDPG.utils import ReplayBuffer, mkdir

In [2]:
env_name = 'BipedalWalker-v2'
lr_base = 0.001
lr_decay = 0.0001
exp_noise_base = 0.5 
exp_noise_decay = 0.002

random_seed = 42
gamma = 0.99                # discount for future rewards
batch_size = 1024           # num of transitions sampled from replay buffer
polyak = 0.999               # target policy update parameter (1-tau)
max_episodes = 100000         # max num of episodes
max_timesteps = 3000        # max timesteps in one episode
max_buffer_length = 5000000
log_interval = 10           # print avg reward after interval

In [3]:
actor_config = [
        {'dim': [None, 64], 'dropout': False, 'activation': 'relu'},
        {'dim': [64, 64], 'dropout': False, 'activation': 'relu'},
        {'dim': [64, None], 'dropout': False, 'activation': 'sigmoid'}
    ]
    
critic_config = [
        {'dim': [None, 64], 'dropout': False, 'activation': 'relu'},
        {'dim': [64, 64], 'dropout': False , 'activation':'relu'},
        {'dim': [64, 1], 'dropout': False, 'activation': False}
    ]

In [4]:
class DDPGTrainer():
    
    def __init__(self, env_name, actor_config, critic_config, random_seed=42, lr_base=0.001, lr_decay=0.00005, 
                 exp_noise_base=0.3, exp_noise_decay=0.0001, exploration_mu=0, exploration_theta=0.15, 
                 exploration_sigma=0.2, gamma=0.99, batch_size=1024, polyak=0.9999,
                 max_episodes=100000, max_timesteps=3000, max_buffer_length=5000000, 
                 log_interval=5, threshold=None, lr_minimum=1e-10, exp_noise_minimum=1e-10,
                 record_videos=True, record_interval=100):        
        
        self.algorithm_name = 'ddpg'
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.record_videos = record_videos
        self.record_interval = record_interval        
        if self.record_videos == True:
            videos_dir = mkdir('.', 'videos')
            monitor_dir = mkdir(videos_dir, self.algorithm_name)
            should_record = lambda i: self.should_record
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)            
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_low = self.env.action_space.low
        self.action_high = self.env.action_space.high        
        self.should_record = False
        if not threshold == None:
            self.threshold = threshold
        else:    
            self.threshold = self.env.spec.reward_threshold
        
        self.actor_config = actor_config
        self.critic_config = critic_config
        self.actor_config[0]['dim'][0] = self.state_dim
        self.actor_config[-1]['dim'][1] = self.action_dim
        self.critic_config[0]['dim'][0] = self.state_dim + self.action_dim        
        
        self.random_seed = random_seed
        self.lr_base = lr_base
        self.lr_decay = lr_decay
        self.lr_minimum = lr_minimum
        self.exp_noise_base = exp_noise_base
        self.exp_noise_decay = exp_noise_decay     
        self.exp_noise_minimum = exp_noise_minimum
        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.gamma = gamma
        self.batch_size = batch_size        
        self.polyak = polyak        
        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.max_buffer_length = max_buffer_length
        self.log_interval = log_interval
        
        prdir = mkdir('.', 'preTrained')
        self.directory = mkdir(prdir, self.algorithm_name)
        self.filename = "{}_{}_{}".format(self.algorithm_name, self.env_name, self.random_seed)
        
        self.policy = DDPG(self.actor_config, self.critic_config, self.action_dim, self.action_low, self.action_high, 
                           self.exploration_mu, exploration_theta, exploration_sigma)   
        self.replay_buffer = ReplayBuffer(max_length=self.max_buffer_length)
        
        self.reward_history = []
        self.make_plots = False       
        
        if self.random_seed:
            print("Random Seed: {}".format(self.random_seed))
            self.env.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)
            
    def train(self):
        
        start_time = time.time()
        print("Training started ... \n")
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={}".format(self.threshold)) 
        print("action_low={} action_high={} \n".format(self.action_low, self.action_high))    

        # loading models
        self.policy.load(self.directory, self.filename)
                
        # logging variables:        
        log_f = open("train_{}.txt".format(self.algorithm_name), "w+")
        
        avg_actor_loss = 0.0
        avg_critic_loss = 0.0
        
        # training procedure:        
        for episode in range(self.max_episodes):
            
            # Only record video during evaluation, every n steps
            if episode % self.record_interval == 0:
                self.should_record = True
            
            ep_reward = 0.0        
            state = self.env.reset()
                       
            # calculate params
            noise_coeff = max(self.exp_noise_base / (1.0 + episode * self.exp_noise_decay), self.exp_noise_minimum)
            learning_rate = max(self.lr_base / (1.0 + episode * self.lr_decay), self.lr_minimum)      
            self.policy.set_optimizers(lr=learning_rate)
           
            for t in range(self.max_timesteps):
                
                action = self.policy.select_action(state, noise_coeff)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.add((state, action, reward, next_state, float(done)))
                
                # Updating policy
                self.policy.update(self.replay_buffer, self.batch_size, self.gamma, self.polyak)
                
                state = next_state               
                ep_reward += reward            
                
                if done:
                    break

            self.reward_history.append(ep_reward)
            avg_reward = np.mean(self.reward_history[-100:]) 
           
            
            # logging updates:        
            log_f.write('{},{}\n'.format(episode, ep_reward))
            log_f.flush()
           
            if len(self.policy.actor_loss_list) > 0:               
                avg_actor_loss = np.mean(self.policy.actor_loss_list[-100:])
                avg_critic_loss = np.mean(self.policy.critic_loss_list[-100:])           
            
            if not self.make_plots and len(self.policy.actor_loss_list) > 200:
                self.policy.actor_loss_list.pop(0)
                self.policy.critic_loss_list.pop(0)               
                self.reward_history.pop(0)                

            # Print avg reward every log interval:
            if episode % self.log_interval == 0:            
                self.policy.save(self.directory, self.filename)
                print("Ep:{:5d}  Rew:{:8.2f}  Avg Rew:{:8.2f}  LR:{:8.8f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f}".format(
                    episode, ep_reward, avg_reward, learning_rate, self.replay_buffer.get_fill(), 
                    noise_coeff, avg_actor_loss, avg_critic_loss))
                        
            self.should_record = False
            
            # if avg reward > threshold then save and stop traning:
            if avg_reward >= self.threshold and episode > 100: 
                print("Ep:{:5d}  Rew:{:8.2f}  Avg Rew:{:8.2f}  LR:{:8.8f}  Bf:{:2.0f}  EN:{:0.4f}  Loss: {:5.3f} {:5.3f}".format(
                    episode, ep_reward, avg_reward, learning_rate, self.replay_buffer.get_fill(), 
                    noise_coeff, avg_actor_loss, avg_critic_loss))
                print("########## Solved! ###########")
                name = self.filename + '_solved'
                self.policy.save(self.directory, name)
                log_f.close()
                self.env.close()  
                training_time = time.time() - start_time
                print("Training time: {:6.2f} sec".format(training_time))
                break    
                                
    def test(self, episodes=3, render=True, save_gif=True):              

        gifdir = mkdir('.','gif')
        algdir = mkdir(gifdir, self.algorithm_name)
        
        for episode in range(1, episodes+1):
            ep_reward = 0.0
            state = self.env.reset()    
            epdir = mkdir(algdir, str(episode))
                       
            for t in range(self.max_timesteps):
                action = self.policy.select_action(state, 0)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.add((state, action, reward, next_state, float(done)))
                state = next_state               
                ep_reward += reward                                  
                
                if save_gif:                                       
                    img = self.env.render(mode = 'rgb_array')
                    img = Image.fromarray(img)
                    img.save('{}/{}.jpg'.format(epdir, t))
                if done:
                    break
                    
            print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))           
            self.env.close()        
                   
            

In [5]:
agent = DDPGTrainer(env_name, actor_config, critic_config, random_seed=random_seed, lr_base=lr_base, lr_decay=lr_decay, 
                   exp_noise_base=exp_noise_base, exp_noise_decay=exp_noise_decay, gamma=gamma, batch_size=batch_size,
                   polyak=polyak, max_episodes=max_episodes, max_timesteps=max_timesteps, 
                   max_buffer_length=max_buffer_length, log_interval=log_interval)
agent.train()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=4, bias=True)
  (5): Sigmoid()
)
ACTOR=Sequential(
  (0): Linear(in_features=24, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=4, bias=True)
  (5): Sigmoid()
)
CRITIC=Sequential(
  (0): Linear(in_features=28, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=1, bias=True)
)
CRITIC=Sequential(
  (0): Linear(in_features=28, out_features=64, bias=True)
  (1): Re

Ep:  730  Rew: -100.28  Avg Rew: -106.37  LR:0.00093197  Bf: 2  EN:0.2033  Loss: -23.060 8.886
Ep:  740  Rew: -101.48  Avg Rew: -101.92  LR:0.00093110  Bf: 2  EN:0.2016  Loss: -22.771 9.178
Ep:  750  Rew:  -99.37  Avg Rew: -101.81  LR:0.00093023  Bf: 2  EN:0.2000  Loss: -22.774 10.606
Ep:  760  Rew: -102.94  Avg Rew: -102.26  LR:0.00092937  Bf: 2  EN:0.1984  Loss: -22.531 10.199
Ep:  770  Rew: -115.94  Avg Rew: -102.26  LR:0.00092851  Bf: 2  EN:0.1969  Loss: -22.776 10.416
Ep:  780  Rew:  -97.16  Avg Rew: -102.93  LR:0.00092764  Bf: 2  EN:0.1953  Loss: -22.762 10.475
Ep:  790  Rew: -106.71  Avg Rew: -103.50  LR:0.00092678  Bf: 2  EN:0.1938  Loss: -22.764 10.093
Ep:  800  Rew: -105.85  Avg Rew: -103.24  LR:0.00092593  Bf: 2  EN:0.1923  Loss: -22.489 10.496
Ep:  810  Rew:  -99.84  Avg Rew: -104.26  LR:0.00092507  Bf: 2  EN:0.1908  Loss: -22.387 11.094
Ep:  820  Rew: -107.27  Avg Rew: -105.83  LR:0.00092421  Bf: 2  EN:0.1894  Loss: -22.088 10.646
Ep:  830  Rew: -120.17  Avg Rew: -107.02  

Ep: 1590  Rew:  169.74  Avg Rew:  -38.82  LR:0.00086281  Bf:12  EN:0.1196  Loss: -36.960 5.957
Ep: 1600  Rew:  -32.41  Avg Rew:   39.84  LR:0.00086207  Bf:12  EN:0.1190  Loss: -36.292 5.830
Ep: 1610  Rew:  -72.82  Avg Rew:   26.29  LR:0.00086133  Bf:12  EN:0.1185  Loss: -36.063 5.793
Ep: 1620  Rew:  -76.42  Avg Rew:  -46.97  LR:0.00086059  Bf:12  EN:0.1179  Loss: -35.847 6.859
Ep: 1630  Rew: -103.69  Avg Rew:  -43.75  LR:0.00085985  Bf:12  EN:0.1174  Loss: -35.184 6.516
Ep: 1640  Rew:  -54.81  Avg Rew:  -50.33  LR:0.00085911  Bf:13  EN:0.1168  Loss: -34.670 6.263
Ep: 1650  Rew:   98.10  Avg Rew:  -19.84  LR:0.00085837  Bf:13  EN:0.1163  Loss: -33.856 5.867
Ep: 1660  Rew:   67.07  Avg Rew:  -31.37  LR:0.00085763  Bf:13  EN:0.1157  Loss: -33.030 6.612
Ep: 1670  Rew:  -54.55  Avg Rew:  -65.35  LR:0.00085690  Bf:13  EN:0.1152  Loss: -32.397 5.212
Ep: 1680  Rew:  -74.06  Avg Rew:  -91.51  LR:0.00085616  Bf:13  EN:0.1147  Loss: -32.045 5.286
Ep: 1690  Rew:   56.94  Avg Rew:  -44.08  LR:0.000

Ep: 2460  Rew: -133.70  Avg Rew: -115.24  LR:0.00080257  Bf:22  EN:0.0845  Loss: -41.645 7.083
Ep: 2470  Rew: -125.67  Avg Rew: -137.83  LR:0.00080192  Bf:22  EN:0.0842  Loss: -40.913 6.168
Ep: 2480  Rew: -139.39  Avg Rew: -123.33  LR:0.00080128  Bf:23  EN:0.0839  Loss: -39.959 6.819
Ep: 2490  Rew:  174.41  Avg Rew: -105.87  LR:0.00080064  Bf:23  EN:0.0836  Loss: -38.348 6.758
Ep: 2500  Rew:  230.81  Avg Rew:  -38.49  LR:0.00080000  Bf:23  EN:0.0833  Loss: -37.206 6.428
Ep: 2510  Rew:  226.79  Avg Rew:   69.89  LR:0.00079936  Bf:23  EN:0.0831  Loss: -36.488 6.829
Ep: 2520  Rew:  -37.62  Avg Rew:  102.42  LR:0.00079872  Bf:23  EN:0.0828  Loss: -35.987 5.963
Ep: 2530  Rew: -100.76  Avg Rew:   41.93  LR:0.00079808  Bf:24  EN:0.0825  Loss: -35.643 5.394
Ep: 2540  Rew: -112.71  Avg Rew:   -0.23  LR:0.00079745  Bf:24  EN:0.0822  Loss: -35.182 6.156
Ep: 2550  Rew:  -97.23  Avg Rew:  -48.72  LR:0.00079681  Bf:24  EN:0.0820  Loss: -34.969 6.050
Ep: 2560  Rew:  -18.68  Avg Rew:  -37.75  LR:0.000

KeyboardInterrupt: 

In [None]:
agent.test()