In [1]:
import gym
import os
import time
import torch
import numpy as np
from gym import wrappers
from PIL import Image
from itertools import count
from collections import namedtuple

from DDQN.ddqn import DDQN
from DDQN.utils import ReplayBuffer, mkdir

In [2]:
# --- Environment and reproducibility ---------------------------------------
env_name = 'MountainCar-v0'
random_seed = 42

# --- Per-episode schedules: value = base / (1 + episode * decay) -----------
lr_base = 1e-3
lr_decay = 1e-4
epsilon_base = 0.99
epsilon_decay = 2e-3

# --- Learning ---------------------------------------------------------------
gamma = 0.99                  # discount factor applied to future rewards
batch_size = 256              # transitions sampled from the replay buffer per update
max_buffer_length = 5000000   # capacity handed to the replay buffer

# --- Run length and logging -------------------------------------------------
max_episodes = 100000         # upper bound on the number of training episodes
max_timesteps = 3000          # upper bound on the steps of a single episode
log_interval = 10             # print progress (and save the model) every N episodes

# Note: the stopping threshold is not set here; DDQNTrainer falls back to the
# environment's own reward threshold unless `threshold=` is passed (e.g. -100).

In [3]:
# Layer specification of the fully connected Q-network, one dict per layer.
# The two `None` entries are placeholders: DDQNTrainer fills the first layer's
# input size and the last layer's output size from the environment's
# observation and action dimensions.
fc_config = [
    dict(dim=[None, 128], dropout=False, activation='relu'),
    dict(dim=[128, None], dropout=False, activation=False),
]

In [4]:
class DDQNTrainer():
    """Train and evaluate a DDQN agent on a gym environment with discrete actions.

    The trainer owns the environment (optionally wrapped in ``wrappers.Monitor``
    for video recording), the ``DDQN`` policy and the ``ReplayBuffer``.
    ``train`` runs epsilon-greedy episodes with per-episode decayed epsilon and
    learning rate; ``test`` runs greedy episodes and can save rendered frames.
    """

    # Single source of truth for the progress line printed by train().
    _PROGRESS_FORMAT = ("Ep:{:5d}  Rew:{:8.2f}  Avg Rew:{:8.2f}  LR:{:8.8f}  "
                        "Bf:{:2.0f}  EPS:{:0.4f}  Loss: {:8.6f}")

    def __init__(self, env_name, fc_config, random_seed=42, lr_base=0.001, lr_decay=0.00005,
                 epsilon_base=0.3, epsilon_decay=0.0001, gamma=0.99, batch_size=1024,
                 max_episodes=100000, max_timesteps=3000, max_buffer_length=5000000,
                 log_interval=5, threshold=None, lr_minimum=1e-10, epsilon_minimum=1e-10,
                 record_videos=True, record_interval=100):
        """Create the environment, the policy network and the replay buffer.

        Args:
            env_name: Environment id passed to ``gym.make``.
            fc_config: List of layer dicts for the Q-network. The input size of
                the first layer and the output size of the last layer are set
                from the environment; this is done on a copy, so the caller's
                list is left untouched.
            random_seed: Seed applied to the environment, torch and numpy.
                A falsy value (``None`` or ``0``) disables seeding.
            lr_base, lr_decay, lr_minimum: Learning-rate schedule,
                ``max(lr_base / (1 + episode * lr_decay), lr_minimum)``.
            epsilon_base, epsilon_decay, epsilon_minimum: Exploration schedule
                with the same shape as the learning-rate schedule.
            gamma: Discount factor forwarded to ``policy.update``.
            batch_size: Batch size forwarded to ``policy.update``.
            max_episodes: Maximum number of training episodes.
            max_timesteps: Maximum number of steps per episode.
            max_buffer_length: Size passed to ``ReplayBuffer``.
            log_interval: Every ``log_interval`` episodes the policy is saved
                and a progress line is printed.
            threshold: Average reward (over up to the last 100 episodes) at
                which training stops. Defaults to the environment spec's
                ``reward_threshold``.
            record_videos: When true, wrap the environment in
                ``wrappers.Monitor``.
            record_interval: Episodes whose index is a multiple of this value
                are flagged for video recording.
        """
        self.algorithm_name = 'ddqn'
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.record_videos = record_videos
        self.record_interval = record_interval
        # Read lazily by the Monitor callback below and toggled per episode in train().
        self.should_record = False
        if self.record_videos:
            videos_dir = mkdir('.', 'videos')
            monitor_dir = mkdir(videos_dir, self.algorithm_name)
            should_record = lambda episode_id: self.should_record
            self.env = wrappers.Monitor(self.env, monitor_dir, video_callable=should_record, force=True)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        # An explicit threshold wins; otherwise use the one from the env spec
        # (which may itself be None for environments without one).
        if threshold is not None:
            self.threshold = threshold
        else:
            self.threshold = self.env.spec.reward_threshold

        # Copy the layer dicts and their 'dim' lists before filling in the
        # environment-dependent sizes so the caller's config is not mutated.
        self.fc_config = [dict(layer, dim=list(layer['dim'])) for layer in fc_config]
        self.fc_config[0]['dim'][0] = self.state_dim
        self.fc_config[-1]['dim'][1] = self.action_dim

        self.random_seed = random_seed
        self.lr_base = lr_base
        self.lr_decay = lr_decay
        self.lr_minimum = lr_minimum
        self.epsilon_base = epsilon_base
        self.epsilon_decay = epsilon_decay
        self.epsilon_minimum = epsilon_minimum
        self.gamma = gamma
        self.batch_size = batch_size
        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.max_buffer_length = max_buffer_length
        self.log_interval = log_interval

        prdir = mkdir('.', 'preTrained')
        self.directory = mkdir(prdir, self.algorithm_name)
        self.filename = "{}_{}_{}".format(self.algorithm_name, self.env_name, self.random_seed)

        self.policy = DDQN(self.env, self.fc_config)
        self.replay_buffer = ReplayBuffer(size=self.max_buffer_length)

        self.reward_history = []
        # When False, train() truncates the reward/loss histories each episode.
        self.make_plots = False

        if self.random_seed:
            print("Random Seed: {}".format(self.random_seed))
            self.env.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)

    def _progress_line(self, episode, ep_reward, avg_reward, learning_rate, epsilon, avg_Q_loss):
        """Format one progress line, including the current replay-buffer fill."""
        return self._PROGRESS_FORMAT.format(
            episode, ep_reward, avg_reward, learning_rate,
            self.replay_buffer.get_fill(), epsilon, avg_Q_loss)

    def train(self):
        """Run the training loop until solved or ``max_episodes`` is reached.

        Side effects: loads any previously saved model, writes one
        ``episode,reward`` line per episode to ``train_<algorithm>.txt`` in the
        working directory, saves the policy every ``log_interval`` episodes and
        prints progress. Training counts as solved when the average reward of
        (up to) the last 100 episodes reaches ``self.threshold`` after more
        than 100 episodes; the policy is then saved with a ``_solved`` suffix
        and the environment is closed.
        """
        start_time = time.time()
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={} \n".format(self.threshold))

        # loading models
        self.policy.load(self.directory, self.filename)

        # Stays NaN until the policy has reported at least one loss, so the
        # progress line can always be printed.
        avg_Q_loss = float('nan')

        # The context manager closes the log on every exit path, including
        # KeyboardInterrupt and running out of episodes.
        with open("train_{}.txt".format(self.algorithm_name), "w+") as log_f:

            print("\nTraining started ... ")

            for episode in range(self.max_episodes):

                # Flag every record_interval-th episode for video recording.
                if episode % self.record_interval == 0:
                    self.should_record = True

                ep_reward = 0.0
                state = self.env.reset()

                # Per-episode decayed exploration rate and learning rate.
                epsilon = max(self.epsilon_base / (1.0 + episode * self.epsilon_decay), self.epsilon_minimum)
                learning_rate = max(self.lr_base / (1.0 + episode * self.lr_decay), self.lr_minimum)
                self.policy.set_optimizers(lr=learning_rate)

                for t in range(self.max_timesteps):

                    action = self.policy.select_action(state, epsilon)
                    next_state, reward, done, _ = self.env.step(action)
                    self.replay_buffer.add(state, action, reward, next_state, float(done))

                    # Updating policy
                    self.policy.update(self.replay_buffer, t, self.batch_size, self.gamma)

                    state = next_state
                    ep_reward += reward

                    if done:
                        break

                self.reward_history.append(ep_reward)
                avg_reward = np.mean(self.reward_history[-100:])

                # logging updates:
                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                if len(self.policy.Q_loss_list) > 0:
                    avg_Q_loss = np.mean(self.policy.Q_loss_list[-100:])

                # Truncate training history if we don't plan to plot it later
                if not self.make_plots:
                    self.policy.truncate_loss_lists()
                    if len(self.reward_history) > 100:
                        self.reward_history.pop(0)

                # Save and print progress every log interval:
                if episode % self.log_interval == 0:
                    self.policy.save(self.directory, self.filename)
                    print(self._progress_line(episode, ep_reward, avg_reward,
                                              learning_rate, epsilon, avg_Q_loss))

                self.should_record = False

                # A missing threshold (None) means there is no stopping criterion.
                solved = (self.threshold is not None
                          and avg_reward >= self.threshold
                          and episode > 100)
                if solved:
                    print(self._progress_line(episode, ep_reward, avg_reward,
                                              learning_rate, epsilon, avg_Q_loss))
                    print("########## Solved! ###########")
                    name = self.filename + '_solved'
                    self.policy.save(self.directory, name)
                    self.env.close()
                    training_time = time.time() - start_time
                    print("Training time: {:6.2f} sec".format(training_time))
                    break

    def test(self, episodes=3, render=True, save_gif=True):
        """Run greedy (epsilon = 0) evaluation episodes and print their rewards.

        Args:
            episodes: Number of evaluation episodes.
            render: Accepted for interface compatibility; currently unused.
            save_gif: When true, every step is rendered with
                ``mode='rgb_array'`` and saved as ``<t>.jpg`` in a per-episode
                directory created via ``mkdir`` under ``gif/<algorithm>``.

        The environment is closed once, after the last episode (or on error).
        """
        gifdir = mkdir('.', 'gif')
        algdir = mkdir(gifdir, self.algorithm_name)

        try:
            for episode in range(1, episodes + 1):
                ep_reward = 0.0
                state = self.env.reset()
                epdir = mkdir(algdir, str(episode))

                for t in range(self.max_timesteps):
                    action = self.policy.select_action(state, 0)
                    next_state, reward, done, _ = self.env.step(action)
                    state = next_state
                    ep_reward += reward

                    if save_gif:
                        img = self.env.render(mode='rgb_array')
                        img = Image.fromarray(img)
                        img.save('{}/{}.jpg'.format(epdir, t))
                    if done:
                        break

                print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))
        finally:
            # Close only after all episodes; closing inside the loop would
            # hand a closed environment to the next episode.
            self.env.close()
                   
            

In [5]:
# Build the trainer from the hyper-parameters defined in the configuration
# cell, then run the training loop (long-running; prints progress as it goes).
agent = DDQNTrainer(
    env_name,
    fc_config,
    random_seed=random_seed,
    lr_base=lr_base,
    lr_decay=lr_decay,
    epsilon_base=epsilon_base,
    epsilon_decay=epsilon_decay,
    gamma=gamma,
    batch_size=batch_size,
    max_episodes=max_episodes,
    max_timesteps=max_timesteps,
    max_buffer_length=max_buffer_length,
    log_interval=log_interval,
)
agent.train()

NETWORK=Sequential(
  (0): Linear(in_features=2, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=3, bias=True)
)
NETWORK=Sequential(
  (0): Linear(in_features=2, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=3, bias=True)
)
Random Seed: 42
action_space=Discrete(3)
obs_space=Box(2,)
threshold=-110.0 

DIR=./preTrained/ddqn NAME=ddqn_MountainCar-v0_42
No models to load

Training started ... 
Ep:    0  Rew: -200.00  Avg Rew: -200.00  LR:0.00100000  Bf: 0  EPS:0.9900  Loss: 0.004990
Ep:   10  Rew: -200.00  Avg Rew: -200.00  LR:0.00099900  Bf: 0  EPS:0.9706  Loss: 0.386042
Ep:   20  Rew: -200.00  Avg Rew: -200.00  LR:0.00099800  Bf: 0  EPS:0.9519  Loss: 1.618121
Ep:   30  Rew: -200.00  Avg Rew: -200.00  LR:0.00099701  Bf: 0  EPS:0.9340  Loss: 2.502021
Ep:   40  Rew: -200.00  Avg Rew: -200.00  LR:0.00099602  Bf: 0  EPS:0.9167  Loss: 4.079654
Ep:   50  Rew: -200.00  Avg Rew: -200.00  LR:0.00099502  Bf: 0  EPS:0.90

Ep:  850  Rew: -200.00  Avg Rew: -197.75  LR:0.00092166  Bf: 3  EPS:0.3667  Loss: 14.097424
Ep:  860  Rew: -200.00  Avg Rew: -197.44  LR:0.00092081  Bf: 3  EPS:0.3640  Loss: 9.850342
Ep:  870  Rew: -200.00  Avg Rew: -197.40  LR:0.00091996  Bf: 3  EPS:0.3613  Loss: 13.850280
Ep:  880  Rew: -200.00  Avg Rew: -197.40  LR:0.00091912  Bf: 4  EPS:0.3587  Loss: 11.951181
Ep:  890  Rew: -200.00  Avg Rew: -197.66  LR:0.00091827  Bf: 4  EPS:0.3561  Loss: 10.790168
Ep:  900  Rew: -200.00  Avg Rew: -197.66  LR:0.00091743  Bf: 4  EPS:0.3536  Loss: 10.599971
Ep:  910  Rew: -200.00  Avg Rew: -198.08  LR:0.00091659  Bf: 4  EPS:0.3511  Loss: 11.252009
Ep:  920  Rew: -200.00  Avg Rew: -198.60  LR:0.00091575  Bf: 4  EPS:0.3486  Loss: 10.095059
Ep:  930  Rew: -200.00  Avg Rew: -198.60  LR:0.00091491  Bf: 4  EPS:0.3462  Loss: 12.030513
Ep:  940  Rew: -200.00  Avg Rew: -199.19  LR:0.00091408  Bf: 4  EPS:0.3438  Loss: 13.307950
Ep:  950  Rew: -196.00  Avg Rew: -199.53  LR:0.00091324  Bf: 4  EPS:0.3414  Loss:

Ep: 1750  Rew: -156.00  Avg Rew: -153.66  LR:0.00085106  Bf: 7  EPS:0.2200  Loss: 6.940595
Ep: 1760  Rew: -150.00  Avg Rew: -153.53  LR:0.00085034  Bf: 7  EPS:0.2190  Loss: 7.350378
Ep: 1770  Rew: -123.00  Avg Rew: -152.42  LR:0.00084962  Bf: 7  EPS:0.2181  Loss: 7.516351
Ep: 1780  Rew:  -99.00  Avg Rew: -151.85  LR:0.00084890  Bf: 7  EPS:0.2171  Loss: 7.627375
Ep: 1790  Rew: -168.00  Avg Rew: -151.27  LR:0.00084818  Bf: 7  EPS:0.2162  Loss: 6.712422
Ep: 1800  Rew: -154.00  Avg Rew: -150.78  LR:0.00084746  Bf: 7  EPS:0.2152  Loss: 6.863842
Ep: 1810  Rew: -200.00  Avg Rew: -151.35  LR:0.00084674  Bf: 7  EPS:0.2143  Loss: 6.350220
Ep: 1820  Rew: -163.00  Avg Rew: -151.30  LR:0.00084602  Bf: 7  EPS:0.2134  Loss: 8.892276
Ep: 1830  Rew: -115.00  Avg Rew: -150.91  LR:0.00084531  Bf: 7  EPS:0.2124  Loss: 6.871855
Ep: 1840  Rew: -156.00  Avg Rew: -151.15  LR:0.00084459  Bf: 7  EPS:0.2115  Loss: 6.577690
Ep: 1850  Rew: -163.00  Avg Rew: -151.47  LR:0.00084388  Bf: 7  EPS:0.2106  Loss: 8.298789

Ep: 2660  Rew: -172.00  Avg Rew: -152.87  LR:0.00078989  Bf: 9  EPS:0.1566  Loss: 5.715381
Ep: 2670  Rew: -145.00  Avg Rew: -151.56  LR:0.00078927  Bf: 9  EPS:0.1562  Loss: 6.591532
Ep: 2680  Rew: -151.00  Avg Rew: -152.71  LR:0.00078864  Bf: 9  EPS:0.1557  Loss: 4.801647
Ep: 2690  Rew: -155.00  Avg Rew: -152.43  LR:0.00078802  Bf: 9  EPS:0.1552  Loss: 5.833578
Ep: 2700  Rew:  -91.00  Avg Rew: -151.32  LR:0.00078740  Bf: 9  EPS:0.1547  Loss: 4.971373
Ep: 2710  Rew: -155.00  Avg Rew: -150.19  LR:0.00078678  Bf: 9  EPS:0.1542  Loss: 5.647870
Ep: 2720  Rew: -115.00  Avg Rew: -149.45  LR:0.00078616  Bf: 9  EPS:0.1537  Loss: 5.836919
Ep: 2730  Rew: -154.00  Avg Rew: -149.16  LR:0.00078555  Bf: 9  EPS:0.1533  Loss: 7.179959
Ep: 2740  Rew: -117.00  Avg Rew: -148.05  LR:0.00078493  Bf:10  EPS:0.1528  Loss: 6.935260
Ep: 2750  Rew: -153.00  Avg Rew: -147.79  LR:0.00078431  Bf:10  EPS:0.1523  Loss: 4.484557
Ep: 2760  Rew: -147.00  Avg Rew: -146.99  LR:0.00078370  Bf:10  EPS:0.1518  Loss: 6.329120

Ep: 3570  Rew: -151.00  Avg Rew: -155.22  LR:0.00073692  Bf:12  EPS:0.1216  Loss: 5.197507
Ep: 3580  Rew: -155.00  Avg Rew: -155.57  LR:0.00073638  Bf:12  EPS:0.1213  Loss: 6.459891
Ep: 3590  Rew: -173.00  Avg Rew: -154.72  LR:0.00073584  Bf:12  EPS:0.1210  Loss: 6.301840
Ep: 3600  Rew: -149.00  Avg Rew: -153.57  LR:0.00073529  Bf:12  EPS:0.1207  Loss: 4.899354
Ep: 3610  Rew: -154.00  Avg Rew: -153.79  LR:0.00073475  Bf:12  EPS:0.1204  Loss: 6.400603
Ep: 3620  Rew: -148.00  Avg Rew: -152.95  LR:0.00073421  Bf:12  EPS:0.1201  Loss: 5.572542
Ep: 3630  Rew:  -99.00  Avg Rew: -151.62  LR:0.00073368  Bf:12  EPS:0.1199  Loss: 5.193350
Ep: 3640  Rew: -200.00  Avg Rew: -152.02  LR:0.00073314  Bf:12  EPS:0.1196  Loss: 4.528398
Ep: 3650  Rew: -158.00  Avg Rew: -150.29  LR:0.00073260  Bf:12  EPS:0.1193  Loss: 6.472761
Ep: 3660  Rew: -111.00  Avg Rew: -149.85  LR:0.00073206  Bf:12  EPS:0.1190  Loss: 4.998066
Ep: 3670  Rew: -153.00  Avg Rew: -150.96  LR:0.00073153  Bf:12  EPS:0.1187  Loss: 5.475140

Ep: 4480  Rew: -140.00  Avg Rew: -149.27  LR:0.00069061  Bf:15  EPS:0.0994  Loss: 4.909371
Ep: 4490  Rew: -142.00  Avg Rew: -148.04  LR:0.00069013  Bf:15  EPS:0.0992  Loss: 4.519976
Ep: 4500  Rew: -136.00  Avg Rew: -146.82  LR:0.00068966  Bf:15  EPS:0.0990  Loss: 4.598806
Ep: 4510  Rew: -146.00  Avg Rew: -146.56  LR:0.00068918  Bf:15  EPS:0.0988  Loss: 4.746810
Ep: 4520  Rew: -142.00  Avg Rew: -146.31  LR:0.00068871  Bf:15  EPS:0.0986  Loss: 4.915435
Ep: 4530  Rew: -153.00  Avg Rew: -147.98  LR:0.00068823  Bf:15  EPS:0.0984  Loss: 3.364279
Ep: 4540  Rew: -142.00  Avg Rew: -148.06  LR:0.00068776  Bf:15  EPS:0.0982  Loss: 4.239001
Ep: 4550  Rew: -149.00  Avg Rew: -147.18  LR:0.00068729  Bf:15  EPS:0.0980  Loss: 4.378747
Ep: 4560  Rew: -137.00  Avg Rew: -147.09  LR:0.00068681  Bf:15  EPS:0.0978  Loss: 3.453850
Ep: 4570  Rew: -157.00  Avg Rew: -146.07  LR:0.00068634  Bf:15  EPS:0.0976  Loss: 3.995028
Ep: 4580  Rew: -154.00  Avg Rew: -143.65  LR:0.00068587  Bf:15  EPS:0.0974  Loss: 4.042294

Ep: 5390  Rew: -147.00  Avg Rew: -149.29  LR:0.00064977  Bf:17  EPS:0.0840  Loss: 3.433930
Ep: 5400  Rew: -143.00  Avg Rew: -149.78  LR:0.00064935  Bf:17  EPS:0.0839  Loss: 3.870317
Ep: 5410  Rew: -148.00  Avg Rew: -149.50  LR:0.00064893  Bf:17  EPS:0.0838  Loss: 3.867073
Ep: 5420  Rew: -194.00  Avg Rew: -150.82  LR:0.00064851  Bf:18  EPS:0.0836  Loss: 3.536328
Ep: 5430  Rew: -198.00  Avg Rew: -152.63  LR:0.00064809  Bf:18  EPS:0.0835  Loss: 4.933320
Ep: 5440  Rew: -145.00  Avg Rew: -150.91  LR:0.00064767  Bf:18  EPS:0.0833  Loss: 3.835159
Ep: 5450  Rew: -153.00  Avg Rew: -148.67  LR:0.00064725  Bf:18  EPS:0.0832  Loss: 3.951261
Ep: 5460  Rew: -194.00  Avg Rew: -149.45  LR:0.00064683  Bf:18  EPS:0.0831  Loss: 4.411501
Ep: 5470  Rew: -181.00  Avg Rew: -149.32  LR:0.00064641  Bf:18  EPS:0.0829  Loss: 4.068730
Ep: 5480  Rew: -169.00  Avg Rew: -150.56  LR:0.00064599  Bf:18  EPS:0.0828  Loss: 3.743563
Ep: 5490  Rew:  -96.00  Avg Rew: -150.37  LR:0.00064558  Bf:18  EPS:0.0826  Loss: 3.968714

Ep: 6300  Rew: -147.00  Avg Rew: -152.67  LR:0.00061350  Bf:20  EPS:0.0728  Loss: 3.971530
Ep: 6310  Rew: -180.00  Avg Rew: -152.23  LR:0.00061312  Bf:20  EPS:0.0727  Loss: 4.409852
Ep: 6320  Rew: -146.00  Avg Rew: -151.67  LR:0.00061275  Bf:20  EPS:0.0726  Loss: 4.177970
Ep: 6330  Rew: -174.00  Avg Rew: -152.22  LR:0.00061237  Bf:20  EPS:0.0725  Loss: 4.114833
Ep: 6340  Rew: -168.00  Avg Rew: -149.73  LR:0.00061200  Bf:20  EPS:0.0724  Loss: 2.744611
Ep: 6350  Rew: -102.00  Avg Rew: -150.66  LR:0.00061162  Bf:20  EPS:0.0723  Loss: 3.746577
Ep: 6360  Rew: -168.00  Avg Rew: -150.92  LR:0.00061125  Bf:20  EPS:0.0722  Loss: 4.655162
Ep: 6370  Rew: -151.00  Avg Rew: -150.85  LR:0.00061087  Bf:20  EPS:0.0721  Loss: 3.649405
Ep: 6380  Rew:  -99.00  Avg Rew: -150.09  LR:0.00061050  Bf:20  EPS:0.0719  Loss: 3.030978
Ep: 6390  Rew: -163.00  Avg Rew: -147.98  LR:0.00061013  Bf:20  EPS:0.0718  Loss: 3.144775
Ep: 6400  Rew: -137.00  Avg Rew: -146.61  LR:0.00060976  Bf:20  EPS:0.0717  Loss: 3.649594

Ep: 7210  Rew: -151.00  Avg Rew: -147.70  LR:0.00058106  Bf:23  EPS:0.0642  Loss: 3.717261
Ep: 7220  Rew: -162.00  Avg Rew: -146.19  LR:0.00058072  Bf:23  EPS:0.0641  Loss: 4.302967
Ep: 7230  Rew: -175.00  Avg Rew: -146.27  LR:0.00058038  Bf:23  EPS:0.0640  Loss: 2.668374
Ep: 7240  Rew: -100.00  Avg Rew: -145.73  LR:0.00058005  Bf:23  EPS:0.0640  Loss: 3.505320
Ep: 7250  Rew:  -88.00  Avg Rew: -144.56  LR:0.00057971  Bf:23  EPS:0.0639  Loss: 4.254208
Ep: 7260  Rew: -157.00  Avg Rew: -143.50  LR:0.00057937  Bf:23  EPS:0.0638  Loss: 3.964853
Ep: 7270  Rew: -152.00  Avg Rew: -143.84  LR:0.00057904  Bf:23  EPS:0.0637  Loss: 4.236195
Ep: 7280  Rew: -165.00  Avg Rew: -143.32  LR:0.00057870  Bf:23  EPS:0.0636  Loss: 4.306568
Ep: 7290  Rew: -154.00  Avg Rew: -143.65  LR:0.00057837  Bf:23  EPS:0.0635  Loss: 3.488180
Ep: 7300  Rew: -111.00  Avg Rew: -144.84  LR:0.00057803  Bf:23  EPS:0.0635  Loss: 4.553766
Ep: 7310  Rew: -161.00  Avg Rew: -144.33  LR:0.00057770  Bf:23  EPS:0.0634  Loss: 2.773240

Ep: 8120  Rew: -105.00  Avg Rew: -138.09  LR:0.00055188  Bf:25  EPS:0.0574  Loss: 3.539488
Ep: 8130  Rew: -181.00  Avg Rew: -138.79  LR:0.00055157  Bf:26  EPS:0.0574  Loss: 3.242514
Ep: 8140  Rew: -168.00  Avg Rew: -140.96  LR:0.00055127  Bf:26  EPS:0.0573  Loss: 4.155815
Ep: 8150  Rew: -190.00  Avg Rew: -144.50  LR:0.00055096  Bf:26  EPS:0.0572  Loss: 4.023796
Ep: 8160  Rew: -158.00  Avg Rew: -144.03  LR:0.00055066  Bf:26  EPS:0.0572  Loss: 3.581284
Ep: 8170  Rew: -105.00  Avg Rew: -142.52  LR:0.00055036  Bf:26  EPS:0.0571  Loss: 3.088435
Ep: 8180  Rew: -187.00  Avg Rew: -139.87  LR:0.00055006  Bf:26  EPS:0.0570  Loss: 3.546952
Ep: 8190  Rew: -119.00  Avg Rew: -137.00  LR:0.00054975  Bf:26  EPS:0.0570  Loss: 3.574116
Ep: 8200  Rew: -174.00  Avg Rew: -137.76  LR:0.00054945  Bf:26  EPS:0.0569  Loss: 3.401917
Ep: 8210  Rew: -159.00  Avg Rew: -136.79  LR:0.00054915  Bf:26  EPS:0.0568  Loss: 2.874394
Ep: 8220  Rew: -144.00  Avg Rew: -135.96  LR:0.00054885  Bf:26  EPS:0.0568  Loss: 2.827835

Ep: 9030  Rew: -152.00  Avg Rew: -155.57  LR:0.00052549  Bf:28  EPS:0.0519  Loss: 4.035224
Ep: 9040  Rew: -148.00  Avg Rew: -155.15  LR:0.00052521  Bf:28  EPS:0.0519  Loss: 2.565146
Ep: 9050  Rew: -200.00  Avg Rew: -154.33  LR:0.00052493  Bf:28  EPS:0.0518  Loss: 3.026160
Ep: 9060  Rew: -149.00  Avg Rew: -154.11  LR:0.00052466  Bf:28  EPS:0.0518  Loss: 2.971177
Ep: 9070  Rew: -160.00  Avg Rew: -153.78  LR:0.00052438  Bf:28  EPS:0.0517  Loss: 3.429993
Ep: 9080  Rew: -148.00  Avg Rew: -155.65  LR:0.00052411  Bf:28  EPS:0.0517  Loss: 3.166445
Ep: 9090  Rew: -160.00  Avg Rew: -154.28  LR:0.00052383  Bf:28  EPS:0.0516  Loss: 2.564982
Ep: 9100  Rew: -156.00  Avg Rew: -155.05  LR:0.00052356  Bf:28  EPS:0.0516  Loss: 2.674939
Ep: 9110  Rew: -149.00  Avg Rew: -155.33  LR:0.00052329  Bf:28  EPS:0.0515  Loss: 2.713862
Ep: 9120  Rew: -147.00  Avg Rew: -154.42  LR:0.00052301  Bf:29  EPS:0.0515  Loss: 3.451372
Ep: 9130  Rew: -148.00  Avg Rew: -154.47  LR:0.00052274  Bf:29  EPS:0.0514  Loss: 3.218285

Ep: 9940  Rew: -147.00  Avg Rew: -140.56  LR:0.00050150  Bf:31  EPS:0.0474  Loss: 1.916162
Ep: 9950  Rew: -148.00  Avg Rew: -137.66  LR:0.00050125  Bf:31  EPS:0.0474  Loss: 2.278906
Ep: 9960  Rew: -114.00  Avg Rew: -137.30  LR:0.00050100  Bf:31  EPS:0.0473  Loss: 2.430854
Ep: 9970  Rew: -153.00  Avg Rew: -136.74  LR:0.00050075  Bf:31  EPS:0.0473  Loss: 1.893075
Ep: 9980  Rew: -148.00  Avg Rew: -136.52  LR:0.00050050  Bf:31  EPS:0.0472  Loss: 2.436889
Ep: 9990  Rew: -115.00  Avg Rew: -134.92  LR:0.00050025  Bf:31  EPS:0.0472  Loss: 1.916419
Ep:10000  Rew: -124.00  Avg Rew: -134.70  LR:0.00050000  Bf:31  EPS:0.0471  Loss: 2.940400
Ep:10010  Rew: -152.00  Avg Rew: -132.63  LR:0.00049975  Bf:31  EPS:0.0471  Loss: 2.922482
Ep:10020  Rew: -159.00  Avg Rew: -132.02  LR:0.00049950  Bf:31  EPS:0.0471  Loss: 2.626347
Ep:10030  Rew: -153.00  Avg Rew: -131.88  LR:0.00049925  Bf:31  EPS:0.0470  Loss: 2.919867
Ep:10040  Rew: -112.00  Avg Rew: -131.20  LR:0.00049900  Bf:31  EPS:0.0470  Loss: 2.386424

Ep:10850  Rew: -118.00  Avg Rew: -141.99  LR:0.00047962  Bf:34  EPS:0.0436  Loss: 2.394882
Ep:10860  Rew: -165.00  Avg Rew: -140.09  LR:0.00047939  Bf:34  EPS:0.0436  Loss: 2.890712
Ep:10870  Rew: -115.00  Avg Rew: -138.20  LR:0.00047916  Bf:34  EPS:0.0435  Loss: 3.243202
Ep:10880  Rew: -116.00  Avg Rew: -137.47  LR:0.00047893  Bf:34  EPS:0.0435  Loss: 1.704209
Ep:10890  Rew: -117.00  Avg Rew: -135.60  LR:0.00047870  Bf:34  EPS:0.0435  Loss: 1.944801
Ep:10900  Rew: -194.00  Avg Rew: -134.25  LR:0.00047847  Bf:34  EPS:0.0434  Loss: 2.867929
Ep:10910  Rew: -114.00  Avg Rew: -132.36  LR:0.00047824  Bf:34  EPS:0.0434  Loss: 2.537423
Ep:10920  Rew: -119.00  Avg Rew: -129.67  LR:0.00047801  Bf:34  EPS:0.0433  Loss: 2.123445
Ep:10930  Rew: -144.00  Avg Rew: -126.51  LR:0.00047778  Bf:34  EPS:0.0433  Loss: 3.130141
Ep:10940  Rew: -114.00  Avg Rew: -127.36  LR:0.00047755  Bf:34  EPS:0.0433  Loss: 2.918728
Ep:10950  Rew: -145.00  Avg Rew: -128.70  LR:0.00047733  Bf:34  EPS:0.0432  Loss: 1.518026

Ep:11760  Rew: -154.00  Avg Rew: -121.41  LR:0.00045956  Bf:36  EPS:0.0404  Loss: 2.177112
Ep:11770  Rew: -113.00  Avg Rew: -122.47  LR:0.00045935  Bf:36  EPS:0.0403  Loss: 1.700738
Ep:11780  Rew: -117.00  Avg Rew: -121.75  LR:0.00045914  Bf:36  EPS:0.0403  Loss: 2.056181
Ep:11790  Rew: -158.00  Avg Rew: -123.48  LR:0.00045893  Bf:36  EPS:0.0403  Loss: 2.571173
Ep:11800  Rew: -171.00  Avg Rew: -125.07  LR:0.00045872  Bf:36  EPS:0.0402  Loss: 2.629538
Ep:11810  Rew: -149.00  Avg Rew: -126.75  LR:0.00045851  Bf:36  EPS:0.0402  Loss: 2.369581
Ep:11820  Rew: -156.00  Avg Rew: -127.28  LR:0.00045830  Bf:36  EPS:0.0402  Loss: 2.343340
Ep:11830  Rew: -151.00  Avg Rew: -127.21  LR:0.00045809  Bf:36  EPS:0.0401  Loss: 1.786904
Ep:11840  Rew:  -89.00  Avg Rew: -127.57  LR:0.00045788  Bf:36  EPS:0.0401  Loss: 1.733284
Ep:11850  Rew: -169.00  Avg Rew: -127.94  LR:0.00045767  Bf:36  EPS:0.0401  Loss: 1.975382
Ep:11860  Rew: -119.00  Avg Rew: -127.28  LR:0.00045746  Bf:36  EPS:0.0400  Loss: 1.772088

Ep:12670  Rew: -106.00  Avg Rew: -116.56  LR:0.00044111  Bf:38  EPS:0.0376  Loss: 1.740695
Ep:12680  Rew: -106.00  Avg Rew: -115.65  LR:0.00044092  Bf:38  EPS:0.0376  Loss: 1.859054
Ep:12690  Rew: -109.00  Avg Rew: -117.07  LR:0.00044072  Bf:38  EPS:0.0375  Loss: 2.199795
Ep:12700  Rew: -109.00  Avg Rew: -116.25  LR:0.00044053  Bf:38  EPS:0.0375  Loss: 2.388537
Ep:12710  Rew: -166.00  Avg Rew: -115.91  LR:0.00044033  Bf:38  EPS:0.0375  Loss: 1.677276
Ep:12720  Rew: -108.00  Avg Rew: -116.14  LR:0.00044014  Bf:38  EPS:0.0374  Loss: 1.369292
Ep:12730  Rew:  -91.00  Avg Rew: -115.99  LR:0.00043995  Bf:38  EPS:0.0374  Loss: 1.791656
Ep:12740  Rew: -155.00  Avg Rew: -117.58  LR:0.00043975  Bf:38  EPS:0.0374  Loss: 1.231322
Ep:12750  Rew: -109.00  Avg Rew: -115.63  LR:0.00043956  Bf:38  EPS:0.0374  Loss: 1.875534
Ep:12760  Rew: -153.00  Avg Rew: -117.01  LR:0.00043937  Bf:38  EPS:0.0373  Loss: 2.021767
Ep:12770  Rew: -110.00  Avg Rew: -116.41  LR:0.00043917  Bf:38  EPS:0.0373  Loss: 2.008750

Ep:13580  Rew: -150.00  Avg Rew: -154.96  LR:0.00042409  Bf:41  EPS:0.0352  Loss: 1.695582
Ep:13590  Rew: -200.00  Avg Rew: -156.42  LR:0.00042391  Bf:41  EPS:0.0351  Loss: 1.758156


KeyboardInterrupt: 

In [None]:
# Evaluate the trained agent with its default settings: 3 greedy episodes
# (epsilon = 0), saving every rendered frame as a JPEG in a per-episode
# directory and printing each episode's total reward.
agent.test()