In [1]:
import gym
import os
import time
import torch
import numpy as np
from gym import wrappers
from PIL import Image
from itertools import count
from collections import namedtuple

from DQN.dqn import DQN
from DQN.utils import ReplayBuffer, mkdir

In [2]:
# --- Environment / reproducibility -----------------------------------------
env_name = 'MountainCar-v0'
random_seed = 42

# --- Schedules --------------------------------------------------------------
# Both values decay per episode as: base / (1 + episode * decay).
lr_base = 0.001
lr_decay = 0.0001
epsilon_base = 0.99
epsilon_decay = 0.002

# --- Learning ---------------------------------------------------------------
gamma = 0.99                  # discount applied to future rewards
batch_size = 256              # transitions sampled from the replay buffer per update
max_buffer_length = 5000000   # size handed to the replay buffer

# --- Run length / reporting -------------------------------------------------
max_episodes = 100000         # upper bound on the number of training episodes
max_timesteps = 3000          # upper bound on the steps taken within one episode
log_interval = 10             # print (and checkpoint) every this many episodes
#threshold = -100

In [3]:
# Fully connected layer specification for the Q-network.
# The two `None` dims are placeholders: DQNTrainer replaces the first with the
# observation size and the last with the number of actions.
fc_config = [
    dict(dim=[None, 128], dropout=False, activation='relu'),
    dict(dim=[128, None], dropout=False, activation=False),
]

In [4]:
class DQNTrainer():
    """Train and evaluate a DQN policy on a Gym environment with discrete actions.

    The trainer owns the environment (optionally wrapped in ``wrappers.Monitor``
    for video capture), the ``DQN`` policy and the ``ReplayBuffer``. ``train()``
    runs the epsilon-greedy training loop; ``test()`` rolls the policy out with
    epsilon = 0 and can dump every rendered frame to disk.
    """

    # Layout of the progress line printed during training (shared by the
    # periodic log and the final "solved" report).
    _PROGRESS_FMT = ("Ep:{:5d}  Rew:{:8.2f}  Avg Rew:{:8.2f}  LR:{:8.8f}  "
                     "Bf:{:2.0f}  EPS:{:0.4f}  Loss: {:8.6f}")

    def __init__(self, env_name, fc_config, random_seed=42, lr_base=0.001, lr_decay=0.00005,
                 epsilon_base=0.3, epsilon_decay=0.0001, gamma=0.99, batch_size=1024,
                 max_episodes=100000, max_timesteps=3000, max_buffer_length=5000000,
                 log_interval=5, threshold=None, lr_minimum=1e-10, epsilon_minimum=1e-10,
                 record_videos=True, record_interval=100):
        """Build the environment, policy and replay buffer.

        Args:
            env_name: Gym environment id passed to ``gym.make``.
            fc_config: list of layer dicts; ``fc_config[0]['dim'][0]`` and
                ``fc_config[-1]['dim'][1]`` are placeholders that get replaced
                by the observation size and the number of actions. The caller's
                list is not modified; the trainer works on its own copy.
            random_seed: seed for the env, torch and numpy. ``None`` or
                ``False`` disables seeding; ``0`` is a valid seed.
            lr_base, lr_decay, lr_minimum: per-episode learning rate is
                ``max(lr_base / (1 + episode * lr_decay), lr_minimum)``.
            epsilon_base, epsilon_decay, epsilon_minimum: same schedule for the
                exploration rate.
            gamma: discount factor forwarded to ``policy.update``.
            batch_size: batch size forwarded to ``policy.update``.
            max_episodes: maximum number of training episodes.
            max_timesteps: maximum number of steps per episode.
            max_buffer_length: size passed to ``ReplayBuffer``.
            log_interval: print progress and save the model every this many
                episodes.
            threshold: average reward (over the last 100 episodes) at which
                training stops. Defaults to ``env.spec.reward_threshold``.
            record_videos: wrap the env in ``wrappers.Monitor`` when truthy.
            record_interval: a video is recorded every this many episodes.
        """
        self.algorithm_name = 'dqn'
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.record_videos = record_videos
        self.record_interval = record_interval
        # Must exist before the Monitor callback below can be evaluated.
        self.should_record = False
        if self.record_videos:
            # mkdir is a project helper; presumably it creates the directory
            # and returns its path -- confirm against DQN.utils.
            videos_dir = mkdir('.', 'videos')
            monitor_dir = mkdir(videos_dir, self.algorithm_name)
            # The Monitor asks per episode whether to record; the answer is
            # driven by the flag that train() toggles.
            self.env = wrappers.Monitor(
                self.env, monitor_dir,
                video_callable=lambda episode_id: self.should_record,
                force=True)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        if threshold is not None:
            self.threshold = threshold
        else:
            self.threshold = self.env.spec.reward_threshold

        # Copy each layer dict (and its 'dim' list) so that filling in the
        # placeholders does not mutate the caller's configuration.
        self.fc_config = [dict(layer, dim=list(layer['dim'])) for layer in fc_config]
        self.fc_config[0]['dim'][0] = self.state_dim
        self.fc_config[-1]['dim'][1] = self.action_dim

        self.random_seed = random_seed
        self.lr_base = lr_base
        self.lr_decay = lr_decay
        self.lr_minimum = lr_minimum
        self.epsilon_base = epsilon_base
        self.epsilon_decay = epsilon_decay
        self.epsilon_minimum = epsilon_minimum
        self.gamma = gamma
        self.batch_size = batch_size
        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.max_buffer_length = max_buffer_length
        self.log_interval = log_interval

        prdir = mkdir('.', 'preTrained')
        self.directory = mkdir(prdir, self.algorithm_name)
        self.filename = "{}_{}_{}".format(self.algorithm_name, self.env_name, self.random_seed)

        self.policy = DQN(self.env, self.fc_config)
        self.replay_buffer = ReplayBuffer(size=self.max_buffer_length)

        self.reward_history = []
        self.make_plots = False

        # `is not None` (rather than truthiness) so that seed 0 is honoured;
        # False is still accepted as "do not seed".
        if self.random_seed is not None and self.random_seed is not False:
            print("Random Seed: {}".format(self.random_seed))
            self.env.seed(self.random_seed)
            torch.manual_seed(self.random_seed)
            np.random.seed(self.random_seed)

    def _progress_line(self, episode, ep_reward, avg_reward, learning_rate, epsilon, avg_Q_loss):
        """Return the formatted one-line training progress report."""
        return self._PROGRESS_FMT.format(
            episode, ep_reward, avg_reward, learning_rate,
            self.replay_buffer.get_fill(), epsilon, avg_Q_loss)

    def train(self):
        """Run the training loop until solved or ``max_episodes`` is reached.

        Each episode: compute the decayed epsilon / learning rate, act
        epsilon-greedily, store every transition in the replay buffer and call
        ``policy.update`` after each step. Per-episode rewards are written to
        ``train_<algorithm_name>.txt``. The model is saved every
        ``log_interval`` episodes, and once more (with a ``_solved`` suffix)
        when the 100-episode average reward reaches ``self.threshold`` after
        more than 100 episodes.
        """
        start_time = time.time()
        print("action_space={}".format(self.env.action_space))
        print("obs_space={}".format(self.env.observation_space))
        print("threshold={} \n".format(self.threshold))

        # loading models
        self.policy.load(self.directory, self.filename)

        # Defined up front: the first log line can be printed before the
        # policy has recorded any loss.
        avg_Q_loss = float('nan')

        # The context manager closes the log on every exit path (solved,
        # episodes exhausted, exception / interrupt).
        with open("train_{}.txt".format(self.algorithm_name), "w+") as log_f:

            print("\nTraining started ... ")

            for episode in range(self.max_episodes):

                # Record a video only every `record_interval` episodes.
                self.should_record = (episode % self.record_interval == 0)

                ep_reward = 0.0
                state = self.env.reset()

                # Hyperbolic decay, clamped from below.
                epsilon = max(self.epsilon_base / (1.0 + episode * self.epsilon_decay),
                              self.epsilon_minimum)
                learning_rate = max(self.lr_base / (1.0 + episode * self.lr_decay),
                                    self.lr_minimum)
                self.policy.set_optimizers(lr=learning_rate)

                for t in range(self.max_timesteps):

                    action = self.policy.select_action(state, epsilon)
                    next_state, reward, done, _ = self.env.step(action)
                    self.replay_buffer.add(state, action, reward, next_state, float(done))

                    # Updating policy
                    self.policy.update(self.replay_buffer, t, self.batch_size, self.gamma)

                    state = next_state
                    ep_reward += reward

                    if done:
                        break

                self.reward_history.append(ep_reward)
                avg_reward = np.mean(self.reward_history[-100:])

                log_f.write('{},{}\n'.format(episode, ep_reward))
                log_f.flush()

                if len(self.policy.Q_loss_list) > 0:
                    avg_Q_loss = np.mean(self.policy.Q_loss_list[-100:])

                # Truncate training history if we don't plan to plot it later
                if not self.make_plots:
                    self.policy.truncate_loss_lists()
                    if len(self.reward_history) > 100:
                        self.reward_history.pop(0)

                # Periodic checkpoint + progress report.
                if episode % self.log_interval == 0:
                    self.policy.save(self.directory, self.filename)
                    print(self._progress_line(episode, ep_reward, avg_reward,
                                              learning_rate, epsilon, avg_Q_loss))

                self.should_record = False

                # Stop once the running average reaches the threshold. Some
                # environments define no reward threshold (None); in that case
                # training simply runs for max_episodes.
                solved = (self.threshold is not None
                          and avg_reward >= self.threshold
                          and episode > 100)
                if solved:
                    print(self._progress_line(episode, ep_reward, avg_reward,
                                              learning_rate, epsilon, avg_Q_loss))
                    print("########## Solved! ###########")
                    name = self.filename + '_solved'
                    self.policy.save(self.directory, name)
                    self.env.close()
                    break

        training_time = time.time() - start_time
        print("Training time: {:6.2f} sec".format(training_time))

    def test(self, episodes=3, render=True, save_gif=True):
        """Roll out the policy with epsilon = 0 and print each episode's reward.

        Args:
            episodes: number of evaluation episodes.
            render: accepted for backward compatibility; currently unused.
            save_gif: when true, every step's ``rgb_array`` frame is saved as
                ``<t>.jpg`` in a per-episode directory under
                ``gif/<algorithm_name>``. No directories are created otherwise.
        """
        algdir = None
        if save_gif:
            gifdir = mkdir('.', 'gif')
            algdir = mkdir(gifdir, self.algorithm_name)

        for episode in range(1, episodes + 1):
            ep_reward = 0.0
            state = self.env.reset()
            epdir = mkdir(algdir, str(episode)) if save_gif else None

            for t in range(self.max_timesteps):
                action = self.policy.select_action(state, 0)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                ep_reward += reward

                if save_gif:
                    img = self.env.render(mode='rgb_array')
                    img = Image.fromarray(img)
                    img.save('{}/{}.jpg'.format(epdir, t))
                if done:
                    break

            print('Test episode: {}\tReward: {:4.2f}'.format(episode, ep_reward))
            # NOTE(review): close() runs after every episode and the env is
            # reset again on the next one; kept as in the original behaviour.
            self.env.close()
                   
            

In [5]:
# Build the trainer from the hyper-parameters defined above, then train.
# Arguments not listed here keep the DQNTrainer defaults.
agent = DQNTrainer(
    env_name,
    fc_config,
    random_seed=random_seed,
    lr_base=lr_base,
    lr_decay=lr_decay,
    epsilon_base=epsilon_base,
    epsilon_decay=epsilon_decay,
    gamma=gamma,
    batch_size=batch_size,
    max_episodes=max_episodes,
    max_timesteps=max_timesteps,
    max_buffer_length=max_buffer_length,
    log_interval=log_interval,
)
agent.train()

NETWORK=Sequential(
  (0): Linear(in_features=2, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=3, bias=True)
)
NETWORK=Sequential(
  (0): Linear(in_features=2, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=3, bias=True)
)
Random Seed: 42
action_space=Discrete(3)
obs_space=Box(2,)
threshold=-110.0 

DIR=./preTrained/dqn NAME=dqn_MountainCar-v0_42
Models loaded

Training started ... 
Ep:    0  Rew: -200.00  Avg Rew: -200.00  LR:0.00100000  Bf: 0  EPS:0.9900  Loss: 0.033602
Ep:   10  Rew: -200.00  Avg Rew: -200.00  LR:0.00099900  Bf: 0  EPS:0.9706  Loss: 2.483715
Ep:   20  Rew: -200.00  Avg Rew: -200.00  LR:0.00099800  Bf: 0  EPS:0.9519  Loss: 3.669043
Ep:   30  Rew: -200.00  Avg Rew: -200.00  LR:0.00099701  Bf: 0  EPS:0.9340  Loss: 5.343717
Ep:   40  Rew: -200.00  Avg Rew: -200.00  LR:0.00099602  Bf: 0  EPS:0.9167  Loss: 6.320290
Ep:   50  Rew: -200.00  Avg Rew: -200.00  LR:0.00099502  Bf: 0  EPS:0.9000  Lo

Ep:  850  Rew: -159.00  Avg Rew: -184.56  LR:0.00092166  Bf: 3  EPS:0.3667  Loss: 9.035453
Ep:  860  Rew: -163.00  Avg Rew: -183.80  LR:0.00092081  Bf: 3  EPS:0.3640  Loss: 8.638682
Ep:  870  Rew: -169.00  Avg Rew: -182.78  LR:0.00091996  Bf: 3  EPS:0.3613  Loss: 9.786293
Ep:  880  Rew: -127.00  Avg Rew: -182.21  LR:0.00091912  Bf: 3  EPS:0.3587  Loss: 8.215250
Ep:  890  Rew: -158.00  Avg Rew: -180.52  LR:0.00091827  Bf: 4  EPS:0.3561  Loss: 8.058289
Ep:  900  Rew: -163.00  Avg Rew: -178.28  LR:0.00091743  Bf: 4  EPS:0.3536  Loss: 8.190472
Ep:  910  Rew: -157.00  Avg Rew: -177.87  LR:0.00091659  Bf: 4  EPS:0.3511  Loss: 9.384894
Ep:  920  Rew: -160.00  Avg Rew: -177.54  LR:0.00091575  Bf: 4  EPS:0.3486  Loss: 8.847934
Ep:  930  Rew: -158.00  Avg Rew: -176.15  LR:0.00091491  Bf: 4  EPS:0.3462  Loss: 8.508505
Ep:  940  Rew: -198.00  Avg Rew: -175.56  LR:0.00091408  Bf: 4  EPS:0.3438  Loss: 7.636589
Ep:  950  Rew: -128.00  Avg Rew: -175.17  LR:0.00091324  Bf: 4  EPS:0.3414  Loss: 8.584312

Ep: 1760  Rew: -154.00  Avg Rew: -157.99  LR:0.00085034  Bf: 6  EPS:0.2190  Loss: 6.586353
Ep: 1770  Rew: -190.00  Avg Rew: -157.47  LR:0.00084962  Bf: 6  EPS:0.2181  Loss: 6.793868
Ep: 1780  Rew: -161.00  Avg Rew: -158.01  LR:0.00084890  Bf: 6  EPS:0.2171  Loss: 5.402815
Ep: 1790  Rew: -153.00  Avg Rew: -157.27  LR:0.00084818  Bf: 6  EPS:0.2162  Loss: 6.518350
Ep: 1800  Rew: -157.00  Avg Rew: -156.55  LR:0.00084746  Bf: 6  EPS:0.2152  Loss: 8.021533
Ep: 1810  Rew: -148.00  Avg Rew: -157.02  LR:0.00084674  Bf: 6  EPS:0.2143  Loss: 6.805595
Ep: 1820  Rew: -156.00  Avg Rew: -157.47  LR:0.00084602  Bf: 6  EPS:0.2134  Loss: 6.630644
Ep: 1830  Rew: -126.00  Avg Rew: -156.40  LR:0.00084531  Bf: 7  EPS:0.2124  Loss: 7.526954
Ep: 1840  Rew: -163.00  Avg Rew: -154.59  LR:0.00084459  Bf: 7  EPS:0.2115  Loss: 5.872520
Ep: 1850  Rew: -165.00  Avg Rew: -155.22  LR:0.00084388  Bf: 7  EPS:0.2106  Loss: 5.366106
Ep: 1860  Rew: -151.00  Avg Rew: -155.18  LR:0.00084317  Bf: 7  EPS:0.2097  Loss: 5.957512

Ep: 2670  Rew: -148.00  Avg Rew: -151.59  LR:0.00078927  Bf: 9  EPS:0.1562  Loss: 5.872716
Ep: 2680  Rew: -155.00  Avg Rew: -151.25  LR:0.00078864  Bf: 9  EPS:0.1557  Loss: 4.924307
Ep: 2690  Rew: -146.00  Avg Rew: -152.65  LR:0.00078802  Bf: 9  EPS:0.1552  Loss: 5.524280
Ep: 2700  Rew: -155.00  Avg Rew: -152.27  LR:0.00078740  Bf: 9  EPS:0.1547  Loss: 3.944325
Ep: 2710  Rew: -155.00  Avg Rew: -152.11  LR:0.00078678  Bf: 9  EPS:0.1542  Loss: 4.875252
Ep: 2720  Rew: -119.00  Avg Rew: -151.58  LR:0.00078616  Bf: 9  EPS:0.1537  Loss: 5.160183
Ep: 2730  Rew: -153.00  Avg Rew: -153.20  LR:0.00078555  Bf: 9  EPS:0.1533  Loss: 4.930742
Ep: 2740  Rew: -158.00  Avg Rew: -153.56  LR:0.00078493  Bf: 9  EPS:0.1528  Loss: 3.133353
Ep: 2750  Rew: -151.00  Avg Rew: -154.13  LR:0.00078431  Bf: 9  EPS:0.1523  Loss: 6.565389
Ep: 2760  Rew: -154.00  Avg Rew: -154.84  LR:0.00078370  Bf: 9  EPS:0.1518  Loss: 3.480406
Ep: 2770  Rew: -200.00  Avg Rew: -152.26  LR:0.00078309  Bf: 9  EPS:0.1514  Loss: 6.595198

Ep: 3580  Rew: -145.00  Avg Rew: -154.35  LR:0.00073638  Bf:12  EPS:0.1213  Loss: 3.276126
Ep: 3590  Rew: -161.00  Avg Rew: -154.26  LR:0.00073584  Bf:12  EPS:0.1210  Loss: 4.054077
Ep: 3600  Rew: -150.00  Avg Rew: -154.62  LR:0.00073529  Bf:12  EPS:0.1207  Loss: 3.625258
Ep: 3610  Rew: -158.00  Avg Rew: -154.05  LR:0.00073475  Bf:12  EPS:0.1204  Loss: 4.859776
Ep: 3620  Rew: -143.00  Avg Rew: -153.56  LR:0.00073421  Bf:12  EPS:0.1201  Loss: 3.855587
Ep: 3630  Rew: -195.00  Avg Rew: -154.31  LR:0.00073368  Bf:12  EPS:0.1199  Loss: 3.365857
Ep: 3640  Rew: -151.00  Avg Rew: -153.19  LR:0.00073314  Bf:12  EPS:0.1196  Loss: 4.105109
Ep: 3650  Rew: -154.00  Avg Rew: -151.85  LR:0.00073260  Bf:12  EPS:0.1193  Loss: 3.990402
Ep: 3660  Rew: -110.00  Avg Rew: -150.17  LR:0.00073206  Bf:12  EPS:0.1190  Loss: 3.492190
Ep: 3670  Rew: -158.00  Avg Rew: -148.54  LR:0.00073153  Bf:12  EPS:0.1187  Loss: 4.493945
Ep: 3680  Rew: -145.00  Avg Rew: -148.54  LR:0.00073099  Bf:12  EPS:0.1184  Loss: 4.191136

Ep: 4490  Rew: -112.00  Avg Rew: -145.42  LR:0.00069013  Bf:15  EPS:0.0992  Loss: 3.824923
Ep: 4500  Rew: -134.00  Avg Rew: -145.99  LR:0.00068966  Bf:15  EPS:0.0990  Loss: 3.897460
Ep: 4510  Rew: -132.00  Avg Rew: -145.55  LR:0.00068918  Bf:15  EPS:0.0988  Loss: 2.804313
Ep: 4520  Rew: -141.00  Avg Rew: -144.07  LR:0.00068871  Bf:15  EPS:0.0986  Loss: 3.467457
Ep: 4530  Rew: -162.00  Avg Rew: -142.62  LR:0.00068823  Bf:15  EPS:0.0984  Loss: 3.054884
Ep: 4540  Rew: -120.00  Avg Rew: -140.32  LR:0.00068776  Bf:15  EPS:0.0982  Loss: 3.555777
Ep: 4550  Rew: -144.00  Avg Rew: -140.47  LR:0.00068729  Bf:15  EPS:0.0980  Loss: 3.747890
Ep: 4560  Rew: -117.00  Avg Rew: -138.42  LR:0.00068681  Bf:15  EPS:0.0978  Loss: 2.694831
Ep: 4570  Rew: -172.00  Avg Rew: -139.34  LR:0.00068634  Bf:15  EPS:0.0976  Loss: 3.401058
Ep: 4580  Rew: -144.00  Avg Rew: -139.91  LR:0.00068587  Bf:15  EPS:0.0974  Loss: 3.850725
Ep: 4590  Rew: -153.00  Avg Rew: -139.80  LR:0.00068540  Bf:15  EPS:0.0972  Loss: 3.707067

Ep: 5400  Rew: -144.00  Avg Rew: -124.83  LR:0.00064935  Bf:17  EPS:0.0839  Loss: 2.304095
Ep: 5410  Rew: -116.00  Avg Rew: -124.57  LR:0.00064893  Bf:17  EPS:0.0838  Loss: 3.479183
Ep: 5420  Rew: -111.00  Avg Rew: -123.04  LR:0.00064851  Bf:17  EPS:0.0836  Loss: 3.565473
Ep: 5430  Rew: -144.00  Avg Rew: -123.27  LR:0.00064809  Bf:17  EPS:0.0835  Loss: 2.526112
Ep: 5440  Rew: -113.00  Avg Rew: -123.22  LR:0.00064767  Bf:17  EPS:0.0833  Loss: 2.687433
Ep: 5450  Rew: -145.00  Avg Rew: -123.30  LR:0.00064725  Bf:17  EPS:0.0832  Loss: 3.525978
Ep: 5460  Rew: -144.00  Avg Rew: -122.37  LR:0.00064683  Bf:17  EPS:0.0831  Loss: 3.096577
Ep: 5470  Rew: -112.00  Avg Rew: -122.67  LR:0.00064641  Bf:17  EPS:0.0829  Loss: 2.913923
Ep: 5480  Rew: -130.00  Avg Rew: -121.84  LR:0.00064599  Bf:17  EPS:0.0828  Loss: 3.920015
Ep: 5490  Rew: -144.00  Avg Rew: -121.81  LR:0.00064558  Bf:17  EPS:0.0826  Loss: 3.479903
Ep: 5500  Rew: -142.00  Avg Rew: -121.29  LR:0.00064516  Bf:17  EPS:0.0825  Loss: 3.213952

Ep: 6310  Rew:  -92.00  Avg Rew: -128.06  LR:0.00061312  Bf:19  EPS:0.0727  Loss: 2.641495
Ep: 6320  Rew: -112.00  Avg Rew: -129.18  LR:0.00061275  Bf:19  EPS:0.0726  Loss: 2.863463
Ep: 6330  Rew: -181.00  Avg Rew: -129.05  LR:0.00061237  Bf:19  EPS:0.0725  Loss: 1.894943
Ep: 6340  Rew: -141.00  Avg Rew: -127.02  LR:0.00061200  Bf:19  EPS:0.0724  Loss: 2.437023
Ep: 6350  Rew:  -88.00  Avg Rew: -125.96  LR:0.00061162  Bf:19  EPS:0.0723  Loss: 2.487812
Ep: 6360  Rew: -146.00  Avg Rew: -126.80  LR:0.00061125  Bf:19  EPS:0.0722  Loss: 3.468451
Ep: 6370  Rew: -111.00  Avg Rew: -126.31  LR:0.00061087  Bf:19  EPS:0.0721  Loss: 1.778002
Ep: 6380  Rew: -150.00  Avg Rew: -125.21  LR:0.00061050  Bf:19  EPS:0.0719  Loss: 2.673296
Ep: 6390  Rew:  -92.00  Avg Rew: -122.07  LR:0.00061013  Bf:19  EPS:0.0718  Loss: 2.262150
Ep: 6400  Rew: -111.00  Avg Rew: -121.74  LR:0.00060976  Bf:19  EPS:0.0717  Loss: 2.291887
Ep: 6410  Rew: -107.00  Avg Rew: -122.40  LR:0.00060938  Bf:19  EPS:0.0716  Loss: 3.697669

Ep: 7220  Rew: -149.00  Avg Rew: -141.98  LR:0.00058072  Bf:22  EPS:0.0641  Loss: 2.726937
Ep: 7230  Rew: -148.00  Avg Rew: -141.91  LR:0.00058038  Bf:22  EPS:0.0640  Loss: 2.903411
Ep: 7240  Rew: -152.00  Avg Rew: -141.98  LR:0.00058005  Bf:22  EPS:0.0640  Loss: 2.252001
Ep: 7250  Rew: -151.00  Avg Rew: -142.00  LR:0.00057971  Bf:22  EPS:0.0639  Loss: 2.348191
Ep: 7260  Rew: -154.00  Avg Rew: -143.08  LR:0.00057937  Bf:22  EPS:0.0638  Loss: 2.688598
Ep: 7270  Rew: -145.00  Avg Rew: -143.26  LR:0.00057904  Bf:22  EPS:0.0637  Loss: 2.703109
Ep: 7280  Rew: -149.00  Avg Rew: -142.83  LR:0.00057870  Bf:22  EPS:0.0636  Loss: 2.917562
Ep: 7290  Rew: -145.00  Avg Rew: -142.88  LR:0.00057837  Bf:22  EPS:0.0635  Loss: 3.081082
Ep: 7300  Rew: -164.00  Avg Rew: -143.75  LR:0.00057803  Bf:22  EPS:0.0635  Loss: 2.837272
Ep: 7310  Rew: -146.00  Avg Rew: -144.37  LR:0.00057770  Bf:22  EPS:0.0634  Loss: 2.268869
Ep: 7320  Rew: -169.00  Avg Rew: -146.38  LR:0.00057737  Bf:22  EPS:0.0633  Loss: 3.234771

Ep: 8130  Rew: -146.00  Avg Rew: -121.83  LR:0.00055157  Bf:24  EPS:0.0574  Loss: 2.091973
Ep: 8140  Rew: -108.00  Avg Rew: -123.80  LR:0.00055127  Bf:24  EPS:0.0573  Loss: 2.943523
Ep: 8150  Rew: -168.00  Avg Rew: -125.14  LR:0.00055096  Bf:24  EPS:0.0572  Loss: 2.916187
Ep: 8160  Rew: -110.00  Avg Rew: -125.44  LR:0.00055066  Bf:24  EPS:0.0572  Loss: 3.045129
Ep: 8170  Rew:  -93.00  Avg Rew: -125.70  LR:0.00055036  Bf:24  EPS:0.0571  Loss: 1.966680
Ep: 8180  Rew: -186.00  Avg Rew: -126.50  LR:0.00055006  Bf:24  EPS:0.0570  Loss: 3.057090
Ep: 8190  Rew: -110.00  Avg Rew: -126.19  LR:0.00054975  Bf:24  EPS:0.0570  Loss: 2.321854
Ep: 8200  Rew: -121.00  Avg Rew: -126.15  LR:0.00054945  Bf:24  EPS:0.0569  Loss: 1.788127
Ep: 8210  Rew: -130.00  Avg Rew: -125.58  LR:0.00054915  Bf:24  EPS:0.0568  Loss: 1.832494
Ep: 8220  Rew: -109.00  Avg Rew: -125.96  LR:0.00054885  Bf:24  EPS:0.0568  Loss: 1.817287
Ep: 8230  Rew: -123.00  Avg Rew: -126.28  LR:0.00054855  Bf:24  EPS:0.0567  Loss: 2.505961

Ep: 9040  Rew: -147.00  Avg Rew: -137.58  LR:0.00052521  Bf:26  EPS:0.0519  Loss: 1.715040
Ep: 9050  Rew: -146.00  Avg Rew: -138.04  LR:0.00052493  Bf:26  EPS:0.0518  Loss: 2.446067
Ep: 9060  Rew:  -89.00  Avg Rew: -138.34  LR:0.00052466  Bf:26  EPS:0.0518  Loss: 1.999254
Ep: 9070  Rew: -145.00  Avg Rew: -138.56  LR:0.00052438  Bf:26  EPS:0.0517  Loss: 2.501822
Ep: 9080  Rew: -193.00  Avg Rew: -139.64  LR:0.00052411  Bf:26  EPS:0.0517  Loss: 1.866860
Ep: 9090  Rew:  -92.00  Avg Rew: -137.52  LR:0.00052383  Bf:26  EPS:0.0516  Loss: 2.266339
Ep: 9100  Rew: -101.00  Avg Rew: -136.27  LR:0.00052356  Bf:26  EPS:0.0516  Loss: 1.843390
Ep: 9110  Rew: -167.00  Avg Rew: -135.37  LR:0.00052329  Bf:26  EPS:0.0515  Loss: 2.122141
Ep: 9120  Rew: -178.00  Avg Rew: -133.64  LR:0.00052301  Bf:27  EPS:0.0515  Loss: 1.909483
Ep: 9130  Rew: -145.00  Avg Rew: -133.19  LR:0.00052274  Bf:27  EPS:0.0514  Loss: 1.673376
Ep: 9140  Rew:  -85.00  Avg Rew: -133.09  LR:0.00052247  Bf:27  EPS:0.0513  Loss: 2.641681

Ep: 9950  Rew:  -98.00  Avg Rew: -120.87  LR:0.00050125  Bf:29  EPS:0.0474  Loss: 2.170360
Ep: 9960  Rew: -110.00  Avg Rew: -120.80  LR:0.00050100  Bf:29  EPS:0.0473  Loss: 1.954758
Ep: 9970  Rew: -106.00  Avg Rew: -121.59  LR:0.00050075  Bf:29  EPS:0.0473  Loss: 1.642880
Ep: 9980  Rew:  -92.00  Avg Rew: -120.29  LR:0.00050050  Bf:29  EPS:0.0472  Loss: 1.700970
Ep: 9990  Rew: -107.00  Avg Rew: -119.19  LR:0.00050025  Bf:29  EPS:0.0472  Loss: 1.789611
Ep:10000  Rew: -108.00  Avg Rew: -118.84  LR:0.00050000  Bf:29  EPS:0.0471  Loss: 2.396089
Ep:10010  Rew:  -86.00  Avg Rew: -116.17  LR:0.00049975  Bf:29  EPS:0.0471  Loss: 1.930881
Ep:10020  Rew:  -92.00  Avg Rew: -116.75  LR:0.00049950  Bf:29  EPS:0.0471  Loss: 1.820157
Ep:10030  Rew:  -93.00  Avg Rew: -115.11  LR:0.00049925  Bf:29  EPS:0.0470  Loss: 1.938435
Ep:10040  Rew: -106.00  Avg Rew: -115.58  LR:0.00049900  Bf:29  EPS:0.0470  Loss: 1.845168
Ep:10050  Rew: -149.00  Avg Rew: -114.73  LR:0.00049875  Bf:29  EPS:0.0469  Loss: 1.335570

Ep:10860  Rew:  -85.00  Avg Rew: -110.22  LR:0.00047939  Bf:31  EPS:0.0436  Loss: 1.201638
Ep:10870  Rew: -189.00  Avg Rew: -112.31  LR:0.00047916  Bf:31  EPS:0.0435  Loss: 2.163024
Ep:10880  Rew: -107.00  Avg Rew: -111.43  LR:0.00047893  Bf:31  EPS:0.0435  Loss: 2.021811
Ep:10890  Rew: -106.00  Avg Rew: -110.60  LR:0.00047870  Bf:31  EPS:0.0435  Loss: 2.241033
Ep:10900  Rew: -190.00  Avg Rew: -112.35  LR:0.00047847  Bf:31  EPS:0.0434  Loss: 1.416258
Ep:10910  Rew: -108.00  Avg Rew: -111.94  LR:0.00047824  Bf:31  EPS:0.0434  Loss: 3.105210
Ep:10920  Rew: -146.00  Avg Rew: -113.14  LR:0.00047801  Bf:31  EPS:0.0433  Loss: 2.827832
Ep:10930  Rew: -110.00  Avg Rew: -113.91  LR:0.00047778  Bf:31  EPS:0.0433  Loss: 1.731578
Ep:10940  Rew: -109.00  Avg Rew: -113.35  LR:0.00047755  Bf:31  EPS:0.0433  Loss: 2.589072
Ep:10950  Rew: -110.00  Avg Rew: -113.34  LR:0.00047733  Bf:31  EPS:0.0432  Loss: 2.040430
Ep:10960  Rew: -107.00  Avg Rew: -114.25  LR:0.00047710  Bf:31  EPS:0.0432  Loss: 1.236377

In [6]:
# Evaluate the agent with the defaults of DQNTrainer.test(): 3 episodes,
# actions selected with epsilon = 0, every rendered frame saved as a JPEG.
agent.test()

Test episode: 1	Reward: -105.00
Test episode: 2	Reward: -103.00
Test episode: 3	Reward: -104.00
