In [46]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
import random

In [18]:
env = gym.make('CartPole-v1')

num_actions = env.action_space.n
state_space_dimensions = env.observation_space.shape[0]

print('Actions: {} -- State space dimensions: {}'.format(num_actions, state_space_dimensions))

Actions: 2 -- State space dimensions: 4




In [163]:
class CartPoleDQN(nn.Module):
    
    def __init__(self):
        super(CartPoleDQN, self).__init__()
        self.fc1 = nn.Linear(state_space_dimensions, 256)
        self.fc2 = nn.Linear(256, 64)
        #self.fc3 = nn.Linear(128, 64)
        self.head = nn.Linear(64, num_actions)
    
    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        #x = F.leaky_relu(self.fc3(x))
        x = F.leaky_relu(self.head(x))
        return x

In [213]:
class ReplayBuffer():
    
    def __init__(self, capacity):
        self.capacity = capacity
        self.position = 0
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.done = []
        
        
    def push(self, state, action, reward, next_state, done):
        if len(self.states) < self.capacity:
            self.states.append(None)
            self.actions.append(None)
            self.rewards.append(None)
            self.next_states.append(None)
            self.done.append(None)
            
        self.states[self.position] = state
        self.actions[self.position] = action
        self.rewards[self.position] = reward
        self.next_states[self.position] = next_state
        self.done[self.position] = done
        
        self.position = (self.position + 1) % self.capacity
        
        
    def sample(self, batch_size):
        indices = np.random.choice(range(len(self.states)), size=batch_size)
        state_sample = [self.states[i] for i in indices]
        action_sample = [self.actions[i] for i in indices]
        reward_sample = [self.rewards[i] for i in indices]
        next_state_sample = [self.next_states[i] for i in indices]
        done_sample = [self.done[i] for i in indices]
        
        return state_sample, action_sample, reward_sample, next_state_sample, done_sample
        
    def __len__(self):
        return len(self.states)

In [214]:
def select_action(state, env, model, epsilon):
    if random.random() > epsilon:
        with torch.no_grad():
            return model(state).argmax().item()
    else:
        return env.action_space.sample()
    

In [215]:
def update_epsilon(epsilon_start, epsilon_end, epsilon_steps, total_steps):
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * total_steps / epsilon_steps)  
    

In [239]:
def optimize_model(policy_net, target_net, optimizer, memory, batch_size, gamma):
    state_batch, action_batch, reward_batch, next_state_batch, done_batch = memory.sample(batch_size)
        
    state_batch = torch.tensor(state_batch).float().view((batch_size, -1))
    action_batch = torch.tensor(action_batch, dtype=torch.int64).view((batch_size))
    reward_batch = torch.tensor(reward_batch).float().view((batch_size))
    
    non_final_next_states = torch.tensor([s for s in next_state_batch if s is not None]).float()
    non_final_mask = torch.tensor(list(map(lambda s: s is not None, next_state_batch)), dtype=torch.bool)
        
    state_action_values = policy_net(state_batch)
    state_action_values = state_action_values.gather(1, action_batch.reshape((batch_size, 1)))
    
    next_state_values = torch.zeros(batch_size)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(dim=1)[0].float().detach()
        
    expected_state_action_values = reward_batch + gamma * next_state_values
    
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss
    
    

def train_dqn(env, policy_net, target_net, optimizer, memory, target_update=10, batch_size=32, episodes=100, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_steps=1000000):
    
    total_rewards = []
    total_steps = 0
    
    epsilon = epsilon_start
        
    for episode in range(episodes):
        
        done = False
        state = env.reset()
        
        total_rewards.append(0)
        loss = 0
        
        while not done:
            
            env.render()
            state_tensor = torch.tensor(state).float()
            action = select_action(state_tensor, env, policy_net, epsilon)
            
            next_state, reward, done, _ = env.step(action)
            
            if done:
                next_state = None
            
            total_rewards[episode] += reward
            
            
            memory.push(state, action, reward, next_state, done)
            
            
            state = next_state
            
            if len(memory) >= batch_size:                 
                loss = optimize_model(policy_net, target_net, optimizer, memory, batch_size, gamma)
        
            if total_steps % target_update == 0:
                target_net.load_state_dict(policy_net.state_dict())
                
            total_steps += 1
            epsilon = update_epsilon(epsilon_start, epsilon_end, epsilon_steps, total_steps)
            
        
        print('{}/{} Total steps: {} Episode reward: {} Average reward: {} Loss: {} Epsilon: {}'.format(episode, episodes, total_steps, total_rewards[episode], np.mean(total_rewards), loss, epsilon))

            
                   

In [242]:
env = gym.make('CartPole-v1')

num_actions = env.action_space.n
state_space_dimensions = env.observation_space.shape[0]

target_net = CartPoleDQN()
policy_net = CartPoleDQN()

#policy_net.load_state_dict(target_net.state_dict())

optimizer = torch.optim.Adam(policy_net.parameters())

memory = ReplayBuffer(100000)
try:
    train_dqn(env, policy_net, target_net, optimizer, memory, gamma=0.5, batch_size=128, episodes=1000, epsilon_steps=10000, epsilon_end=0.1)
finally:
    env.close()

0/1000 Total steps: 20 Episode reward: 20.0 Average reward: 20.0 Loss: 0 Epsilon: 0.8984015989338665
1/1000 Total steps: 31 Episode reward: 11.0 Average reward: 15.5 Loss: 0 Epsilon: 0.8975238400309432
2/1000 Total steps: 47 Episode reward: 16.0 Average reward: 15.666666666666666 Loss: 0 Epsilon: 0.8962488221731837
3/1000 Total steps: 69 Episode reward: 22.0 Average reward: 17.25 Loss: 0 Epsilon: 0.894499000274253
4/1000 Total steps: 101 Episode reward: 32.0 Average reward: 20.2 Loss: 0 Epsilon: 0.8919606669727019
5/1000 Total steps: 120 Episode reward: 19.0 Average reward: 20.0 Loss: 0 Epsilon: 0.8904573702895444
6/1000 Total steps: 142 Episode reward: 22.0 Average reward: 20.285714285714285 Loss: 0.5645555853843689 Epsilon: 0.8887202755797164
7/1000 Total steps: 162 Episode reward: 20.0 Average reward: 20.25 Loss: 0.10816188156604767 Epsilon: 0.8871444114180067
8/1000 Total steps: 210 Episode reward: 48.0 Average reward: 23.333333333333332 Loss: 0.029264872893691063 Epsilon: 0.883375

63/1000 Total steps: 1600 Episode reward: 27.0 Average reward: 25.0 Loss: 0.004694642033427954 Epsilon: 0.7817150311729691
64/1000 Total steps: 1626 Episode reward: 26.0 Average reward: 25.015384615384615 Loss: 0.004895329475402832 Epsilon: 0.7799448742930516
65/1000 Total steps: 1648 Episode reward: 22.0 Average reward: 24.96969696969697 Loss: 0.000617132696788758 Epsilon: 0.7784506398301905
66/1000 Total steps: 1706 Episode reward: 58.0 Average reward: 25.46268656716418 Loss: 0.002474992536008358 Epsilon: 0.7745270156285804
67/1000 Total steps: 1734 Episode reward: 28.0 Average reward: 25.5 Loss: 0.0018141667824238539 Epsilon: 0.7726409816645787
68/1000 Total steps: 1772 Episode reward: 38.0 Average reward: 25.681159420289855 Loss: 0.0019884021021425724 Epsilon: 0.7700897962564545
69/1000 Total steps: 1793 Episode reward: 21.0 Average reward: 25.614285714285714 Loss: 0.0018971215467900038 Epsilon: 0.7686840841985758
70/1000 Total steps: 1839 Episode reward: 46.0 Average reward: 25.90

124/1000 Total steps: 3227 Episode reward: 20.0 Average reward: 25.816 Loss: 0.0023483755066990852 Epsilon: 0.6793528632850458
125/1000 Total steps: 3263 Episode reward: 36.0 Average reward: 25.896825396825395 Loss: 0.0016700892010703683 Epsilon: 0.6772709426827774
126/1000 Total steps: 3304 Episode reward: 41.0 Average reward: 26.015748031496063 Loss: 0.0028012029360979795 Epsilon: 0.6749089771558274
127/1000 Total steps: 3316 Episode reward: 12.0 Average reward: 25.90625 Loss: 0.0012844223529100418 Epsilon: 0.6742195001521799
128/1000 Total steps: 3371 Episode reward: 55.0 Average reward: 26.131782945736433 Loss: 0.0015378404641523957 Epsilon: 0.671069962070524
129/1000 Total steps: 3401 Episode reward: 30.0 Average reward: 26.161538461538463 Loss: 0.001965863164514303 Epsilon: 0.6693593194312532
130/1000 Total steps: 3427 Episode reward: 26.0 Average reward: 26.16030534351145 Loss: 0.0008754581795074046 Epsilon: 0.6678809079684719
131/1000 Total steps: 3449 Episode reward: 22.0 Aver

184/1000 Total steps: 5167 Episode reward: 28.0 Average reward: 27.92972972972973 Loss: 0.00249171513132751 Epsilon: 0.5771885652053602
185/1000 Total steps: 5184 Episode reward: 17.0 Average reward: 27.870967741935484 Loss: 0.0005824746331200004 Epsilon: 0.5763780337914158
186/1000 Total steps: 5224 Episode reward: 40.0 Average reward: 27.93582887700535 Loss: 0.0018738191574811935 Epsilon: 0.5744763276042322
187/1000 Total steps: 5282 Episode reward: 58.0 Average reward: 28.095744680851062 Loss: 0.0004663001745939255 Epsilon: 0.571732330188967
188/1000 Total steps: 5295 Episode reward: 13.0 Average reward: 28.015873015873016 Loss: 0.002071532653644681 Epsilon: 0.5711194766008638
189/1000 Total steps: 5366 Episode reward: 71.0 Average reward: 28.242105263157896 Loss: 0.0011119355913251638 Epsilon: 0.5677863748300772
190/1000 Total steps: 5380 Episode reward: 14.0 Average reward: 28.167539267015705 Loss: 0.00139649398624897 Epsilon: 0.567131932122103
191/1000 Total steps: 5393 Episode r

244/1000 Total steps: 6999 Episode reward: 62.0 Average reward: 28.56734693877551 Loss: 0.0016017527086660266 Epsilon: 0.49730797184383835
245/1000 Total steps: 7017 Episode reward: 18.0 Average reward: 28.524390243902438 Loss: 0.00048610856174491346 Epsilon: 0.49659346074742416
246/1000 Total steps: 7055 Episode reward: 38.0 Average reward: 28.562753036437247 Loss: 0.0006822230643592775 Epsilon: 0.49508926537783415
247/1000 Total steps: 7099 Episode reward: 44.0 Average reward: 28.625 Loss: 0.0005923499120399356 Epsilon: 0.49335469147121136
248/1000 Total steps: 7160 Episode reward: 61.0 Average reward: 28.755020080321284 Loss: 0.0009406101889908314 Epsilon: 0.4909625313592636
249/1000 Total steps: 7208 Episode reward: 48.0 Average reward: 28.832 Loss: 0.001293109729886055 Epsilon: 0.4890904078995182
250/1000 Total steps: 7280 Episode reward: 72.0 Average reward: 29.00398406374502 Loss: 0.0011418645735830069 Epsilon: 0.4862990180249839
251/1000 Total steps: 7310 Episode reward: 30.0 A

304/1000 Total steps: 9415 Episode reward: 53.0 Average reward: 30.868852459016395 Loss: 0.0017822221852838993 Epsilon: 0.41203386627372185
305/1000 Total steps: 9463 Episode reward: 48.0 Average reward: 30.924836601307188 Loss: 0.0006318922969512641 Epsilon: 0.41053969260123435
306/1000 Total steps: 9547 Episode reward: 84.0 Average reward: 31.09771986970684 Loss: 0.00048242646153084934 Epsilon: 0.4079420844116982
307/1000 Total steps: 9584 Episode reward: 37.0 Average reward: 31.116883116883116 Loss: 0.0019952706061303616 Epsilon: 0.4068048039656472
308/1000 Total steps: 9615 Episode reward: 31.0 Average reward: 31.116504854368934 Loss: 0.0009133482235483825 Epsilon: 0.4058551817482797
309/1000 Total steps: 9679 Episode reward: 64.0 Average reward: 31.22258064516129 Loss: 0.0004046832618769258 Epsilon: 0.4039039591575496
310/1000 Total steps: 9793 Episode reward: 114.0 Average reward: 31.488745980707396 Loss: 0.0003094020066782832 Epsilon: 0.40045912687461904
311/1000 Total steps: 98

363/1000 Total steps: 12342 Episode reward: 57.0 Average reward: 33.90659340659341 Loss: 0.0006486999918706715 Epsilon: 0.33285401861575414
364/1000 Total steps: 12379 Episode reward: 37.0 Average reward: 33.915068493150685 Loss: 0.0020903670229017735 Epsilon: 0.3319940506686579
365/1000 Total steps: 12418 Episode reward: 39.0 Average reward: 33.92896174863388 Loss: 0.0009311420726589859 Epsilon: 0.3310910358944308
366/1000 Total steps: 12431 Episode reward: 13.0 Average reward: 33.87193460490463 Loss: 0.00037622801028192043 Epsilon: 0.330790812735103
367/1000 Total steps: 12575 Episode reward: 144.0 Average reward: 34.171195652173914 Loss: 0.0003499385202303529 Epsilon: 0.3274912389791975
368/1000 Total steps: 12637 Episode reward: 62.0 Average reward: 34.24661246612466 Loss: 0.0010561095550656319 Epsilon: 0.3260851566568732
369/1000 Total steps: 12652 Episode reward: 15.0 Average reward: 34.1945945945946 Loss: 0.000699564116075635 Epsilon: 0.32574628314056386
370/1000 Total steps: 12

422/1000 Total steps: 18581 Episode reward: 402.0 Average reward: 43.92671394799054 Loss: 0.0009712692117318511 Epsilon: 0.22477495162627092
423/1000 Total steps: 18649 Episode reward: 68.0 Average reward: 43.98349056603774 Loss: 0.0009271241724491119 Epsilon: 0.22392936022432194
424/1000 Total steps: 18740 Episode reward: 91.0 Average reward: 44.09411764705882 Loss: 0.0009259525104425848 Epsilon: 0.22280671881185787
425/1000 Total steps: 18917 Episode reward: 177.0 Average reward: 44.406103286384976 Loss: 0.0003462258609943092 Epsilon: 0.2206521639491702
426/1000 Total steps: 19075 Episode reward: 158.0 Average reward: 44.67213114754098 Loss: 0.0002151244698325172 Epsilon: 0.2187608405592211
427/1000 Total steps: 19126 Episode reward: 51.0 Average reward: 44.68691588785047 Loss: 0.005123074632138014 Epsilon: 0.21815670213482075
428/1000 Total steps: 19219 Episode reward: 93.0 Average reward: 44.7995337995338 Loss: 0.0006381546263583004 Epsilon: 0.21706293868828203
429/1000 Total steps

481/1000 Total steps: 24791 Episode reward: 282.0 Average reward: 51.433609958506224 Loss: 0.00039252976421266794 Epsilon: 0.16705490273712992
482/1000 Total steps: 24874 Episode reward: 83.0 Average reward: 51.49896480331263 Loss: 0.0033228881657123566 Epsilon: 0.16650065037358724
483/1000 Total steps: 25082 Episode reward: 208.0 Average reward: 51.82231404958678 Loss: 0.0002736929163802415 Epsilon: 0.1651317230440809
484/1000 Total steps: 25177 Episode reward: 95.0 Average reward: 51.911340206185564 Loss: 0.00025438674492761493 Epsilon: 0.164515901459175
485/1000 Total steps: 25315 Episode reward: 138.0 Average reward: 52.08847736625514 Loss: 0.0012416699901223183 Epsilon: 0.16363169706166047
486/1000 Total steps: 25491 Episode reward: 176.0 Average reward: 52.3429158110883 Loss: 0.003005422418937087 Epsilon: 0.16252157690649438
487/1000 Total steps: 25754 Episode reward: 263.0 Average reward: 52.77459016393443 Loss: 0.0002771475410554558 Epsilon: 0.16089869388878758
488/1000 Total s

540/1000 Total steps: 31938 Episode reward: 117.0 Average reward: 59.03512014787431 Loss: 0.0010837833397090435 Epsilon: 0.13281257177138753
541/1000 Total steps: 32085 Episode reward: 147.0 Average reward: 59.19741697416974 Loss: 0.00028229167219251394 Epsilon: 0.1323337548926706
542/1000 Total steps: 32150 Episode reward: 65.0 Average reward: 59.20810313075506 Loss: 0.00044968092697672546 Epsilon: 0.13212426705889924
543/1000 Total steps: 32263 Episode reward: 113.0 Average reward: 59.306985294117645 Loss: 0.0016060794005170465 Epsilon: 0.1317633061114041
544/1000 Total steps: 32416 Episode reward: 153.0 Average reward: 59.47889908256881 Loss: 0.0009706945857033134 Epsilon: 0.13128102637591138
545/1000 Total steps: 32540 Episode reward: 124.0 Average reward: 59.5970695970696 Loss: 0.0007499767816625535 Epsilon: 0.1308955366247036
546/1000 Total steps: 32665 Episode reward: 125.0 Average reward: 59.716636197440586 Loss: 0.00031287013553082943 Epsilon: 0.13051174610490293
547/1000 Tota

599/1000 Total steps: 38739 Episode reward: 23.0 Average reward: 64.565 Loss: 0.0025192461907863617 Epsilon: 0.1166217441650724
600/1000 Total steps: 38840 Episode reward: 101.0 Average reward: 64.62562396006656 Loss: 0.0004016577731817961 Epsilon: 0.11645470949402545
601/1000 Total steps: 39056 Episode reward: 216.0 Average reward: 64.87707641196013 Loss: 0.00027495596441440284 Epsilon: 0.11610309883459251
602/1000 Total steps: 39170 Episode reward: 114.0 Average reward: 64.95854063018243 Loss: 0.0007930509746074677 Epsilon: 0.11592056592230535
603/1000 Total steps: 39221 Episode reward: 51.0 Average reward: 64.93543046357615 Loss: 0.0010893573053181171 Epsilon: 0.11583957773152989
604/1000 Total steps: 39244 Episode reward: 23.0 Average reward: 64.86611570247933 Loss: 0.0038278468418866396 Epsilon: 0.11580318856632892
605/1000 Total steps: 39422 Episode reward: 178.0 Average reward: 65.05280528052805 Loss: 0.0002587757771834731 Epsilon: 0.11552438056250408
606/1000 Total steps: 39542

658/1000 Total steps: 44472 Episode reward: 108.0 Average reward: 67.48406676783004 Loss: 2.723698526096996e-05 Epsilon: 0.1093690502245224
659/1000 Total steps: 44495 Episode reward: 23.0 Average reward: 67.41666666666667 Loss: 0.00052096345461905 Epsilon: 0.1093475261711559
660/1000 Total steps: 44598 Episode reward: 103.0 Average reward: 67.47049924357034 Loss: 0.0002165471960324794 Epsilon: 0.10925174079311097
661/1000 Total steps: 44801 Episode reward: 203.0 Average reward: 67.67522658610272 Loss: 0.0001536139752715826 Epsilon: 0.10906582389601369
662/1000 Total steps: 45036 Episode reward: 235.0 Average reward: 67.92760180995475 Loss: 0.00032657990232110023 Epsilon: 0.10885526084055708
663/1000 Total steps: 45150 Episode reward: 114.0 Average reward: 67.99698795180723 Loss: 0.0026180578861385584 Epsilon: 0.10875488410146528
664/1000 Total steps: 45281 Episode reward: 131.0 Average reward: 68.09172932330827 Loss: 0.0006092258845455945 Epsilon: 0.10864094306298526
665/1000 Total st

717/1000 Total steps: 52429 Episode reward: 36.0 Average reward: 73.02089136490251 Loss: 0.0021011405624449253 Epsilon: 0.10422792668567786
718/1000 Total steps: 52490 Episode reward: 61.0 Average reward: 73.00417246175243 Loss: 0.0004335020203143358 Epsilon: 0.10420221483377166
719/1000 Total steps: 52620 Episode reward: 130.0 Average reward: 73.08333333333333 Loss: 0.0014096556697040796 Epsilon: 0.10414793959436293
720/1000 Total steps: 52700 Episode reward: 80.0 Average reward: 73.09292649098474 Loss: 0.0014929927419871092 Epsilon: 0.10411488845842432
721/1000 Total steps: 52806 Episode reward: 106.0 Average reward: 73.13850415512465 Loss: 0.0012001359136775136 Epsilon: 0.10407150100054226
722/1000 Total steps: 52878 Episode reward: 72.0 Average reward: 73.13692946058092 Loss: 0.00019185642304364592 Epsilon: 0.1040422914738196
723/1000 Total steps: 52957 Episode reward: 79.0 Average reward: 73.14502762430939 Loss: 0.00024832403869368136 Epsilon: 0.10401048317936898
724/1000 Total st

776/1000 Total steps: 58046 Episode reward: 200.0 Average reward: 74.7052767052767 Loss: 0.00031938086613081396 Epsilon: 0.10241092798081418
777/1000 Total steps: 58118 Episode reward: 72.0 Average reward: 74.70179948586119 Loss: 0.00012918793072458357 Epsilon: 0.10239363164089614
778/1000 Total steps: 58248 Episode reward: 130.0 Average reward: 74.77278562259306 Loss: 0.004722017329186201 Epsilon: 0.10236271581781116
779/1000 Total steps: 58286 Episode reward: 38.0 Average reward: 74.72564102564102 Loss: 0.0004476332396734506 Epsilon: 0.10235375453492437
780/1000 Total steps: 58428 Episode reward: 142.0 Average reward: 74.81177976952625 Loss: 0.00029100250685587525 Epsilon: 0.1023205674067907
781/1000 Total steps: 58524 Episode reward: 96.0 Average reward: 74.8388746803069 Loss: 9.695022163214162e-05 Epsilon: 0.10229839655006968
782/1000 Total steps: 58631 Episode reward: 107.0 Average reward: 74.87994891443168 Loss: 0.00014608909259550273 Epsilon: 0.10227393481067464
783/1000 Total s

835/1000 Total steps: 64808 Episode reward: 94.0 Average reward: 77.52153110047847 Loss: 0.0013710581697523594 Epsilon: 0.10122606729717566
836/1000 Total steps: 64988 Episode reward: 180.0 Average reward: 77.64396654719235 Loss: 0.00041708568460308015 Epsilon: 0.1012041955223348
837/1000 Total steps: 65175 Episode reward: 187.0 Average reward: 77.77446300715991 Loss: 0.0005537440301850438 Epsilon: 0.10118188630733273
838/1000 Total steps: 65304 Episode reward: 129.0 Average reward: 77.83551847437425 Loss: 0.00039102911250665784 Epsilon: 0.10116673789132148
839/1000 Total steps: 65437 Episode reward: 133.0 Average reward: 77.90119047619048 Loss: 0.00017784991359803826 Epsilon: 0.10115132301353169
840/1000 Total steps: 65627 Episode reward: 190.0 Average reward: 78.03448275862068 Loss: 0.0016989992000162601 Epsilon: 0.10112965438015249
841/1000 Total steps: 65687 Episode reward: 60.0 Average reward: 78.01306413301663 Loss: 5.950188642600551e-05 Epsilon: 0.10112289674704379
842/1000 Tota

894/1000 Total steps: 71944 Episode reward: 101.0 Average reward: 80.38435754189945 Loss: 8.352266740985215e-05 Epsilon: 0.10060062273380141
895/1000 Total steps: 71957 Episode reward: 13.0 Average reward: 80.30915178571429 Loss: 0.0010943150846287608 Epsilon: 0.10059984243155383
896/1000 Total steps: 71981 Episode reward: 24.0 Average reward: 80.2463768115942 Loss: 0.001581057091243565 Epsilon: 0.10059840453588309
897/1000 Total steps: 72081 Episode reward: 100.0 Average reward: 80.2683741648107 Loss: 0.00045182203757576644 Epsilon: 0.1005924503112658
898/1000 Total steps: 72171 Episode reward: 90.0 Average reward: 80.27919911012236 Loss: 0.0011286602821201086 Epsilon: 0.10058714218088097
899/1000 Total steps: 72278 Episode reward: 107.0 Average reward: 80.30888888888889 Loss: 0.0004147194267716259 Epsilon: 0.10058089325094062
900/1000 Total steps: 72505 Episode reward: 227.0 Average reward: 80.47169811320755 Loss: 0.0002530404017306864 Epsilon: 0.10056785551232411
901/1000 Total step

953/1000 Total steps: 79085 Episode reward: 276.0 Average reward: 82.89832285115304 Loss: 0.0006195589085109532 Epsilon: 0.10029408446048715
954/1000 Total steps: 79158 Episode reward: 73.0 Average reward: 82.88795811518325 Loss: 0.00031499340548180044 Epsilon: 0.10029194546077348
955/1000 Total steps: 79358 Episode reward: 200.0 Average reward: 83.01046025104603 Loss: 0.0012238650815561414 Epsilon: 0.1002861645533281
956/1000 Total steps: 79463 Episode reward: 105.0 Average reward: 83.03343782654127 Loss: 0.0002559675194788724 Epsilon: 0.10028317554527191
957/1000 Total steps: 79562 Episode reward: 99.0 Average reward: 83.05010438413362 Loss: 0.00023515280918218195 Epsilon: 0.10028038593871028
958/1000 Total steps: 79706 Episode reward: 144.0 Average reward: 83.11366006256517 Loss: 0.0007745113107375801 Epsilon: 0.10027637731256989
959/1000 Total steps: 79790 Episode reward: 84.0 Average reward: 83.11458333333333 Loss: 0.00031520123593509197 Epsilon: 0.10027406546649147
960/1000 Total